X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_pipe.c;h=466546b76b731e3e87019453becd05341403417d;hb=6ff0c6f4ebcb87ea6c6fe5a4ba90b548f666067d;hp=a299ab7aff78008c10f1ccdf6266cdb3b9195682;hpb=78a4750d9155bf810348e2b2bf05ee3366f56f43;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index a299ab7aff7..466546b76b7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1,5 +1,7 @@ /* * Copyright 2010 Jerome Glisse + * Copyright 2018 Advanced Micro Devices, Inc. + * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -32,11 +34,73 @@ #include "util/u_memory.h" #include "util/u_suballoc.h" #include "util/u_tests.h" +#include "util/u_upload_mgr.h" #include "util/xmlconfig.h" #include "vl/vl_decoder.h" -#include "../ddebug/dd_util.h" - -#include "compiler/nir/nir.h" +#include "driver_ddebug/dd_util.h" + +static const struct debug_named_value debug_options[] = { + /* Shader logging options: */ + { "vs", DBG(VS), "Print vertex shaders" }, + { "ps", DBG(PS), "Print pixel shaders" }, + { "gs", DBG(GS), "Print geometry shaders" }, + { "tcs", DBG(TCS), "Print tessellation control shaders" }, + { "tes", DBG(TES), "Print tessellation evaluation shaders" }, + { "cs", DBG(CS), "Print compute shaders" }, + { "noir", DBG(NO_IR), "Don't print the LLVM IR"}, + { "notgsi", DBG(NO_TGSI), "Don't print the TGSI"}, + { "noasm", DBG(NO_ASM), "Don't print disassembled shaders"}, + { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" }, + + /* Shader compiler options the shader cache should be aware of: */ + { "unsafemath", DBG(UNSAFE_MATH), "Enable unsafe math shader optimizations" }, + { "sisched", DBG(SI_SCHED), "Enable LLVM SI Machine Instruction Scheduler." }, + + /* Shader compiler options (with no effect on the shader cache): */ + { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" }, + { "nir", DBG(NIR), "Enable experimental NIR shaders" }, + { "mono", DBG(MONOLITHIC_SHADERS), "Use old-style monolithic shaders compiled on demand" }, + { "nooptvariant", DBG(NO_OPT_VARIANT), "Disable compiling optimized shader variants." }, + + /* Information logging options: */ + { "info", DBG(INFO), "Print driver information" }, + { "tex", DBG(TEX), "Print texture info" }, + { "compute", DBG(COMPUTE), "Print compute info" }, + { "vm", DBG(VM), "Print virtual addresses when creating resources" }, + + /* Driver options: */ + { "forcedma", DBG(FORCE_DMA), "Use asynchronous DMA for all operations when possible." }, + { "nodma", DBG(NO_ASYNC_DMA), "Disable asynchronous DMA" }, + { "nowc", DBG(NO_WC), "Disable GTT write combining" }, + { "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." }, + { "reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context." }, + + /* 3D engine options: */ + { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." }, + { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" }, + { "nodpbb", DBG(NO_DPBB), "Disable DPBB." }, + { "nodfsm", DBG(NO_DFSM), "Disable DFSM." }, + { "dpbb", DBG(DPBB), "Enable DPBB." }, + { "dfsm", DBG(DFSM), "Enable DFSM." }, + { "nohyperz", DBG(NO_HYPERZ), "Disable Hyper-Z" }, + { "norbplus", DBG(NO_RB_PLUS), "Disable RB+." }, + { "no2d", DBG(NO_2D_TILING), "Disable 2D tiling" }, + { "notiling", DBG(NO_TILING), "Disable tiling" }, + { "nodcc", DBG(NO_DCC), "Disable DCC." }, + { "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." }, + { "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" }, + { "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" }, + { "dccmsaa", DBG(DCC_MSAA), "Enable DCC for MSAA" }, + { "nofmask", DBG(NO_FMASK), "Disable MSAA compression" }, + + /* Tests: */ + { "testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit." }, + { "testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit." }, + { "testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit." }, + { "testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit." }, + + DEBUG_NAMED_VALUE_END /* must be last */ +}; /* * pipe_context @@ -57,8 +121,7 @@ static void si_destroy_context(struct pipe_context *context) pipe_resource_reference(&sctx->esgs_ring, NULL); pipe_resource_reference(&sctx->gsvs_ring, NULL); - pipe_resource_reference(&sctx->tf_ring, NULL); - pipe_resource_reference(&sctx->tess_offchip_ring, NULL); + pipe_resource_reference(&sctx->tess_rings, NULL); pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); r600_resource_reference(&sctx->border_color_buffer, NULL); free(sctx->border_color_table); @@ -73,32 +136,69 @@ static void si_destroy_context(struct pipe_context *context) si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); if (sctx->fixed_func_tcs_shader.cso) - sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader.cso); + sctx->b.delete_tcs_state(&sctx->b, sctx->fixed_func_tcs_shader.cso); if (sctx->custom_dsa_flush) - sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush); + sctx->b.delete_depth_stencil_alpha_state(&sctx->b, sctx->custom_dsa_flush); if (sctx->custom_blend_resolve) - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve); + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_resolve); if (sctx->custom_blend_fmask_decompress) - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fmask_decompress); + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_fmask_decompress); if (sctx->custom_blend_eliminate_fastclear) - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_eliminate_fastclear); + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_eliminate_fastclear); if (sctx->custom_blend_dcc_decompress) - sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_dcc_decompress); + sctx->b.delete_blend_state(&sctx->b, sctx->custom_blend_dcc_decompress); if (sctx->vs_blit_pos) - sctx->b.b.delete_vs_state(&sctx->b.b, sctx->vs_blit_pos); + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos); if (sctx->vs_blit_pos_layered) - sctx->b.b.delete_vs_state(&sctx->b.b, sctx->vs_blit_pos_layered); + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered); if (sctx->vs_blit_color) - sctx->b.b.delete_vs_state(&sctx->b.b, sctx->vs_blit_color); + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color); if (sctx->vs_blit_color_layered) - sctx->b.b.delete_vs_state(&sctx->b.b, sctx->vs_blit_color_layered); + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered); if (sctx->vs_blit_texcoord) - sctx->b.b.delete_vs_state(&sctx->b.b, sctx->vs_blit_texcoord); + sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord); if (sctx->blitter) util_blitter_destroy(sctx->blitter); - si_common_context_cleanup(&sctx->b); + /* Release DCC stats. */ + for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) { + assert(!sctx->dcc_stats[i].query_active); + + for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++) + if (sctx->dcc_stats[i].ps_stats[j]) + sctx->b.destroy_query(&sctx->b, + sctx->dcc_stats[i].ps_stats[j]); + + r600_texture_reference(&sctx->dcc_stats[i].tex, NULL); + } + + if (sctx->query_result_shader) + sctx->b.delete_compute_state(&sctx->b, sctx->query_result_shader); + + if (sctx->gfx_cs) + sctx->ws->cs_destroy(sctx->gfx_cs); + if (sctx->dma_cs) + sctx->ws->cs_destroy(sctx->dma_cs); + if (sctx->ctx) + sctx->ws->ctx_destroy(sctx->ctx); + + if (sctx->b.stream_uploader) + u_upload_destroy(sctx->b.stream_uploader); + if (sctx->b.const_uploader) + u_upload_destroy(sctx->b.const_uploader); + if (sctx->cached_gtt_allocator) + u_upload_destroy(sctx->cached_gtt_allocator); + + slab_destroy_child(&sctx->pool_transfers); + slab_destroy_child(&sctx->pool_transfers_unsync); + + if (sctx->allocator_zeroed_memory) + u_suballocator_destroy(sctx->allocator_zeroed_memory); + + sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); + sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL); + r600_resource_reference(&sctx->eop_bug_scratch, NULL); LLVMDisposeTargetMachine(sctx->tm); @@ -120,7 +220,50 @@ si_amdgpu_get_reset_status(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; - return sctx->b.ws->ctx_query_reset_status(sctx->b.ctx); + return sctx->ws->ctx_query_reset_status(sctx->ctx); +} + +static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx) +{ + struct si_context *sctx = (struct si_context *)ctx; + unsigned latest = sctx->ws->query_value(sctx->ws, + RADEON_GPU_RESET_COUNTER); + + if (sctx->gpu_reset_counter == latest) + return PIPE_NO_RESET; + + sctx->gpu_reset_counter = latest; + return PIPE_UNKNOWN_CONTEXT_RESET; +} + +static void si_set_device_reset_callback(struct pipe_context *ctx, + const struct pipe_device_reset_callback *cb) +{ + struct si_context *sctx = (struct si_context *)ctx; + + if (cb) + sctx->device_reset_callback = *cb; + else + memset(&sctx->device_reset_callback, 0, + sizeof(sctx->device_reset_callback)); +} + +bool si_check_device_reset(struct si_context *sctx) +{ + enum pipe_reset_status status; + + if (!sctx->device_reset_callback.reset) + return false; + + if (!sctx->b.get_device_reset_status) + return false; + + status = sctx->b.get_device_reset_status(&sctx->b); + if (status == PIPE_NO_RESET) + return false; + + sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status); + return true; } /* Apitrace profiling: @@ -138,20 +281,20 @@ static void si_emit_string_marker(struct pipe_context *ctx, dd_parse_apitrace_marker(string, len, &sctx->apitrace_call_number); - if (sctx->b.log) - u_log_printf(sctx->b.log, "\nString marker: %*s\n", len, string); + if (sctx->log) + u_log_printf(sctx->log, "\nString marker: %*s\n", len, string); } static LLVMTargetMachineRef si_create_llvm_target_machine(struct si_screen *sscreen) { enum ac_target_machine_options tm_options = - (sscreen->b.debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) | - (sscreen->b.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) | - (sscreen->b.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) | + (sscreen->debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) | + (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) | + (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) | (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0); - return ac_create_target_machine(sscreen->b.family, tm_options); + return ac_create_target_machine(sscreen->info.family, tm_options); } static void si_set_debug_callback(struct pipe_context *ctx, @@ -173,7 +316,7 @@ static void si_set_log_context(struct pipe_context *ctx, struct u_log_context *log) { struct si_context *sctx = (struct si_context *)ctx; - sctx->b.log = log; + sctx->log = log; if (log) u_log_add_auto_logger(log, si_auto_log_cs, sctx); @@ -184,7 +327,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, { struct si_context *sctx = CALLOC_STRUCT(si_context); struct si_screen* sscreen = (struct si_screen *)screen; - struct radeon_winsys *ws = sscreen->b.ws; + struct radeon_winsys *ws = sscreen->ws; int shader, i; if (!sctx) @@ -193,22 +336,84 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, if (flags & PIPE_CONTEXT_DEBUG) sscreen->record_llvm_ir = true; /* racy but not critical */ - sctx->b.b.screen = screen; /* this must be set first */ - sctx->b.b.priv = NULL; - sctx->b.b.destroy = si_destroy_context; - sctx->b.b.emit_string_marker = si_emit_string_marker; - sctx->b.b.set_debug_callback = si_set_debug_callback; - sctx->b.b.set_log_context = si_set_log_context; - sctx->b.set_atom_dirty = (void *)si_set_atom_dirty; + sctx->b.screen = screen; /* this must be set first */ + sctx->b.priv = NULL; + sctx->b.destroy = si_destroy_context; + sctx->b.emit_string_marker = si_emit_string_marker; + sctx->b.set_debug_callback = si_set_debug_callback; + sctx->b.set_log_context = si_set_log_context; sctx->screen = sscreen; /* Easy accessing of screen/winsys. */ sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0; - if (!si_common_context_init(&sctx->b, &sscreen->b, flags)) + slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers); + slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers); + + sctx->ws = sscreen->ws; + sctx->family = sscreen->info.family; + sctx->chip_class = sscreen->info.chip_class; + + if (sscreen->info.drm_major == 2 && sscreen->info.drm_minor >= 43) { + sctx->b.get_device_reset_status = si_get_reset_status; + sctx->gpu_reset_counter = + sctx->ws->query_value(sctx->ws, + RADEON_GPU_RESET_COUNTER); + } + + sctx->b.set_device_reset_callback = si_set_device_reset_callback; + + si_init_context_texture_functions(sctx); + si_init_query_functions(sctx); + + if (sctx->chip_class == CIK || + sctx->chip_class == VI || + sctx->chip_class == GFX9) { + sctx->eop_bug_scratch = (struct r600_resource*) + pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, + 16 * sscreen->info.num_render_backends); + if (!sctx->eop_bug_scratch) + goto fail; + } + + sctx->allocator_zeroed_memory = + u_suballocator_create(&sctx->b, sscreen->info.gart_page_size, + 0, PIPE_USAGE_DEFAULT, 0, true); + if (!sctx->allocator_zeroed_memory) goto fail; - if (sscreen->b.info.drm_major == 3) - sctx->b.b.get_device_reset_status = si_amdgpu_get_reset_status; + sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024, + 0, PIPE_USAGE_STREAM, + SI_RESOURCE_FLAG_READ_ONLY); + if (!sctx->b.stream_uploader) + goto fail; + sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024, + 0, PIPE_USAGE_DEFAULT, + SI_RESOURCE_FLAG_32BIT | + (sscreen->cpdma_prefetch_writes_memory ? + 0 : SI_RESOURCE_FLAG_READ_ONLY)); + if (!sctx->b.const_uploader) + goto fail; + + sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, + 0, PIPE_USAGE_STAGING, 0); + if (!sctx->cached_gtt_allocator) + goto fail; + + sctx->ctx = sctx->ws->ctx_create(sctx->ws); + if (!sctx->ctx) + goto fail; + + if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) { + sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, + (void*)si_flush_dma_cs, + sctx); + } + + if (sscreen->info.drm_major == 3) + sctx->b.get_device_reset_status = si_amdgpu_get_reset_status; + + si_init_buffer_functions(sctx); + si_init_clear_functions(sctx); si_init_blit_functions(sctx); si_init_compute_functions(sctx); si_init_cp_dma_functions(sctx); @@ -216,17 +421,16 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, si_init_msaa_functions(sctx); si_init_streamout_functions(sctx); - if (sscreen->b.info.has_hw_decode) { - sctx->b.b.create_video_codec = si_uvd_create_decoder; - sctx->b.b.create_video_buffer = si_video_buffer_create; + if (sscreen->info.has_hw_decode) { + sctx->b.create_video_codec = si_uvd_create_decoder; + sctx->b.create_video_buffer = si_video_buffer_create; } else { - sctx->b.b.create_video_codec = vl_create_decoder; - sctx->b.b.create_video_buffer = vl_video_buffer_create; + sctx->b.create_video_codec = vl_create_decoder; + sctx->b.create_video_buffer = vl_video_buffer_create; } - sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, - si_context_gfx_flush, sctx); - sctx->b.gfx.flush = si_context_gfx_flush; + sctx->gfx_cs = ws->cs_create(sctx->ctx, RING_GFX, + (void*)si_flush_gfx_cs, sctx); /* Border colors. */ sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * @@ -254,15 +458,15 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, si_init_viewport_functions(sctx); si_init_ia_multi_vgt_param_table(sctx); - if (sctx->b.chip_class >= CIK) + if (sctx->chip_class >= CIK) cik_init_sdma_functions(sctx); else si_init_dma_functions(sctx); - if (sscreen->b.debug_flags & DBG(FORCE_DMA)) - sctx->b.b.resource_copy_region = sctx->b.dma_copy; + if (sscreen->debug_flags & DBG(FORCE_DMA)) + sctx->b.resource_copy_region = sctx->dma_copy; - sctx->blitter = util_blitter_create(&sctx->b.b); + sctx->blitter = util_blitter_create(&sctx->b); if (sctx->blitter == NULL) goto fail; sctx->blitter->draw_rectangle = si_draw_rectangle; @@ -271,16 +475,16 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->sample_mask.sample_mask = 0xffff; /* these must be last */ - si_begin_new_cs(sctx); + si_begin_new_gfx_cs(sctx); - if (sctx->b.chip_class >= GFX9) { + if (sctx->chip_class >= GFX9) { sctx->wait_mem_scratch = (struct r600_resource*) pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4); if (!sctx->wait_mem_scratch) goto fail; /* Initialize the memory. */ - struct radeon_winsys_cs *cs = sctx->b.gfx.cs; + struct radeon_winsys_cs *cs = sctx->gfx_cs; radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) | S_370_WR_CONFIRM(1) | @@ -292,19 +496,19 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, /* CIK cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */ - if (sctx->b.chip_class == CIK) { + if (sctx->chip_class == CIK) { sctx->null_const_buf.buffer = si_aligned_buffer_create(screen, - R600_RESOURCE_FLAG_UNMAPPABLE, + SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT, 16, - sctx->screen->b.info.tcc_cache_line_size); + sctx->screen->info.tcc_cache_line_size); if (!sctx->null_const_buf.buffer) goto fail; sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0; for (shader = 0; shader < SI_NUM_SHADERS; shader++) { for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) { - sctx->b.b.set_constant_buffer(&sctx->b.b, shader, i, + sctx->b.set_constant_buffer(&sctx->b, shader, i, &sctx->null_const_buf); } } @@ -321,9 +525,9 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, &sctx->null_const_buf); /* Clear the NULL constant buffer, because loads should return zeros. */ - sctx->b.clear_buffer(&sctx->b.b, sctx->null_const_buf.buffer, 0, - sctx->null_const_buf.buffer->width0, 0, - R600_COHERENCY_SHADER); + si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, + sctx->null_const_buf.buffer->width0, 0, + SI_COHERENCY_SHADER); } uint64_t max_threads_per_block; @@ -343,7 +547,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, * bring much benefit, but they still occupy chip resources (think * async compute). I've seen ~2% performance difference between 4 and 32. */ - sctx->scratch_waves = MAX2(32 * sscreen->b.info.num_good_compute_units, + sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64); sctx->tm = si_create_llvm_target_machine(sscreen); @@ -360,10 +564,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL); util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL); - return &sctx->b.b; + return &sctx->b; fail: fprintf(stderr, "radeonsi: Failed to create a context.\n"); - si_destroy_context(&sctx->b.b); + si_destroy_context(&sctx->b); return NULL; } @@ -373,7 +577,7 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, struct si_screen *sscreen = (struct si_screen *)screen; struct pipe_context *ctx; - if (sscreen->b.debug_flags & DBG(CHECK_VM)) + if (sscreen->debug_flags & DBG(CHECK_VM)) flags |= PIPE_CONTEXT_DEBUG; ctx = si_create_context(screen, flags); @@ -381,457 +585,26 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, if (!(flags & PIPE_CONTEXT_PREFER_THREADED)) return ctx; - /* Clover (compute-only) is unsupported. - * - * Since the threaded context creates shader states from the non-driver - * thread, asynchronous compilation is required for create_{shader}_- - * state not to use pipe_context. Debug contexts (ddebug) disable - * asynchronous compilation, so don't use the threaded context with - * those. - */ - if (flags & (PIPE_CONTEXT_COMPUTE_ONLY | PIPE_CONTEXT_DEBUG)) + /* Clover (compute-only) is unsupported. */ + if (flags & PIPE_CONTEXT_COMPUTE_ONLY) return ctx; /* When shaders are logged to stderr, asynchronous compilation is * disabled too. */ - if (sscreen->b.debug_flags & DBG_ALL_SHADERS) + if (sscreen->debug_flags & DBG_ALL_SHADERS) return ctx; - return threaded_context_create(ctx, &sscreen->b.pool_transfers, + /* Use asynchronous flushes only on amdgpu, since the radeon + * implementation for fence_server_sync is incomplete. */ + return threaded_context_create(ctx, &sscreen->pool_transfers, si_replace_buffer_storage, - &((struct si_context*)ctx)->b.tc); + sscreen->info.drm_major >= 3 ? si_create_fence : NULL, + &((struct si_context*)ctx)->tc); } /* * pipe_screen */ -static bool si_have_tgsi_compute(struct si_screen *sscreen) -{ - /* Old kernels disallowed some register writes for SI - * that are used for indirect dispatches. */ - return (sscreen->b.chip_class >= CIK || - sscreen->b.info.drm_major == 3 || - (sscreen->b.info.drm_major == 2 && - sscreen->b.info.drm_minor >= 45)); -} - -static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) -{ - struct si_screen *sscreen = (struct si_screen *)pscreen; - - switch (param) { - /* Supported features (boolean caps). */ - case PIPE_CAP_ACCELERATED: - case PIPE_CAP_TWO_SIDED_STENCIL: - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - case PIPE_CAP_ANISOTROPIC_FILTER: - case PIPE_CAP_POINT_SPRITE: - case PIPE_CAP_OCCLUSION_QUERY: - case PIPE_CAP_TEXTURE_SHADOW_MAP: - case PIPE_CAP_TEXTURE_MIRROR_CLAMP: - case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_DEPTH_CLIP_DISABLE: - case PIPE_CAP_SHADER_STENCIL_EXPORT: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - case PIPE_CAP_SM3: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_PRIMITIVE_RESTART: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_TEXTURE_BARRIER: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: - case PIPE_CAP_USER_CONSTANT_BUFFERS: - case PIPE_CAP_START_INSTANCE: - case PIPE_CAP_NPOT_TEXTURES: - case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: - case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: - case PIPE_CAP_VERTEX_COLOR_CLAMPED: - case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_COMPUTE: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: - case PIPE_CAP_QUERY_PIPELINE_STATISTICS: - case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_SAMPLE_SHADING: - case PIPE_CAP_DRAW_INDIRECT: - case PIPE_CAP_CLIP_HALFZ: - case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_TGSI_TEXCOORD: - case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_DEPTH_BOUNDS_TEST: - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_TEXTURE_QUERY_LOD: - case PIPE_CAP_TEXTURE_GATHER_SM5: - case PIPE_CAP_TGSI_TXQS: - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_INVALIDATE_BUFFER: - case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: - case PIPE_CAP_QUERY_MEMORY_INFO: - case PIPE_CAP_TGSI_PACK_HALF_FLOAT: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: - case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_POLYGON_OFFSET_UNITS_UNSCALED: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_CLEAR_TEXTURE: - case PIPE_CAP_CULL_DISTANCE: - case PIPE_CAP_TGSI_ARRAY_COMPONENTS: - case PIPE_CAP_TGSI_CAN_READ_OUTPUTS: - case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_DOUBLES: - case PIPE_CAP_TGSI_TEX_TXF_LZ: - case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT: - case PIPE_CAP_BINDLESS_TEXTURE: - case PIPE_CAP_QUERY_TIMESTAMP: - case PIPE_CAP_QUERY_TIME_ELAPSED: - case PIPE_CAP_NIR_SAMPLERS_AS_DEREF: - case PIPE_CAP_QUERY_SO_OVERFLOW: - case PIPE_CAP_MEMOBJ: - case PIPE_CAP_LOAD_CONSTBUF: - case PIPE_CAP_INT64: - case PIPE_CAP_INT64_DIVMOD: - case PIPE_CAP_TGSI_CLOCK: - case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: - case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: - case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS: - case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET: - return 1; - - case PIPE_CAP_TGSI_VOTE: - return HAVE_LLVM >= 0x0400; - - case PIPE_CAP_TGSI_BALLOT: - return HAVE_LLVM >= 0x0500; - - case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: - return !SI_BIG_ENDIAN && sscreen->b.info.has_userptr; - - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - return (sscreen->b.info.drm_major == 2 && - sscreen->b.info.drm_minor >= 43) || - sscreen->b.info.drm_major == 3; - - case PIPE_CAP_TEXTURE_MULTISAMPLE: - /* 2D tiling on CIK is supported since DRM 2.35.0 */ - return sscreen->b.chip_class < CIK || - (sscreen->b.info.drm_major == 2 && - sscreen->b.info.drm_minor >= 35) || - sscreen->b.info.drm_major == 3; - - case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: - return R600_MAP_BUFFER_ALIGNMENT; - - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - case PIPE_CAP_MAX_VERTEX_STREAMS: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - return 4; - - case PIPE_CAP_GLSL_FEATURE_LEVEL: - if (sscreen->b.debug_flags & DBG(NIR)) - return 140; /* no geometry and tessellation shaders yet */ - if (si_have_tgsi_compute(sscreen)) - return 450; - return 420; - - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: - return MIN2(sscreen->b.info.max_alloc_size, INT_MAX); - - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - /* SI doesn't support unaligned loads. - * CIK needs DRM 2.50.0 on radeon. */ - return sscreen->b.chip_class == SI || - (sscreen->b.info.drm_major == 2 && - sscreen->b.info.drm_minor < 50); - - case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE: - /* TODO: GFX9 hangs. */ - if (sscreen->b.chip_class >= GFX9) - return 0; - /* Disable on SI due to VM faults in CP DMA. Enable once these - * faults are mitigated in software. - */ - if (sscreen->b.chip_class >= CIK && - sscreen->b.info.drm_major == 3 && - sscreen->b.info.drm_minor >= 13) - return RADEON_SPARSE_PAGE_SIZE; - return 0; - - /* Unsupported features. */ - case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: - case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: - case PIPE_CAP_USER_VERTEX_BUFFERS: - case PIPE_CAP_FAKE_SW_MSAA: - case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_VERTEXID_NOBASE: - case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES: - case PIPE_CAP_MAX_WINDOW_RECTANGLES: - case PIPE_CAP_TGSI_FS_FBFETCH: - case PIPE_CAP_TGSI_MUL_ZERO_WINS: - case PIPE_CAP_UMA: - case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE: - case PIPE_CAP_POST_DEPTH_COVERAGE: - case PIPE_CAP_TILE_RASTER_ORDER: - case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES: - return 0; - - case PIPE_CAP_NATIVE_FENCE_FD: - return sscreen->b.info.has_sync_file; - - case PIPE_CAP_QUERY_BUFFER_OBJECT: - return si_have_tgsi_compute(sscreen); - - case PIPE_CAP_DRAW_PARAMETERS: - case PIPE_CAP_MULTI_DRAW_INDIRECT: - case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - return sscreen->has_draw_indirect_multi; - - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - return 30; - - case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: - return sscreen->b.chip_class <= VI ? - PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; - - /* Stream output. */ - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - return 32*4; - - /* Geometry shader output. */ - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - return 1024; - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - return 4095; - - case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: - return 2048; - - /* Texturing. */ - case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: - case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - return 15; /* 16384 */ - case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - /* textures support 8192, but layered rendering supports 2048 */ - return 12; - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - /* textures support 8192, but layered rendering supports 2048 */ - return 2048; - - /* Viewports and render targets. */ - case PIPE_CAP_MAX_VIEWPORTS: - return SI_MAX_VIEWPORTS; - case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS: - case PIPE_CAP_MAX_RENDER_TARGETS: - return 8; - - case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MIN_TEXEL_OFFSET: - return -32; - - case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: - case PIPE_CAP_MAX_TEXEL_OFFSET: - return 31; - - case PIPE_CAP_ENDIANNESS: - return PIPE_ENDIAN_LITTLE; - - case PIPE_CAP_VENDOR_ID: - return ATI_VENDOR_ID; - case PIPE_CAP_DEVICE_ID: - return sscreen->b.info.pci_id; - case PIPE_CAP_VIDEO_MEMORY: - return sscreen->b.info.vram_size >> 20; - case PIPE_CAP_PCI_GROUP: - return sscreen->b.info.pci_domain; - case PIPE_CAP_PCI_BUS: - return sscreen->b.info.pci_bus; - case PIPE_CAP_PCI_DEVICE: - return sscreen->b.info.pci_dev; - case PIPE_CAP_PCI_FUNCTION: - return sscreen->b.info.pci_func; - } - return 0; -} - -static int si_get_shader_param(struct pipe_screen* pscreen, - enum pipe_shader_type shader, - enum pipe_shader_cap param) -{ - struct si_screen *sscreen = (struct si_screen *)pscreen; - - switch(shader) - { - case PIPE_SHADER_FRAGMENT: - case PIPE_SHADER_VERTEX: - case PIPE_SHADER_GEOMETRY: - case PIPE_SHADER_TESS_CTRL: - case PIPE_SHADER_TESS_EVAL: - break; - case PIPE_SHADER_COMPUTE: - switch (param) { - case PIPE_SHADER_CAP_PREFERRED_IR: - return PIPE_SHADER_IR_NATIVE; - - case PIPE_SHADER_CAP_SUPPORTED_IRS: { - int ir = 1 << PIPE_SHADER_IR_NATIVE; - - if (si_have_tgsi_compute(sscreen)) - ir |= 1 << PIPE_SHADER_IR_TGSI; - - return ir; - } - - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: { - uint64_t max_const_buffer_size; - pscreen->get_compute_param(pscreen, PIPE_SHADER_IR_TGSI, - PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE, - &max_const_buffer_size); - return MIN2(max_const_buffer_size, INT_MAX); - } - default: - /* If compute shaders don't require a special value - * for this cap, we can return the same value we - * do for other shader types. */ - break; - } - break; - default: - return 0; - } - - switch (param) { - /* Shader limits. */ - case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: - case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: - return 16384; - case PIPE_SHADER_CAP_MAX_INPUTS: - return shader == PIPE_SHADER_VERTEX ? SI_MAX_ATTRIBS : 32; - case PIPE_SHADER_CAP_MAX_OUTPUTS: - return shader == PIPE_SHADER_FRAGMENT ? 8 : 32; - case PIPE_SHADER_CAP_MAX_TEMPS: - return 256; /* Max native temporaries. */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: - return 4096 * sizeof(float[4]); /* actually only memory limits this */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - return SI_NUM_CONST_BUFFERS; - case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: - case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return SI_NUM_SAMPLERS; - case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - return SI_NUM_SHADER_BUFFERS; - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - return SI_NUM_IMAGES; - case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - return 32; - case PIPE_SHADER_CAP_PREFERRED_IR: - if (sscreen->b.debug_flags & DBG(NIR) && - (shader == PIPE_SHADER_VERTEX || - shader == PIPE_SHADER_FRAGMENT)) - return PIPE_SHADER_IR_NIR; - return PIPE_SHADER_IR_TGSI; - case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: - return 4; - - /* Supported boolean features. */ - case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: - case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - case PIPE_SHADER_CAP_INTEGERS: - case PIPE_SHADER_CAP_INT64_ATOMICS: - case PIPE_SHADER_CAP_FP16: - case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: - case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: - case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: - return 1; - - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: - /* TODO: Indirect indexing of GS inputs is unimplemented. */ - return shader != PIPE_SHADER_GEOMETRY && - (sscreen->llvm_has_working_vgpr_indexing || - /* TCS and TES load inputs directly from LDS or - * offchip memory, so indirect indexing is trivial. */ - shader == PIPE_SHADER_TESS_CTRL || - shader == PIPE_SHADER_TESS_EVAL); - - case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: - return sscreen->llvm_has_working_vgpr_indexing || - /* TCS stores outputs directly to memory. */ - shader == PIPE_SHADER_TESS_CTRL; - - /* Unsupported boolean features. */ - case PIPE_SHADER_CAP_SUBROUTINES: - case PIPE_SHADER_CAP_SUPPORTED_IRS: - return 0; - } - return 0; -} - -static const struct nir_shader_compiler_options nir_options = { - .vertex_id_zero_based = true, - .lower_scmp = true, - .lower_flrp32 = true, - .lower_fsat = true, - .lower_fdiv = true, - .lower_sub = true, - .lower_ffma = true, - .lower_pack_snorm_2x16 = true, - .lower_pack_snorm_4x8 = true, - .lower_pack_unorm_2x16 = true, - .lower_pack_unorm_4x8 = true, - .lower_unpack_snorm_2x16 = true, - .lower_unpack_snorm_4x8 = true, - .lower_unpack_unorm_2x16 = true, - .lower_unpack_unorm_4x8 = true, - .lower_extract_byte = true, - .lower_extract_word = true, - .max_unroll_iterations = 32, - .native_integers = true, -}; - -static const void * -si_get_compiler_options(struct pipe_screen *screen, - enum pipe_shader_ir ir, - enum pipe_shader_type shader) -{ - assert(ir == PIPE_SHADER_IR_NIR); - return &nir_options; -} - static void si_destroy_screen(struct pipe_screen* pscreen) { struct si_screen *sscreen = (struct si_screen *)pscreen; @@ -844,7 +617,7 @@ static void si_destroy_screen(struct pipe_screen* pscreen) }; unsigned i; - if (!sscreen->b.ws->unref(sscreen->b.ws)) + if (!sscreen->ws->unref(sscreen->ws)) return; util_queue_destroy(&sscreen->shader_compiler_queue); @@ -864,22 +637,34 @@ static void si_destroy_screen(struct pipe_screen* pscreen) struct si_shader_part *part = parts[i]; parts[i] = part->next; - si_radeon_shader_binary_clean(&part->binary); + ac_shader_binary_clean(&part->binary); FREE(part); } } mtx_destroy(&sscreen->shader_parts_mutex); si_destroy_shader_cache(sscreen); - si_destroy_common_screen(&sscreen->b); + + si_perfcounters_destroy(sscreen); + si_gpu_load_kill_thread(sscreen); + + mtx_destroy(&sscreen->gpu_load_mutex); + mtx_destroy(&sscreen->aux_context_lock); + sscreen->aux_context->destroy(sscreen->aux_context); + + slab_destroy_parent(&sscreen->pool_transfers); + + disk_cache_destroy(sscreen->disk_shader_cache); + sscreen->ws->destroy(sscreen->ws); + FREE(sscreen); } static bool si_init_gs_info(struct si_screen *sscreen) { /* gs_table_depth is not used by GFX9 */ - if (sscreen->b.chip_class >= GFX9) + if (sscreen->info.chip_class >= GFX9) return true; - switch (sscreen->b.family) { + switch (sscreen->info.family) { case CHIP_OLAND: case CHIP_HAINAN: case CHIP_KAVERI: @@ -918,16 +703,16 @@ static void si_handle_env_var_force_family(struct si_screen *sscreen) for (i = CHIP_TAHITI; i < CHIP_LAST; i++) { if (!strcmp(family, ac_get_llvm_processor_name(i))) { /* Override family and chip_class. */ - sscreen->b.family = sscreen->b.info.family = i; + sscreen->info.family = i; if (i >= CHIP_VEGA10) - sscreen->b.chip_class = sscreen->b.info.chip_class = GFX9; + sscreen->info.chip_class = GFX9; else if (i >= CHIP_TONGA) - sscreen->b.chip_class = sscreen->b.info.chip_class = VI; + sscreen->info.chip_class = VI; else if (i >= CHIP_BONAIRE) - sscreen->b.chip_class = sscreen->b.info.chip_class = CIK; + sscreen->info.chip_class = CIK; else - sscreen->b.chip_class = sscreen->b.info.chip_class = SI; + sscreen->info.chip_class = SI; /* Don't submit any IBs. */ setenv("RADEON_NOOP", "1", 1); @@ -941,10 +726,10 @@ static void si_handle_env_var_force_family(struct si_screen *sscreen) static void si_test_vmfault(struct si_screen *sscreen) { - struct pipe_context *ctx = sscreen->b.aux_context; + struct pipe_context *ctx = sscreen->aux_context; struct si_context *sctx = (struct si_context *)ctx; struct pipe_resource *buf = - pipe_buffer_create(&sscreen->b.b, 0, PIPE_USAGE_DEFAULT, 64); + pipe_buffer_create_const0(&sscreen->b, 0, PIPE_USAGE_DEFAULT, 64); if (!buf) { puts("Buffer allocation failed."); @@ -953,33 +738,64 @@ static void si_test_vmfault(struct si_screen *sscreen) r600_resource(buf)->gpu_address = 0; /* cause a VM fault */ - if (sscreen->b.debug_flags & DBG(TEST_VMFAULT_CP)) { + if (sscreen->debug_flags & DBG(TEST_VMFAULT_CP)) { si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0); ctx->flush(ctx, NULL, 0); puts("VM fault test: CP - done."); } - if (sscreen->b.debug_flags & DBG(TEST_VMFAULT_SDMA)) { - sctx->b.dma_clear_buffer(ctx, buf, 0, 4, 0); + if (sscreen->debug_flags & DBG(TEST_VMFAULT_SDMA)) { + sctx->dma_clear_buffer(sctx, buf, 0, 4, 0); ctx->flush(ctx, NULL, 0); puts("VM fault test: SDMA - done."); } - if (sscreen->b.debug_flags & DBG(TEST_VMFAULT_SHADER)) { + if (sscreen->debug_flags & DBG(TEST_VMFAULT_SHADER)) { util_test_constant_buffer(ctx, buf); puts("VM fault test: Shader - done."); } exit(0); } -static void radeonsi_get_driver_uuid(struct pipe_screen *pscreen, char *uuid) +static void si_disk_cache_create(struct si_screen *sscreen) { - ac_compute_driver_uuid(uuid, PIPE_UUID_SIZE); -} + /* Don't use the cache if shader dumping is enabled. */ + if (sscreen->debug_flags & DBG_ALL_SHADERS) + return; -static void radeonsi_get_device_uuid(struct pipe_screen *pscreen, char *uuid) -{ - struct r600_common_screen *rscreen = (struct r600_common_screen *)pscreen; + uint32_t mesa_timestamp; + if (disk_cache_get_function_timestamp(si_disk_cache_create, + &mesa_timestamp)) { + char *timestamp_str; + int res = -1; + uint32_t llvm_timestamp; + + if (disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo, + &llvm_timestamp)) { + res = asprintf(×tamp_str, "%u_%u", + mesa_timestamp, llvm_timestamp); + } - ac_compute_device_uuid(&rscreen->info, uuid, PIPE_UUID_SIZE); + if (res != -1) { + /* These flags affect shader compilation. */ + #define ALL_FLAGS (DBG(FS_CORRECT_DERIVS_AFTER_KILL) | \ + DBG(SI_SCHED) | \ + DBG(UNSAFE_MATH) | \ + DBG(NIR)) + uint64_t shader_debug_flags = sscreen->debug_flags & + ALL_FLAGS; + + /* Add the high bits of 32-bit addresses, which affects + * how 32-bit addresses are expanded to 64 bits. + */ + STATIC_ASSERT(ALL_FLAGS <= UINT_MAX); + shader_debug_flags |= (uint64_t)sscreen->info.address32_hi << 32; + + sscreen->disk_shader_cache = + disk_cache_create(si_get_family_name(sscreen), + timestamp_str, + shader_debug_flags); + free(timestamp_str); + } + } } struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, @@ -992,35 +808,57 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, return NULL; } + sscreen->ws = ws; + ws->query_info(ws, &sscreen->info); + + sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", + debug_options, 0); + /* Set functions first. */ - sscreen->b.b.context_create = si_pipe_create_context; - sscreen->b.b.destroy = si_destroy_screen; - sscreen->b.b.get_param = si_get_param; - sscreen->b.b.get_shader_param = si_get_shader_param; - sscreen->b.b.get_compiler_options = si_get_compiler_options; - sscreen->b.b.get_device_uuid = radeonsi_get_device_uuid; - sscreen->b.b.get_driver_uuid = radeonsi_get_driver_uuid; - sscreen->b.b.resource_create = si_resource_create_common; + sscreen->b.context_create = si_pipe_create_context; + sscreen->b.destroy = si_destroy_screen; + si_init_screen_get_functions(sscreen); + si_init_screen_buffer_functions(sscreen); si_init_screen_fence_functions(sscreen); si_init_screen_state_functions(sscreen); + si_init_screen_texture_functions(sscreen); + si_init_screen_query_functions(sscreen); /* Set these flags in debug_flags early, so that the shader cache takes * them into account. */ if (driQueryOptionb(config->options, "glsl_correct_derivatives_after_discard")) - sscreen->b.debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); + sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); if (driQueryOptionb(config->options, "radeonsi_enable_sisched")) - sscreen->b.debug_flags |= DBG(SI_SCHED); + sscreen->debug_flags |= DBG(SI_SCHED); - if (!si_common_screen_init(&sscreen->b, ws) || - !si_init_gs_info(sscreen) || + + if (sscreen->debug_flags & DBG(INFO)) + ac_print_gpu_info(&sscreen->info); + + slab_create_parent(&sscreen->pool_transfers, + sizeof(struct r600_transfer), 64); + + sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); + if (sscreen->force_aniso >= 0) { + printf("radeonsi: Forcing anisotropy filter to %ix\n", + /* round down to a power of two */ + 1 << util_logbase2(sscreen->force_aniso)); + } + + (void) mtx_init(&sscreen->aux_context_lock, mtx_plain); + (void) mtx_init(&sscreen->gpu_load_mutex, mtx_plain); + + if (!si_init_gs_info(sscreen) || !si_init_shader_cache(sscreen)) { FREE(sscreen); return NULL; } + si_disk_cache_create(sscreen); + /* Only enable as many threads as we have target machines, but at most * the number of CPUs - 1 if there is more than one. */ @@ -1053,100 +891,145 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false)) si_init_perfcounters(sscreen); + /* Determine tessellation ring info. */ + bool double_offchip_buffers = sscreen->info.chip_class >= CIK && + sscreen->info.family != CHIP_CARRIZO && + sscreen->info.family != CHIP_STONEY; + /* This must be one less than the maximum number due to a hw limitation. + * Various hardware bugs in SI, CIK, and GFX9 need this. + */ + unsigned max_offchip_buffers_per_se; + + /* Only certain chips can use the maximum value. */ + if (sscreen->info.family == CHIP_VEGA12) + max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64; + else + max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63; + + unsigned max_offchip_buffers = max_offchip_buffers_per_se * + sscreen->info.max_se; + unsigned offchip_granularity; + /* Hawaii has a bug with offchip buffers > 256 that can be worked * around by setting 4K granularity. */ - sscreen->tess_offchip_block_dw_size = - sscreen->b.family == CHIP_HAWAII ? 4096 : 8192; + if (sscreen->info.family == CHIP_HAWAII) { + sscreen->tess_offchip_block_dw_size = 4096; + offchip_granularity = V_03093C_X_4K_DWORDS; + } else { + sscreen->tess_offchip_block_dw_size = 8192; + offchip_granularity = V_03093C_X_8K_DWORDS; + } + + sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se; + assert(((sscreen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0); + sscreen->tess_offchip_ring_size = max_offchip_buffers * + sscreen->tess_offchip_block_dw_size * 4; + + if (sscreen->info.chip_class >= CIK) { + if (sscreen->info.chip_class >= VI) + --max_offchip_buffers; + sscreen->vgt_hs_offchip_param = + S_03093C_OFFCHIP_BUFFERING(max_offchip_buffers) | + S_03093C_OFFCHIP_GRANULARITY(offchip_granularity); + } else { + assert(offchip_granularity == V_03093C_X_8K_DWORDS); + sscreen->vgt_hs_offchip_param = + S_0089B0_OFFCHIP_BUFFERING(max_offchip_buffers); + } /* The mere presense of CLEAR_STATE in the IB causes random GPU hangs * on SI. */ - sscreen->has_clear_state = sscreen->b.chip_class >= CIK; + sscreen->has_clear_state = sscreen->info.chip_class >= CIK; sscreen->has_distributed_tess = - sscreen->b.chip_class >= VI && - sscreen->b.info.max_se >= 2; + sscreen->info.chip_class >= VI && + sscreen->info.max_se >= 2; sscreen->has_draw_indirect_multi = - (sscreen->b.family >= CHIP_POLARIS10) || - (sscreen->b.chip_class == VI && - sscreen->b.info.pfp_fw_version >= 121 && - sscreen->b.info.me_fw_version >= 87) || - (sscreen->b.chip_class == CIK && - sscreen->b.info.pfp_fw_version >= 211 && - sscreen->b.info.me_fw_version >= 173) || - (sscreen->b.chip_class == SI && - sscreen->b.info.pfp_fw_version >= 79 && - sscreen->b.info.me_fw_version >= 142); - - sscreen->has_out_of_order_rast = sscreen->b.chip_class >= VI && - sscreen->b.info.max_se >= 2 && - !(sscreen->b.debug_flags & DBG(NO_OUT_OF_ORDER)); + (sscreen->info.family >= CHIP_POLARIS10) || + (sscreen->info.chip_class == VI && + sscreen->info.pfp_fw_version >= 121 && + sscreen->info.me_fw_version >= 87) || + (sscreen->info.chip_class == CIK && + sscreen->info.pfp_fw_version >= 211 && + sscreen->info.me_fw_version >= 173) || + (sscreen->info.chip_class == SI && + sscreen->info.pfp_fw_version >= 79 && + sscreen->info.me_fw_version >= 142); + + sscreen->has_out_of_order_rast = sscreen->info.chip_class >= VI && + sscreen->info.max_se >= 2 && + !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER)); sscreen->assume_no_z_fights = driQueryOptionb(config->options, "radeonsi_assume_no_z_fights"); sscreen->commutative_blend_add = driQueryOptionb(config->options, "radeonsi_commutative_blend_add"); sscreen->clear_db_cache_before_clear = driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear"); - sscreen->has_msaa_sample_loc_bug = (sscreen->b.family >= CHIP_POLARIS10 && - sscreen->b.family <= CHIP_POLARIS12) || - sscreen->b.family == CHIP_VEGA10 || - sscreen->b.family == CHIP_RAVEN; - sscreen->has_ls_vgpr_init_bug = sscreen->b.family == CHIP_VEGA10 || - sscreen->b.family == CHIP_RAVEN; - - if (sscreen->b.debug_flags & DBG(DPBB)) { + sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 && + sscreen->info.family <= CHIP_POLARIS12) || + sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_RAVEN; + sscreen->has_ls_vgpr_init_bug = sscreen->info.family == CHIP_VEGA10 || + sscreen->info.family == CHIP_RAVEN; + + if (sscreen->debug_flags & DBG(DPBB)) { sscreen->dpbb_allowed = true; } else { /* Only enable primitive binning on Raven by default. */ - sscreen->dpbb_allowed = sscreen->b.family == CHIP_RAVEN && - !(sscreen->b.debug_flags & DBG(NO_DPBB)); + /* TODO: Investigate if binning is profitable on Vega12. */ + sscreen->dpbb_allowed = sscreen->info.family == CHIP_RAVEN && + !(sscreen->debug_flags & DBG(NO_DPBB)); } - if (sscreen->b.debug_flags & DBG(DFSM)) { + if (sscreen->debug_flags & DBG(DFSM)) { sscreen->dfsm_allowed = sscreen->dpbb_allowed; } else { sscreen->dfsm_allowed = sscreen->dpbb_allowed && - !(sscreen->b.debug_flags & DBG(NO_DFSM)); + !(sscreen->debug_flags & DBG(NO_DFSM)); } /* While it would be nice not to have this flag, we are constrained * by the reality that LLVM 5.0 doesn't have working VGPR indexing * on GFX9. */ - sscreen->llvm_has_working_vgpr_indexing = sscreen->b.chip_class <= VI; - - sscreen->b.has_cp_dma = true; - sscreen->b.has_streamout = true; + sscreen->llvm_has_working_vgpr_indexing = sscreen->info.chip_class <= VI; /* Some chips have RB+ registers, but don't support RB+. Those must * always disable it. */ - if (sscreen->b.family == CHIP_STONEY || - sscreen->b.chip_class >= GFX9) { - sscreen->b.has_rbplus = true; - - sscreen->b.rbplus_allowed = - !(sscreen->b.debug_flags & DBG(NO_RB_PLUS)) && - (sscreen->b.family == CHIP_STONEY || - sscreen->b.family == CHIP_RAVEN); + if (sscreen->info.family == CHIP_STONEY || + sscreen->info.chip_class >= GFX9) { + sscreen->has_rbplus = true; + + sscreen->rbplus_allowed = + !(sscreen->debug_flags & DBG(NO_RB_PLUS)) && + (sscreen->info.family == CHIP_STONEY || + sscreen->info.family == CHIP_VEGA12 || + sscreen->info.family == CHIP_RAVEN); } + sscreen->dcc_msaa_allowed = + !(sscreen->debug_flags & DBG(NO_DCC_MSAA)) && + (sscreen->debug_flags & DBG(DCC_MSAA) || + sscreen->info.chip_class == VI); + + sscreen->cpdma_prefetch_writes_memory = sscreen->info.chip_class <= VI; + (void) mtx_init(&sscreen->shader_parts_mutex, mtx_plain); sscreen->use_monolithic_shaders = - (sscreen->b.debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; + (sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0; - sscreen->b.barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 | + sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_INV_VMEM_L1; - if (sscreen->b.chip_class <= VI) { - sscreen->b.barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2; - sscreen->b.barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; + if (sscreen->info.chip_class <= VI) { + sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2; + sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; } - sscreen->b.barrier_flags.compute_to_L2 = SI_CONTEXT_CS_PARTIAL_FLUSH; - if (debug_get_bool_option("RADEON_DUMP_SHADERS", false)) - sscreen->b.debug_flags |= DBG_ALL_SHADERS; + sscreen->debug_flags |= DBG_ALL_SHADERS; for (i = 0; i < num_compiler_threads; i++) sscreen->tm[i] = si_create_llvm_target_machine(sscreen); @@ -1154,15 +1037,15 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, sscreen->tm_low_priority[i] = si_create_llvm_target_machine(sscreen); /* Create the auxiliary context. This must be done last. */ - sscreen->b.aux_context = si_create_context(&sscreen->b.b, 0); + sscreen->aux_context = si_create_context(&sscreen->b, 0); - if (sscreen->b.debug_flags & DBG(TEST_DMA)) - si_test_dma(&sscreen->b); + if (sscreen->debug_flags & DBG(TEST_DMA)) + si_test_dma(sscreen); - if (sscreen->b.debug_flags & (DBG(TEST_VMFAULT_CP) | + if (sscreen->debug_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER))) si_test_vmfault(sscreen); - return &sscreen->b.b; + return &sscreen->b; }