- struct si_screen* sscreen = (struct si_screen *)screen;
-
- /* Don't create a context if it's not compute-only and hw is compute-only. */
- if (!sscreen->info.has_graphics &&
- !(flags & PIPE_CONTEXT_COMPUTE_ONLY))
- return NULL;
-
- struct si_context *sctx = CALLOC_STRUCT(si_context);
- struct radeon_winsys *ws = sscreen->ws;
- int shader, i;
- bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0;
-
- if (!sctx)
- return NULL;
-
- sctx->has_graphics = sscreen->info.chip_class == GFX6 ||
- !(flags & PIPE_CONTEXT_COMPUTE_ONLY);
-
- if (flags & PIPE_CONTEXT_DEBUG)
- sscreen->record_llvm_ir = true; /* racy but not critical */
-
- sctx->b.screen = screen; /* this must be set first */
- sctx->b.priv = NULL;
- sctx->b.destroy = si_destroy_context;
- sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
- sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0;
-
- slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers);
- slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers);
-
- sctx->ws = sscreen->ws;
- sctx->family = sscreen->info.family;
- sctx->chip_class = sscreen->info.chip_class;
-
- if (sctx->chip_class == GFX7 ||
- sctx->chip_class == GFX8 ||
- sctx->chip_class == GFX9) {
- sctx->eop_bug_scratch = si_resource(
- pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
- 16 * sscreen->info.num_render_backends));
- if (!sctx->eop_bug_scratch)
- goto fail;
- }
-
- /* Initialize context allocators. */
- sctx->allocator_zeroed_memory =
- u_suballocator_create(&sctx->b, 128 * 1024,
- 0, PIPE_USAGE_DEFAULT,
- SI_RESOURCE_FLAG_UNMAPPABLE |
- SI_RESOURCE_FLAG_CLEAR, false);
- if (!sctx->allocator_zeroed_memory)
- goto fail;
-
- sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
- 0, PIPE_USAGE_STREAM,
- SI_RESOURCE_FLAG_READ_ONLY);
- if (!sctx->b.stream_uploader)
- goto fail;
-
- sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
- 0, PIPE_USAGE_STAGING, 0);
- if (!sctx->cached_gtt_allocator)
- goto fail;
-
- sctx->ctx = sctx->ws->ctx_create(sctx->ws);
- if (!sctx->ctx)
- goto fail;
-
- if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
- sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
- (void*)si_flush_dma_cs,
- sctx, stop_exec_on_failure);
- }
-
- bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs;
- sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
- 0, PIPE_USAGE_DEFAULT,
- SI_RESOURCE_FLAG_32BIT |
- (use_sdma_upload ?
- SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
- if (!sctx->b.const_uploader)
- goto fail;
-
- if (use_sdma_upload)
- u_upload_enable_flush_explicit(sctx->b.const_uploader);
-
- sctx->gfx_cs = ws->cs_create(sctx->ctx,
- sctx->has_graphics ? RING_GFX : RING_COMPUTE,
- (void*)si_flush_gfx_cs, sctx, stop_exec_on_failure);
-
- /* Border colors. */
- sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
- sizeof(*sctx->border_color_table));
- if (!sctx->border_color_table)
- goto fail;
-
- sctx->border_color_buffer = si_resource(
- pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT,
- SI_MAX_BORDER_COLORS *
- sizeof(*sctx->border_color_table)));
- if (!sctx->border_color_buffer)
- goto fail;
-
- sctx->border_color_map =
- ws->buffer_map(sctx->border_color_buffer->buf,
- NULL, PIPE_TRANSFER_WRITE);
- if (!sctx->border_color_map)
- goto fail;
-
- sctx->ngg = sscreen->use_ngg;
-
- /* Initialize context functions used by graphics and compute. */
- if (sctx->chip_class >= GFX10)
- sctx->emit_cache_flush = gfx10_emit_cache_flush;
- else
- sctx->emit_cache_flush = si_emit_cache_flush;
-
- sctx->b.emit_string_marker = si_emit_string_marker;
- sctx->b.set_debug_callback = si_set_debug_callback;
- sctx->b.set_log_context = si_set_log_context;
- sctx->b.set_context_param = si_set_context_param;
- sctx->b.get_device_reset_status = si_get_reset_status;
- sctx->b.set_device_reset_callback = si_set_device_reset_callback;
-
- si_init_all_descriptors(sctx);
- si_init_buffer_functions(sctx);
- si_init_clear_functions(sctx);
- si_init_blit_functions(sctx);
- si_init_compute_functions(sctx);
- si_init_compute_blit_functions(sctx);
- si_init_debug_functions(sctx);
- si_init_fence_functions(sctx);
- si_init_query_functions(sctx);
- si_init_state_compute_functions(sctx);
- si_init_context_texture_functions(sctx);
-
- /* Initialize graphics-only context functions. */
- if (sctx->has_graphics) {
- if (sctx->chip_class >= GFX10)
- gfx10_init_query(sctx);
- si_init_msaa_functions(sctx);
- si_init_shader_functions(sctx);
- si_init_state_functions(sctx);
- si_init_streamout_functions(sctx);
- si_init_viewport_functions(sctx);
-
- sctx->blitter = util_blitter_create(&sctx->b);
- if (sctx->blitter == NULL)
- goto fail;
- sctx->blitter->skip_viewport_restore = true;
-
- /* Some states are expected to be always non-NULL. */
- sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter);
- sctx->queued.named.blend = sctx->noop_blend;
-
- sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter);
- sctx->queued.named.dsa = sctx->noop_dsa;
-
- sctx->discard_rasterizer_state =
- util_blitter_get_discard_rasterizer_state(sctx->blitter);
- sctx->queued.named.rasterizer = sctx->discard_rasterizer_state;
-
- si_init_draw_functions(sctx);
- si_initialize_prim_discard_tunables(sctx);
- }
-
- /* Initialize SDMA functions. */
- if (sctx->chip_class >= GFX7)
- cik_init_sdma_functions(sctx);
- else
- si_init_dma_functions(sctx);
-
- if (sscreen->debug_flags & DBG(FORCE_DMA))
- sctx->b.resource_copy_region = sctx->dma_copy;
-
- sctx->sample_mask = 0xffff;
-
- /* Initialize multimedia functions. */
- if (sscreen->info.has_hw_decode) {
- sctx->b.create_video_codec = si_uvd_create_decoder;
- sctx->b.create_video_buffer = si_video_buffer_create;
- } else {
- sctx->b.create_video_codec = vl_create_decoder;
- sctx->b.create_video_buffer = vl_video_buffer_create;
- }
-
- if (sctx->chip_class >= GFX9) {
- sctx->wait_mem_scratch = si_resource(
- pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8));
- if (!sctx->wait_mem_scratch)
- goto fail;
-
- /* Initialize the memory. */
- si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4,
- V_370_MEM, V_370_ME, &sctx->wait_mem_number);
- }
-
- /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
- * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
- if (sctx->chip_class == GFX7) {
- sctx->null_const_buf.buffer =
- pipe_aligned_buffer_create(screen,
- SI_RESOURCE_FLAG_32BIT,
- PIPE_USAGE_DEFAULT, 16,
- sctx->screen->info.tcc_cache_line_size);
- if (!sctx->null_const_buf.buffer)
- goto fail;
- sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0;
-
- unsigned start_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
- for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) {
- for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) {
- sctx->b.set_constant_buffer(&sctx->b, shader, i,
- &sctx->null_const_buf);
- }
- }
-
- si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
- &sctx->null_const_buf);
- si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
- &sctx->null_const_buf);
- si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
- &sctx->null_const_buf);
- si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
- &sctx->null_const_buf);
- si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
- &sctx->null_const_buf);
- }
-
- uint64_t max_threads_per_block;
- screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
- PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
- &max_threads_per_block);
-
- /* The maximum number of scratch waves. Scratch space isn't divided
- * evenly between CUs. The number is only a function of the number of CUs.
- * We can decrease the constant to decrease the scratch buffer size.
- *
- * sctx->scratch_waves must be >= the maximum posible size of
- * 1 threadgroup, so that the hw doesn't hang from being unable
- * to start any.
- *
- * The recommended value is 4 per CU at most. Higher numbers don't
- * bring much benefit, but they still occupy chip resources (think
- * async compute). I've seen ~2% performance difference between 4 and 32.
- */
- sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units,
- max_threads_per_block / 64);
-
- si_init_compiler(sscreen, &sctx->compiler);
-
- /* Bindless handles. */
- sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
- _mesa_key_pointer_equal);
- sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
- _mesa_key_pointer_equal);
-
- util_dynarray_init(&sctx->resident_tex_handles, NULL);
- util_dynarray_init(&sctx->resident_img_handles, NULL);
- util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL);
- util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL);
- util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL);
-
- sctx->sample_pos_buffer =
- pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT,
- sizeof(sctx->sample_positions));
- pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0,
- sizeof(sctx->sample_positions), &sctx->sample_positions);
-
- /* this must be last */
- si_begin_new_gfx_cs(sctx);
-
- if (sctx->chip_class == GFX7) {
- /* Clear the NULL constant buffer, because loads should return zeros.
- * Note that this forces CP DMA to be used, because clover deadlocks
- * for some reason when the compute codepath is used.
- */
- uint32_t clear_value = 0;
- si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
- sctx->null_const_buf.buffer->width0,
- &clear_value, 4, SI_COHERENCY_SHADER, true);
- }
- return &sctx->b;
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ STATIC_ASSERT(DBG_COUNT <= 64);
+
+ /* Don't create a context if it's not compute-only and hw is compute-only. */
+ if (!sscreen->info.has_graphics && !(flags & PIPE_CONTEXT_COMPUTE_ONLY))
+ return NULL;
+
+ struct si_context *sctx = CALLOC_STRUCT(si_context);
+ struct radeon_winsys *ws = sscreen->ws;
+ int shader, i;
+ bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0;
+
+ if (!sctx)
+ return NULL;
+
+ sctx->has_graphics = sscreen->info.chip_class == GFX6 || !(flags & PIPE_CONTEXT_COMPUTE_ONLY);
+
+ if (flags & PIPE_CONTEXT_DEBUG)
+ sscreen->record_llvm_ir = true; /* racy but not critical */
+
+ sctx->b.screen = screen; /* this must be set first */
+ sctx->b.priv = NULL;
+ sctx->b.destroy = si_destroy_context;
+ sctx->screen = sscreen; /* Easy accessing of screen/winsys. */
+ sctx->is_debug = (flags & PIPE_CONTEXT_DEBUG) != 0;
+
+ slab_create_child(&sctx->pool_transfers, &sscreen->pool_transfers);
+ slab_create_child(&sctx->pool_transfers_unsync, &sscreen->pool_transfers);
+
+ sctx->ws = sscreen->ws;
+ sctx->family = sscreen->info.family;
+ sctx->chip_class = sscreen->info.chip_class;
+
+ if (sctx->chip_class == GFX7 || sctx->chip_class == GFX8 || sctx->chip_class == GFX9) {
+ sctx->eop_bug_scratch = si_resource(pipe_buffer_create(
+ &sscreen->b, 0, PIPE_USAGE_DEFAULT, 16 * sscreen->info.num_render_backends));
+ if (!sctx->eop_bug_scratch)
+ goto fail;
+ }
+
+ /* Initialize context allocators. */
+ sctx->allocator_zeroed_memory =
+ u_suballocator_create(&sctx->b, 128 * 1024, 0, PIPE_USAGE_DEFAULT,
+ SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_CLEAR, false);
+ if (!sctx->allocator_zeroed_memory)
+ goto fail;
+
+ sctx->b.stream_uploader =
+ u_upload_create(&sctx->b, 1024 * 1024, 0, PIPE_USAGE_STREAM, SI_RESOURCE_FLAG_READ_ONLY);
+ if (!sctx->b.stream_uploader)
+ goto fail;
+
+ sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024, 0, PIPE_USAGE_STAGING, 0);
+ if (!sctx->cached_gtt_allocator)
+ goto fail;
+
+ sctx->ctx = sctx->ws->ctx_create(sctx->ws);
+ if (!sctx->ctx)
+ goto fail;
+
+ if (sscreen->info.num_rings[RING_DMA] && !(sscreen->debug_flags & DBG(NO_SDMA)) &&
+ /* SDMA causes corruption on RX 580:
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/1399
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/1889
+ */
+ (sctx->chip_class != GFX8 || sscreen->debug_flags & DBG(FORCE_SDMA)) &&
+ /* SDMA causes corruption on gfx9 APUs:
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2814
+ *
+ * While we could keep buffer copies and clears enabled, let's disable
+ * everything, because neither gfx8 nor gfx10 enable SDMA, and it's not
+ * easy to test.
+ */
+ (sctx->chip_class != GFX9 || sscreen->debug_flags & DBG(FORCE_SDMA)) &&
+ /* SDMA sometimes times out on gfx10, so disable it for now. See:
+ * https://bugs.freedesktop.org/show_bug.cgi?id=111481
+ * https://gitlab.freedesktop.org/mesa/mesa/-/issues/1907
+ */
+ (sctx->chip_class != GFX10 || sscreen->debug_flags & DBG(FORCE_SDMA))) {
+ sctx->sdma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA, (void *)si_flush_dma_cs, sctx,
+ stop_exec_on_failure);
+ }
+
+ bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs;
+ sctx->b.const_uploader =
+ u_upload_create(&sctx->b, 256 * 1024, 0, PIPE_USAGE_DEFAULT,
+ SI_RESOURCE_FLAG_32BIT |
+ (use_sdma_upload ? SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
+ if (!sctx->b.const_uploader)
+ goto fail;
+
+ if (use_sdma_upload)
+ u_upload_enable_flush_explicit(sctx->b.const_uploader);
+
+ sctx->gfx_cs = ws->cs_create(sctx->ctx, sctx->has_graphics ? RING_GFX : RING_COMPUTE,
+ (void *)si_flush_gfx_cs, sctx, stop_exec_on_failure);
+
+ /* Border colors. */
+ sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table));
+ if (!sctx->border_color_table)
+ goto fail;
+
+ sctx->border_color_buffer = si_resource(pipe_buffer_create(
+ screen, 0, PIPE_USAGE_DEFAULT, SI_MAX_BORDER_COLORS * sizeof(*sctx->border_color_table)));
+ if (!sctx->border_color_buffer)
+ goto fail;
+
+ sctx->border_color_map =
+ ws->buffer_map(sctx->border_color_buffer->buf, NULL, PIPE_TRANSFER_WRITE);
+ if (!sctx->border_color_map)
+ goto fail;
+
+ sctx->ngg = sscreen->use_ngg;
+
+ /* Initialize context functions used by graphics and compute. */
+ if (sctx->chip_class >= GFX10)
+ sctx->emit_cache_flush = gfx10_emit_cache_flush;
+ else
+ sctx->emit_cache_flush = si_emit_cache_flush;
+
+ sctx->b.emit_string_marker = si_emit_string_marker;
+ sctx->b.set_debug_callback = si_set_debug_callback;
+ sctx->b.set_log_context = si_set_log_context;
+ sctx->b.set_context_param = si_set_context_param;
+ sctx->b.get_device_reset_status = si_get_reset_status;
+ sctx->b.set_device_reset_callback = si_set_device_reset_callback;
+
+ si_init_all_descriptors(sctx);
+ si_init_buffer_functions(sctx);
+ si_init_clear_functions(sctx);
+ si_init_blit_functions(sctx);
+ si_init_compute_functions(sctx);
+ si_init_compute_blit_functions(sctx);
+ si_init_debug_functions(sctx);
+ si_init_fence_functions(sctx);
+ si_init_query_functions(sctx);
+ si_init_state_compute_functions(sctx);
+ si_init_context_texture_functions(sctx);
+
+ /* Initialize graphics-only context functions. */
+ if (sctx->has_graphics) {
+ if (sctx->chip_class >= GFX10)
+ gfx10_init_query(sctx);
+ si_init_msaa_functions(sctx);
+ si_init_shader_functions(sctx);
+ si_init_state_functions(sctx);
+ si_init_streamout_functions(sctx);
+ si_init_viewport_functions(sctx);
+
+ sctx->blitter = util_blitter_create(&sctx->b);
+ if (sctx->blitter == NULL)
+ goto fail;
+ sctx->blitter->skip_viewport_restore = true;
+
+ /* Some states are expected to be always non-NULL. */
+ sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter);
+ sctx->queued.named.blend = sctx->noop_blend;
+
+ sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter);
+ sctx->queued.named.dsa = sctx->noop_dsa;
+
+ sctx->discard_rasterizer_state = util_blitter_get_discard_rasterizer_state(sctx->blitter);
+ sctx->queued.named.rasterizer = sctx->discard_rasterizer_state;
+
+ si_init_draw_functions(sctx);
+
+ /* If aux_context == NULL, we are initializing aux_context right now. */
+ bool is_aux_context = !sscreen->aux_context;
+ si_initialize_prim_discard_tunables(sscreen, is_aux_context,
+ &sctx->prim_discard_vertex_count_threshold,
+ &sctx->index_ring_size_per_ib);
+ }
+
+ /* Initialize SDMA functions. */
+ if (sctx->chip_class >= GFX7)
+ cik_init_sdma_functions(sctx);
+ else
+ sctx->dma_copy = si_resource_copy_region;
+
+ if (sscreen->debug_flags & DBG(FORCE_SDMA))
+ sctx->b.resource_copy_region = sctx->dma_copy;
+
+ sctx->sample_mask = 0xffff;
+
+ /* Initialize multimedia functions. */
+ if (sscreen->info.has_hw_decode) {
+ sctx->b.create_video_codec = si_uvd_create_decoder;
+ sctx->b.create_video_buffer = si_video_buffer_create;
+ } else {
+ sctx->b.create_video_codec = vl_create_decoder;
+ sctx->b.create_video_buffer = vl_video_buffer_create;
+ }
+
+ if (sctx->chip_class >= GFX9 || si_compute_prim_discard_enabled(sctx)) {
+ sctx->wait_mem_scratch =
+ si_aligned_buffer_create(screen, SI_RESOURCE_FLAG_UNMAPPABLE,
+ PIPE_USAGE_DEFAULT, 8,
+ sscreen->info.tcc_cache_line_size);
+ if (!sctx->wait_mem_scratch)
+ goto fail;
+ }
+
+ /* GFX7 cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
+ * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
+ if (sctx->chip_class == GFX7) {
+ sctx->null_const_buf.buffer =
+ pipe_aligned_buffer_create(screen, SI_RESOURCE_FLAG_32BIT, PIPE_USAGE_DEFAULT, 16,
+ sctx->screen->info.tcc_cache_line_size);
+ if (!sctx->null_const_buf.buffer)
+ goto fail;
+ sctx->null_const_buf.buffer_size = sctx->null_const_buf.buffer->width0;
+
+ unsigned start_shader = sctx->has_graphics ? 0 : PIPE_SHADER_COMPUTE;
+ for (shader = start_shader; shader < SI_NUM_SHADERS; shader++) {
+ for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) {
+ sctx->b.set_constant_buffer(&sctx->b, shader, i, &sctx->null_const_buf);
+ }
+ }
+
+ si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &sctx->null_const_buf);
+ si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &sctx->null_const_buf);
+ si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &sctx->null_const_buf);
+ si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &sctx->null_const_buf);
+ si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf);
+ }
+
+ uint64_t max_threads_per_block;
+ screen->get_compute_param(screen, PIPE_SHADER_IR_NIR, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
+ &max_threads_per_block);
+
+ /* The maximum number of scratch waves. Scratch space isn't divided
+ * evenly between CUs. The number is only a function of the number of CUs.
+ * We can decrease the constant to decrease the scratch buffer size.
+ *
+ * sctx->scratch_waves must be >= the maximum possible size of
+ * 1 threadgroup, so that the hw doesn't hang from being unable
+ * to start any.
+ *
+ * The recommended value is 4 per CU at most. Higher numbers don't
+ * bring much benefit, but they still occupy chip resources (think
+ * async compute). I've seen ~2% performance difference between 4 and 32.
+ */
+ sctx->scratch_waves =
+ MAX2(32 * sscreen->info.num_good_compute_units, max_threads_per_block / 64);
+
+ /* Bindless handles. */
+ sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+ sctx->img_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+
+ util_dynarray_init(&sctx->resident_tex_handles, NULL);
+ util_dynarray_init(&sctx->resident_img_handles, NULL);
+ util_dynarray_init(&sctx->resident_tex_needs_color_decompress, NULL);
+ util_dynarray_init(&sctx->resident_img_needs_color_decompress, NULL);
+ util_dynarray_init(&sctx->resident_tex_needs_depth_decompress, NULL);
+
+ sctx->sample_pos_buffer =
+ pipe_buffer_create(sctx->b.screen, 0, PIPE_USAGE_DEFAULT, sizeof(sctx->sample_positions));
+ pipe_buffer_write(&sctx->b, sctx->sample_pos_buffer, 0, sizeof(sctx->sample_positions),
+ &sctx->sample_positions);
+
+ /* The remainder of this function initializes the gfx CS and must be last. */
+ assert(sctx->gfx_cs->current.cdw == 0);
+
+ if (sctx->has_graphics) {
+ si_init_cp_reg_shadowing(sctx);
+ }
+
+ si_begin_new_gfx_cs(sctx, true);
+ assert(sctx->gfx_cs->current.cdw == sctx->initial_gfx_cs_size);
+
+ /* Initialize per-context buffers. */
+ if (sctx->wait_mem_scratch) {
+ si_cp_write_data(sctx, sctx->wait_mem_scratch, 0, 4, V_370_MEM, V_370_ME,
+ &sctx->wait_mem_number);
+ }
+
+ if (sctx->chip_class == GFX7) {
+ /* Clear the NULL constant buffer, because loads should return zeros.
+ * Note that this forces CP DMA to be used, because clover deadlocks
+ * for some reason when the compute codepath is used.
+ */
+ uint32_t clear_value = 0;
+ si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0,
+ &clear_value, 4, SI_COHERENCY_SHADER, true);
+ }
+
+ sctx->initial_gfx_cs_size = sctx->gfx_cs->current.cdw;
+ return &sctx->b;