X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_pipe.h;h=d32feab52c2614bb99dbc22895855c48f6b7db41;hb=c4c17ab3ec1d67b0f2fd9816681378bdc8efe220;hp=b6ef60cbe3eb561f77f7297ed4ebdc8b9d442423;hpb=166250f4e5486e1e44ed97a8ab2ee0691e41cfa1;p=mesa.git diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index b6ef60cbe3e..d32feab52c2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -39,7 +39,7 @@ #endif #define ATI_VENDOR_ID 0x1002 - +#define SI_PRIM_DISCARD_DEBUG 0 #define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick @@ -47,11 +47,18 @@ * the number shouldn't be a commonly-used one. */ #define SI_BASE_VERTEX_UNKNOWN INT_MIN #define SI_RESTART_INDEX_UNKNOWN INT_MIN +#define SI_INSTANCE_COUNT_UNKNOWN INT_MIN #define SI_NUM_SMOOTH_AA_SAMPLES 8 +#define SI_MAX_POINT_SIZE 2048 #define SI_GS_PER_ES 128 /* Alignment for optimal CP DMA performance. */ #define SI_CPDMA_ALIGNMENT 32 +/* Tunables for compute-based clear_buffer and copy_buffer: */ +#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4 +#define SI_COMPUTE_COPY_DW_PER_THREAD 4 +#define SI_COMPUTE_DST_CACHE_POLICY L2_STREAM + /* Pipeline & streamout query controls. */ #define SI_CONTEXT_START_PIPELINE_STATS (1 << 0) #define SI_CONTEXT_STOP_PIPELINE_STATS (1 << 1) @@ -65,7 +72,7 @@ /* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */ #define SI_CONTEXT_INV_GLOBAL_L2 (1 << 6) /* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't - * invalidate L2. SI-CIK can't do it, so they will do complete invalidation. */ + * invalidate L2. GFX6-GFX7 can't do it, so they will do complete invalidation. */ #define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (1 << 7) /* Writeback & invalidate the L2 metadata cache. It can only be coupled with * a CB or DB flush. */ @@ -97,11 +104,26 @@ #define SI_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) #define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) -#define SI_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) +#define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) #define SI_RESOURCE_FLAG_DISABLE_DCC (PIPE_RESOURCE_FLAG_DRV_PRIV << 3) #define SI_RESOURCE_FLAG_UNMAPPABLE (PIPE_RESOURCE_FLAG_DRV_PRIV << 4) #define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5) #define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6) +#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7) +/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */ +#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8) + +enum si_clear_code +{ + DCC_CLEAR_COLOR_0000 = 0x00000000, + DCC_CLEAR_COLOR_0001 = 0x40404040, + DCC_CLEAR_COLOR_1110 = 0x80808080, + DCC_CLEAR_COLOR_1111 = 0xC0C0C0C0, + DCC_CLEAR_COLOR_REG = 0x20202020, + DCC_UNCOMPRESSED = 0xFFFFFFFF, +}; + +#define SI_IMAGE_ACCESS_AS_BUFFER (1 << 7) /* Debug flags. */ enum { @@ -121,10 +143,10 @@ enum { DBG_FS_CORRECT_DERIVS_AFTER_KILL, DBG_UNSAFE_MATH, DBG_SI_SCHED, + DBG_GISEL, /* Shader compiler options (with no effect on the shader cache): */ DBG_CHECK_IR, - DBG_NIR, DBG_MONOLITHIC_SHADERS, DBG_NO_OPT_VARIANT, @@ -143,6 +165,9 @@ enum { DBG_ZERO_VRAM, /* 3D engine options: */ + DBG_ALWAYS_PD, + DBG_PD, + DBG_NO_PD, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, @@ -164,19 +189,37 @@ enum { DBG_TEST_VMFAULT_CP, DBG_TEST_VMFAULT_SDMA, DBG_TEST_VMFAULT_SHADER, + DBG_TEST_DMA_PERF, + DBG_TEST_GDS, + DBG_TEST_GDS_MM, + DBG_TEST_GDS_OA_MM, }; #define DBG_ALL_SHADERS (((1 << (DBG_CS + 1)) - 1)) #define DBG(name) (1ull << DBG_##name) +enum si_cache_policy { + L2_BYPASS, + L2_STREAM, /* same as SLC=1 */ + L2_LRU, /* same as SLC=0 */ +}; + +enum si_coherency { + SI_COHERENCY_NONE, /* no cache flushes needed */ + SI_COHERENCY_SHADER, + SI_COHERENCY_CB_META, + SI_COHERENCY_CP, +}; + struct si_compute; +struct si_shader_context; struct hash_table; struct u_suballocator; /* Only 32-bit buffer allocations are supported, gallium doesn't support more * at the moment. */ -struct r600_resource { +struct si_resource { struct threaded_resource b; /* Winsys objects. */ @@ -224,35 +267,39 @@ struct r600_resource { unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ }; -struct r600_transfer { +struct si_transfer { struct threaded_transfer b; - struct r600_resource *staging; + struct si_resource *staging; unsigned offset; }; -struct r600_cmask_info { - uint64_t offset; - uint64_t base_address_reg; - uint32_t size; -}; - struct si_texture { - struct r600_resource buffer; + struct si_resource buffer; struct radeon_surf surface; uint64_t size; struct si_texture *flushed_depth_texture; - /* Colorbuffer compression and fast clear. */ + /* One texture allocation can contain these buffers: + * - image (pixel data) + * - FMASK buffer (MSAA compression) + * - CMASK buffer (MSAA compression and/or legacy fast color clear) + * - HTILE buffer (Z/S compression and fast Z/S clear) + * - DCC buffer (color compression and new fast color clear) + * - displayable DCC buffer (if the DCC buffer is not displayable) + * - DCC retile mapping buffer (if the DCC buffer is not displayable) + */ uint64_t fmask_offset; - struct r600_cmask_info cmask; - struct r600_resource *cmask_buffer; + uint64_t cmask_offset; + uint64_t cmask_base_address_reg; + struct si_resource *cmask_buffer; uint64_t dcc_offset; /* 0 = disabled */ + uint64_t display_dcc_offset; + uint64_t dcc_retile_map_offset; unsigned cb_color_info; /* fast clear enable bit */ unsigned color_clear_value[2]; unsigned last_msaa_resolve_target_micro_mode; unsigned num_level0_transfers; - unsigned num_color_samples; /* Depth buffer compression and fast clear. */ uint64_t htile_offset; @@ -290,9 +337,9 @@ struct si_texture { * target == 2D and last_level == 0. If enabled, dcc_offset contains * the absolute GPUVM address, not the relative one. */ - struct r600_resource *dcc_separate_buffer; + struct si_resource *dcc_separate_buffer; /* When DCC is temporarily disabled, the separate buffer is here. */ - struct r600_resource *last_dcc_separate_buffer; + struct si_resource *last_dcc_separate_buffer; /* Estimate of how much this color buffer is written to in units of * full-screen draws: ps_invocations / (width * height) * Shader kills, late Z, and blending with trivial discards make it @@ -323,7 +370,7 @@ struct si_surface { unsigned cb_color_view; unsigned cb_color_attrib; unsigned cb_color_attrib2; /* GFX9 and later */ - unsigned cb_dcc_control; /* VI and later */ + unsigned cb_dcc_control; /* GFX8 and later */ unsigned spi_shader_col_format:8; /* no blending, no alpha-to-coverage. */ unsigned spi_shader_col_format_alpha:8; /* alpha-to-coverage */ unsigned spi_shader_col_format_blend:8; /* blending without alpha. */ @@ -408,6 +455,9 @@ struct si_screen { uint64_t debug_flags; char renderer_string[183]; + unsigned pa_sc_raster_config; + unsigned pa_sc_raster_config_1; + unsigned se_tile_repeat; unsigned gs_table_depth; unsigned tess_offchip_block_dw_size; unsigned tess_offchip_ring_size; @@ -422,13 +472,19 @@ struct si_screen { bool has_out_of_order_rast; bool assume_no_z_fights; bool commutative_blend_add; - bool clear_db_cache_before_clear; + bool has_gfx9_scissor_bug; bool has_msaa_sample_loc_bug; bool has_ls_vgpr_init_bug; + bool has_dcc_constant_encode; bool dpbb_allowed; bool dfsm_allowed; bool llvm_has_working_vgpr_indexing; + struct { +#define OPT_BOOL(name, dflt, description) bool name:1; +#include "si_debug_options.h" + } options; + /* Whether shaders are monolithic (1-part) or separate (3-part). */ bool use_monolithic_shaders; bool record_llvm_ir; @@ -474,6 +530,7 @@ struct si_screen { * the counter before drawing and re-emit the states accordingly. */ unsigned dirty_tex_counter; + unsigned dirty_buf_counter; /* Atomically increment this counter when an existing texture's * metadata is enabled or disabled in a way that requires changing @@ -520,12 +577,12 @@ struct si_screen { /* Use at most 3 normal compiler threads on quadcore and better. * Hyperthreaded CPUs report the number of threads, but we want * the number of cores. We only need this many threads for shader-db. */ - struct si_compiler compiler[24]; /* used by the queue only */ + struct ac_llvm_compiler compiler[24]; /* used by the queue only */ struct util_queue shader_compiler_queue_low_priority; /* Use at most 2 low priority threads on quadcore and better. * We want to minimize the impact on multithreaded Mesa. */ - struct si_compiler compiler_lowp[10]; + struct ac_llvm_compiler compiler_lowp[10]; }; struct si_blend_color { @@ -550,7 +607,7 @@ struct si_sampler_view { #define SI_SAMPLER_STATE_MAGIC 0x34f1c35a struct si_sampler_state { -#ifdef DEBUG +#ifndef NDEBUG unsigned magic; #endif uint32_t val[4]; @@ -597,10 +654,19 @@ struct si_framebuffer { ubyte color_is_int8; ubyte color_is_int10; ubyte dirty_cbufs; + ubyte dcc_overwrite_combiner_watermark; bool dirty_zsbuf; bool any_dst_linear; bool CB_has_shader_readable_metadata; bool DB_has_shader_readable_metadata; + bool all_DCC_pipe_aligned; +}; + +enum si_quant_mode { + /* This is the list we want to support. */ + SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH, + SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH, + SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH, }; struct si_signed_scissor { @@ -608,18 +674,13 @@ struct si_signed_scissor { int miny; int maxx; int maxy; -}; - -struct si_scissors { - unsigned dirty_mask; - struct pipe_scissor_state states[SI_MAX_VIEWPORTS]; + enum si_quant_mode quant_mode; }; struct si_viewports { - unsigned dirty_mask; - unsigned depth_range_dirty_mask; struct pipe_viewport_state states[SI_MAX_VIEWPORTS]; struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS]; + bool y_inverted; }; struct si_clip_state { @@ -631,7 +692,7 @@ struct si_streamout_target { struct pipe_stream_output_target b; /* The buffer where BUFFER_FILLED_SIZE is stored. */ - struct r600_resource *buf_filled_size; + struct si_resource *buf_filled_size; unsigned buf_filled_size_offset; bool buf_filled_size_valid; @@ -725,14 +786,24 @@ struct si_saved_cs { struct pipe_reference reference; struct si_context *ctx; struct radeon_saved_cs gfx; - struct r600_resource *trace_buf; + struct radeon_saved_cs compute; + struct si_resource *trace_buf; unsigned trace_id; unsigned gfx_last_dw; + unsigned compute_last_dw; bool flushed; int64_t time_flush; }; +struct si_sdma_upload { + struct si_resource *dst; + struct si_resource *src; + unsigned src_offset; + unsigned dst_offset; + unsigned size; +}; + struct si_context { struct pipe_context b; /* base class */ @@ -741,11 +812,11 @@ struct si_context { struct radeon_winsys *ws; struct radeon_winsys_ctx *ctx; - struct radeon_cmdbuf *gfx_cs; + struct radeon_cmdbuf *gfx_cs; /* compute IB if graphics is disabled */ struct radeon_cmdbuf *dma_cs; struct pipe_fence_handle *last_gfx_fence; struct pipe_fence_handle *last_sdma_fence; - struct r600_resource *eop_bug_scratch; + struct si_resource *eop_bug_scratch; struct u_upload_mgr *cached_gtt_allocator; struct threaded_context *tc; struct u_suballocator *allocator_zeroed_memory; @@ -765,22 +836,31 @@ struct si_context { void *vs_blit_color; void *vs_blit_color_layered; void *vs_blit_texcoord; + void *cs_clear_buffer; + void *cs_copy_buffer; + void *cs_copy_image; + void *cs_copy_image_1d_array; + void *cs_clear_render_target; + void *cs_clear_render_target_1d_array; + void *cs_dcc_retile; struct si_screen *screen; struct pipe_debug_callback debug; - struct si_compiler compiler; /* only non-threaded compilation */ + struct ac_llvm_compiler compiler; /* only non-threaded compilation */ struct si_shader_ctx_state fixed_func_tcs_shader; - struct r600_resource *wait_mem_scratch; + /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */ + struct si_resource *wait_mem_scratch; unsigned wait_mem_number; uint16_t prefetch_L2_mask; + bool has_graphics; bool gfx_flush_in_progress:1; bool gfx_last_ib_is_busy:1; bool compute_is_busy:1; unsigned num_gfx_cs_flushes; unsigned initial_gfx_cs_size; - unsigned gpu_reset_counter; unsigned last_dirty_tex_counter; + unsigned last_dirty_buf_counter; unsigned last_compressed_colortex_counter; unsigned last_num_draw_calls; unsigned flags; /* flush flags */ @@ -788,6 +868,31 @@ struct si_context { uint64_t vram; uint64_t gtt; + /* Compute-based primitive discard. */ + unsigned prim_discard_vertex_count_threshold; + struct pb_buffer *gds; + struct pb_buffer *gds_oa; + struct radeon_cmdbuf *prim_discard_compute_cs; + unsigned compute_gds_offset; + struct si_shader *compute_ib_last_shader; + uint32_t compute_rewind_va; + unsigned compute_num_prims_in_batch; + bool preserve_prim_restart_gds_at_flush; + /* index_ring is divided into 2 halves for doublebuffering. */ + struct si_resource *index_ring; + unsigned index_ring_base; /* offset of a per-IB portion */ + unsigned index_ring_offset; /* offset within a per-IB portion */ + unsigned index_ring_size_per_ib; /* max available size per IB */ + bool prim_discard_compute_ib_initialized; + /* For tracking the last execution barrier - it can be either + * a WRITE_DATA packet or a fence. */ + uint32_t *last_pkt3_write_data; + struct si_resource *barrier_buf; + unsigned barrier_buf_offset; + struct pipe_fence_handle *last_ib_barrier_fence; + struct si_resource *last_ib_barrier_buf; + unsigned last_ib_barrier_buf_offset; + /* Atoms (direct states). */ union si_state_atoms atoms; unsigned dirty_atoms; /* mask */ @@ -805,9 +910,12 @@ struct si_context { struct si_clip_state clip_state; struct si_shader_data shader_pointers; struct si_stencil_ref stencil_ref; - struct si_scissors scissors; + struct pipe_scissor_state scissors[SI_MAX_VIEWPORTS]; struct si_streamout streamout; struct si_viewports viewports; + unsigned num_window_rectangles; + bool window_rectangles_include; + struct pipe_scissor_state window_rectangles[4]; /* Precomputed states. */ struct si_pm4_state *init_config; @@ -821,17 +929,19 @@ struct si_context { struct si_shader_ctx_state vs_shader; struct si_shader_ctx_state tcs_shader; struct si_shader_ctx_state tes_shader; + struct si_shader_ctx_state cs_prim_discard_state; struct si_cs_shader_state cs_shader_state; /* shader information */ struct si_vertex_elements *vertex_elements; unsigned sprite_coord_enable; + unsigned cs_max_waves_per_sh; bool flatshade; bool do_update_shaders; /* vertex buffer descriptors */ uint32_t *vb_descriptors_gpu_list; - struct r600_resource *vb_descriptors_buffer; + struct si_resource *vb_descriptors_buffer; unsigned vb_descriptors_offset; /* shader descriptors */ @@ -843,23 +953,28 @@ struct si_context { struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS]; struct si_samplers samplers[SI_NUM_SHADERS]; struct si_images images[SI_NUM_SHADERS]; + bool bo_list_add_all_resident_resources; + bool bo_list_add_all_gfx_resources; + bool bo_list_add_all_compute_resources; /* other shader resources */ - struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on CIK */ + struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */ struct pipe_resource *esgs_ring; struct pipe_resource *gsvs_ring; struct pipe_resource *tess_rings; union pipe_color_union *border_color_table; /* in CPU memory, any endian */ - struct r600_resource *border_color_buffer; + struct si_resource *border_color_buffer; union pipe_color_union *border_color_map; /* in VRAM (slow access), little endian */ unsigned border_color_count; unsigned num_vs_blit_sgprs; uint32_t vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD]; + uint32_t cs_user_data[4]; /* Vertex and index buffers. */ bool vertex_buffers_dirty; bool vertex_buffer_pointer_dirty; struct pipe_vertex_buffer vertex_buffer[SI_NUM_VERTEX_BUFFERS]; + uint16_t vertex_buffer_unaligned; /* bitmask of not dword-aligned buffers */ /* MSAA config state. */ int ps_iter_samples; @@ -883,9 +998,11 @@ struct si_context { /* Emitted draw state. */ bool gs_tri_strip_adj_fix:1; bool ls_vgpr_fix:1; + bool prim_discard_cs_instancing:1; int last_index_size; int last_base_vertex; int last_start_instance; + int last_instance_count; int last_drawid; int last_sh_base_reg; int last_primitive_restart_en; @@ -899,11 +1016,11 @@ struct si_context { enum pipe_prim_type current_rast_prim; /* primitive type after TES, GS */ /* Scratch buffer */ - struct r600_resource *scratch_buffer; + struct si_resource *scratch_buffer; unsigned scratch_waves; unsigned spi_tmpring_size; - struct r600_resource *compute_scratch_buffer; + struct si_resource *compute_scratch_buffer; /* Emitted derived tessellation state. */ /* Local shader (VS), or HS if LS-HS are merged. */ @@ -960,11 +1077,14 @@ struct si_context { /* MSAA sample locations. * The first index is the sample index. * The second index is the coordinate: X, Y. */ - float sample_locations_1x[1][2]; - float sample_locations_2x[2][2]; - float sample_locations_4x[4][2]; - float sample_locations_8x[8][2]; - float sample_locations_16x[16][2]; + struct { + float x1[1][2]; + float x2[2][2]; + float x4[4][2]; + float x8[8][2]; + float x16[16][2]; + } sample_positions; + struct pipe_resource *sample_pos_buffer; /* Misc stats. */ unsigned num_draw_calls; @@ -986,11 +1106,16 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ + unsigned compute_num_verts_accepted; + unsigned compute_num_verts_rejected; + unsigned compute_num_verts_ineligible; /* due to low vertex count */ + unsigned context_roll; /* Queries. */ /* Maintain the list of active queries for pausing between IBs. */ int num_occlusion_queries; int num_perfect_occlusion_queries; + int num_pipeline_stat_queries; struct list_head active_queries; unsigned num_cs_dw_queries_suspend; @@ -1000,6 +1125,12 @@ struct si_context { bool render_cond_invert; bool render_cond_force_off; /* for u_blitter */ + /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */ + bool sdma_uploads_in_progress; + struct si_sdma_upload *sdma_uploads; + unsigned num_sdma_uploads; + unsigned max_sdma_uploads; + /* Statistics gathering for the DCC enablement heuristic. It can't be * in si_texture because si_texture can be shared by multiple * contexts. This is for back buffers only. We shouldn't get too many @@ -1028,9 +1159,6 @@ struct si_context { unsigned src_level, const struct pipe_box *src_box); - void (*dma_clear_buffer)(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value); - struct si_tracked_regs tracked_regs; }; @@ -1070,17 +1198,17 @@ bool si_rings_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf, enum radeon_bo_usage usage); void *si_buffer_map_sync_with_rings(struct si_context *sctx, - struct r600_resource *resource, + struct si_resource *resource, unsigned usage); void si_init_resource_fields(struct si_screen *sscreen, - struct r600_resource *res, + struct si_resource *res, uint64_t size, unsigned alignment); bool si_alloc_resource(struct si_screen *sscreen, - struct r600_resource *res); + struct si_resource *res); struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, unsigned usage, unsigned size, unsigned alignment); -struct r600_resource *si_aligned_buffer_create(struct pipe_screen *screen, +struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, unsigned usage, unsigned size, unsigned alignment); void si_replace_buffer_storage(struct pipe_context *ctx, @@ -1097,6 +1225,32 @@ void vi_dcc_clear_level(struct si_context *sctx, unsigned level, unsigned clear_value); void si_init_clear_functions(struct si_context *sctx); +/* si_compute_blit.c */ +unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, + enum si_cache_policy cache_policy); +void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, uint32_t *clear_value, + uint32_t clear_value_size, enum si_coherency coher, + bool force_cpdma); +void si_copy_buffer(struct si_context *sctx, + struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size); +void si_compute_copy_image(struct si_context *sctx, + struct pipe_resource *dst, + unsigned dst_level, + struct pipe_resource *src, + unsigned src_level, + unsigned dstx, unsigned dsty, unsigned dstz, + const struct pipe_box *src_box); +void si_compute_clear_render_target(struct pipe_context *ctx, + struct pipe_surface *dstsurf, + const union pipe_color_union *color, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height, + bool render_condition_enabled); +void si_retile_dcc(struct si_context *sctx, struct si_texture *tex); +void si_init_compute_blit_functions(struct si_context *sctx); + /* si_cp_dma.c */ #define SI_CPDMA_SKIP_CHECK_CS_SPACE (1 << 0) /* don't call need_cs_space */ #define SI_CPDMA_SKIP_SYNC_AFTER (1 << 1) /* don't wait for DMA after the copy */ @@ -1109,24 +1263,26 @@ void si_init_clear_functions(struct si_context *sctx); SI_CPDMA_SKIP_GFX_SYNC | \ SI_CPDMA_SKIP_BO_LIST_UPDATE) -enum si_coherency { - SI_COHERENCY_NONE, /* no cache flushes needed */ - SI_COHERENCY_SHADER, - SI_COHERENCY_CB_META, -}; - void si_cp_dma_wait_for_idle(struct si_context *sctx); -void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value, - enum si_coherency coher); -void si_copy_buffer(struct si_context *sctx, - struct pipe_resource *dst, struct pipe_resource *src, - uint64_t dst_offset, uint64_t src_offset, unsigned size, - unsigned user_flags); +void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, + struct pipe_resource *dst, uint64_t offset, + uint64_t size, unsigned value, unsigned user_flags, + enum si_coherency coher, enum si_cache_policy cache_policy); +void si_cp_dma_copy_buffer(struct si_context *sctx, + struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size, + unsigned user_flags, enum si_coherency coher, + enum si_cache_policy cache_policy); void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, unsigned size); void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only); -void si_init_cp_dma_functions(struct si_context *sctx); +void si_test_gds(struct si_context *sctx); +void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, + unsigned offset, unsigned size, unsigned dst_sel, + unsigned engine, const void *data); +void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, + unsigned dst_sel, struct si_resource *dst, unsigned dst_offset, + unsigned src_sel, struct si_resource *src, unsigned src_offset); /* si_debug.c */ void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, @@ -1140,35 +1296,38 @@ void si_log_compute_state(struct si_context *sctx, struct u_log_context *log); void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring); -bool si_replace_shader(unsigned num, struct ac_shader_binary *binary); +bool si_replace_shader(unsigned num, struct si_shader_binary *binary); /* si_dma.c */ void si_init_dma_functions(struct si_context *sctx); /* si_dma_cs.c */ +void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, + uint64_t offset); +void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned clear_value); void si_need_dma_space(struct si_context *ctx, unsigned num_dw, - struct r600_resource *dst, struct r600_resource *src); + struct si_resource *dst, struct si_resource *src); void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value); /* si_fence.c */ -void si_gfx_write_event_eop(struct si_context *ctx, - unsigned event, unsigned event_flags, - unsigned data_sel, - struct r600_resource *buf, uint64_t va, - uint32_t new_fence, unsigned query_type); -unsigned si_gfx_write_fence_dwords(struct si_screen *screen); -void si_gfx_wait_fence(struct si_context *ctx, - uint64_t va, uint32_t ref, uint32_t mask); +void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, + unsigned event, unsigned event_flags, + unsigned dst_sel, unsigned int_sel, unsigned data_sel, + struct si_resource *buf, uint64_t va, + uint32_t new_fence, unsigned query_type); +unsigned si_cp_write_fence_dwords(struct si_screen *screen); +void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, + uint64_t va, uint32_t ref, uint32_t mask, unsigned flags); void si_init_fence_functions(struct si_context *ctx); void si_init_screen_fence_functions(struct si_screen *screen); struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, struct tc_unflushed_batch_token *tc_token); /* si_get.c */ -const char *si_get_family_name(const struct si_screen *sscreen); void si_init_screen_get_functions(struct si_screen *sscreen); /* si_gfx_cs.c */ @@ -1176,21 +1335,46 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_begin_new_gfx_cs(struct si_context *ctx); void si_need_gfx_cs_space(struct si_context *ctx); +void si_unref_sdma_uploads(struct si_context *sctx); -/* r600_gpu_load.c */ +/* si_gpu_load.c */ void si_gpu_load_kill_thread(struct si_screen *sscreen); uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type); unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin); /* si_compute.c */ +void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); +unsigned si_get_compute_resource_limits(struct si_screen *sscreen, + unsigned waves_per_threadgroup, + unsigned max_waves_per_sh, + unsigned threadgroups_per_cu); void si_init_compute_functions(struct si_context *sctx); -/* r600_perfcounters.c */ -void si_perfcounters_destroy(struct si_screen *sscreen); +/* si_compute_prim_discard.c */ +enum si_prim_discard_outcome { + SI_PRIM_DISCARD_ENABLED, + SI_PRIM_DISCARD_DISABLED, + SI_PRIM_DISCARD_DRAW_SPLIT, +}; + +void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); +enum si_prim_discard_outcome +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, + const struct pipe_draw_info *info, + bool primitive_restart); +void si_compute_signal_gfx(struct si_context *sctx); +void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned index_size, + unsigned base_vertex, + uint64_t input_indexbuf_va, + unsigned input_indexbuf_max_elements); +void si_initialize_prim_discard_tunables(struct si_context *sctx); /* si_perfcounters.c */ void si_init_perfcounters(struct si_screen *screen); +void si_destroy_perfcounters(struct si_screen *screen); /* si_pipe.c */ bool si_check_device_reset(struct si_context *sctx); @@ -1201,9 +1385,26 @@ void si_init_query_functions(struct si_context *sctx); void si_suspend_queries(struct si_context *sctx); void si_resume_queries(struct si_context *sctx); +/* si_shaderlib_tgsi.c */ +void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, + unsigned num_layers); +void *si_create_fixed_func_tcs(struct si_context *sctx); +void *si_create_dma_compute_shader(struct pipe_context *ctx, + unsigned num_dwords_per_thread, + bool dst_stream_cache_policy, bool is_copy); +void *si_create_copy_image_compute_shader(struct pipe_context *ctx); +void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx); +void *si_clear_render_target_shader(struct pipe_context *ctx); +void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx); +void *si_create_dcc_retile_cs(struct pipe_context *ctx); +void *si_create_query_result_cs(struct si_context *sctx); + /* si_test_dma.c */ void si_test_dma(struct si_screen *sscreen); +/* si_test_clearbuffer.c */ +void si_test_dma_perf(struct si_screen *sscreen); + /* si_uvd.c */ struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context, const struct pipe_video_codec *templ); @@ -1267,13 +1468,13 @@ void si_init_context_texture_functions(struct si_context *sctx); * common helpers */ -static inline struct r600_resource *r600_resource(struct pipe_resource *r) +static inline struct si_resource *si_resource(struct pipe_resource *r) { - return (struct r600_resource*)r; + return (struct si_resource*)r; } static inline void -r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res) +si_resource_reference(struct si_resource **ptr, struct si_resource *res) { pipe_resource_reference((struct pipe_resource **)ptr, (struct pipe_resource *)res); @@ -1300,13 +1501,24 @@ si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil) return tex->surface.u.legacy.tiling_index[level]; } +static inline unsigned +si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx) +{ + /* Don't count the needed CS space exactly and just use an upper bound. + * + * Also reserve space for stopping queries at the end of IB, because + * the number of active queries is unlimited in theory. + */ + return 2048 + sctx->num_cs_dw_queries_suspend; +} + static inline void si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r) { if (r) { /* Add memory usage for need_gfx_cs_space */ - sctx->vram += r600_resource(r)->vram_usage; - sctx->gtt += r600_resource(r)->gart_usage; + sctx->vram += si_resource(r)->vram_usage; + sctx->gtt += si_resource(r)->gart_usage; } } @@ -1314,6 +1526,7 @@ static inline void si_invalidate_draw_sh_constants(struct si_context *sctx) { sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN; + sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN; } static inline unsigned @@ -1409,7 +1622,7 @@ si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src) static inline void si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, - bool shaders_read_metadata) + bool shaders_read_metadata, bool dcc_pipe_aligned) { sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_INV_VMEM_L1; @@ -1419,12 +1632,13 @@ si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples, * L2 metadata must be flushed if shaders read metadata. * (DCC, CMASK). */ - if (num_samples >= 2) + if (num_samples >= 2 || + (shaders_read_metadata && !dcc_pipe_aligned)) sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2; else if (shaders_read_metadata) sctx->flags |= SI_CONTEXT_INV_L2_METADATA; } else { - /* SI-CI-VI */ + /* GFX6-GFX8 */ sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2; } } @@ -1446,7 +1660,7 @@ si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, else if (shaders_read_metadata) sctx->flags |= SI_CONTEXT_INV_L2_METADATA; } else { - /* SI-CI-VI */ + /* GFX6-GFX8 */ sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2; } } @@ -1551,15 +1765,15 @@ radeon_cs_memory_below_limit(struct si_screen *screen, */ static inline void radeon_add_to_buffer_list(struct si_context *sctx, struct radeon_cmdbuf *cs, - struct r600_resource *rbo, + struct si_resource *bo, enum radeon_bo_usage usage, enum radeon_bo_priority priority) { assert(usage); sctx->ws->cs_add_buffer( - cs, rbo->buf, + cs, bo->buf, (enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED), - rbo->domains, priority); + bo->domains, priority); } /** @@ -1581,18 +1795,23 @@ static inline void radeon_add_to_buffer_list(struct si_context *sctx, */ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx, - struct r600_resource *rbo, + struct si_resource *bo, enum radeon_bo_usage usage, enum radeon_bo_priority priority, bool check_mem) { if (check_mem && !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs, - sctx->vram + rbo->vram_usage, - sctx->gtt + rbo->gart_usage)) + sctx->vram + bo->vram_usage, + sctx->gtt + bo->gart_usage)) si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, rbo, usage, priority); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority); +} + +static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) +{ + return sctx->prim_discard_vertex_count_threshold != UINT_MAX; } #define PRINT_ERR(fmt, args...) \