X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_pipe.h;h=d32feab52c2614bb99dbc22895855c48f6b7db41;hb=c4c17ab3ec1d67b0f2fd9816681378bdc8efe220;hp=b6ef60cbe3eb561f77f7297ed4ebdc8b9d442423;hpb=166250f4e5486e1e44ed97a8ab2ee0691e41cfa1;p=mesa.git

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b6ef60cbe3e..d32feab52c2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -39,7 +39,7 @@
 #endif
 
 #define ATI_VENDOR_ID			0x1002
-
+#define SI_PRIM_DISCARD_DEBUG		0
 #define SI_NOT_QUERY			0xffffffff
 
 /* The base vertex and primitive restart can be any number, but we must pick
@@ -47,11 +47,18 @@
  * the number shouldn't be a commonly-used one. */
 #define SI_BASE_VERTEX_UNKNOWN		INT_MIN
 #define SI_RESTART_INDEX_UNKNOWN	INT_MIN
+#define SI_INSTANCE_COUNT_UNKNOWN	INT_MIN
 #define SI_NUM_SMOOTH_AA_SAMPLES	8
+#define SI_MAX_POINT_SIZE		2048
 #define SI_GS_PER_ES			128
 /* Alignment for optimal CP DMA performance. */
 #define SI_CPDMA_ALIGNMENT		32
 
+/* Tunables for compute-based clear_buffer and copy_buffer: */
+#define SI_COMPUTE_CLEAR_DW_PER_THREAD	4
+#define SI_COMPUTE_COPY_DW_PER_THREAD	4
+#define SI_COMPUTE_DST_CACHE_POLICY	L2_STREAM
+
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS	(1 << 0)
 #define SI_CONTEXT_STOP_PIPELINE_STATS	(1 << 1)
@@ -65,7 +72,7 @@
 /* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
 #define SI_CONTEXT_INV_GLOBAL_L2	(1 << 6)
 /* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't
- * invalidate L2. SI-CIK can't do it, so they will do complete invalidation. */
+ * invalidate L2. GFX6-GFX7 can't do it, so they will do complete invalidation. */
 #define SI_CONTEXT_WRITEBACK_GLOBAL_L2	(1 << 7)
 /* Writeback & invalidate the L2 metadata cache. It can only be coupled with
  * a CB or DB flush. */
@@ -97,11 +104,26 @@
 
 #define SI_RESOURCE_FLAG_TRANSFER	(PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
 #define SI_RESOURCE_FLAG_FLUSHED_DEPTH	(PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
-#define SI_RESOURCE_FLAG_FORCE_TILING	(PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
+#define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
 #define SI_RESOURCE_FLAG_DISABLE_DCC	(PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
 #define SI_RESOURCE_FLAG_UNMAPPABLE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
 #define SI_RESOURCE_FLAG_READ_ONLY	(PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
 #define SI_RESOURCE_FLAG_32BIT		(PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
+#define SI_RESOURCE_FLAG_CLEAR		(PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
+/* For const_uploader: upload data via GTT, then copy it to VRAM via SDMA on context flush. */
+#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA  (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
+
+enum si_clear_code
+{
+	DCC_CLEAR_COLOR_0000   = 0x00000000,
+	DCC_CLEAR_COLOR_0001   = 0x40404040,
+	DCC_CLEAR_COLOR_1110   = 0x80808080,
+	DCC_CLEAR_COLOR_1111   = 0xC0C0C0C0,
+	DCC_CLEAR_COLOR_REG    = 0x20202020,
+	DCC_UNCOMPRESSED       = 0xFFFFFFFF,
+};
+
+#define SI_IMAGE_ACCESS_AS_BUFFER	(1 << 7)
 
 /* Debug flags. */
 enum {
@@ -121,10 +143,10 @@ enum {
 	DBG_FS_CORRECT_DERIVS_AFTER_KILL,
 	DBG_UNSAFE_MATH,
 	DBG_SI_SCHED,
+	DBG_GISEL,
 
 	/* Shader compiler options (with no effect on the shader cache): */
 	DBG_CHECK_IR,
-	DBG_NIR,
 	DBG_MONOLITHIC_SHADERS,
 	DBG_NO_OPT_VARIANT,
 
@@ -143,6 +165,9 @@ enum {
 	DBG_ZERO_VRAM,
 
 	/* 3D engine options: */
+	DBG_ALWAYS_PD,
+	DBG_PD,
+	DBG_NO_PD,
 	DBG_SWITCH_ON_EOP,
 	DBG_NO_OUT_OF_ORDER,
 	DBG_NO_DPBB,
@@ -164,19 +189,37 @@ enum {
 	DBG_TEST_VMFAULT_CP,
 	DBG_TEST_VMFAULT_SDMA,
 	DBG_TEST_VMFAULT_SHADER,
+	DBG_TEST_DMA_PERF,
+	DBG_TEST_GDS,
+	DBG_TEST_GDS_MM,
+	DBG_TEST_GDS_OA_MM,
 };
 
 #define DBG_ALL_SHADERS		(((1 << (DBG_CS + 1)) - 1))
 #define DBG(name)		(1ull << DBG_##name)
 
+enum si_cache_policy {
+	L2_BYPASS,
+	L2_STREAM, /* same as SLC=1 */
+	L2_LRU,    /* same as SLC=0 */
+};
+
+enum si_coherency {
+	SI_COHERENCY_NONE, /* no cache flushes needed */
+	SI_COHERENCY_SHADER,
+	SI_COHERENCY_CB_META,
+	SI_COHERENCY_CP,
+};
+
 struct si_compute;
+struct si_shader_context;
 struct hash_table;
 struct u_suballocator;
 
 /* Only 32-bit buffer allocations are supported, gallium doesn't support more
  * at the moment.
  */
-struct r600_resource {
+struct si_resource {
 	struct threaded_resource	b;
 
 	/* Winsys objects. */
@@ -224,35 +267,39 @@ struct r600_resource {
 	unsigned			external_usage; /* PIPE_HANDLE_USAGE_* */
 };
 
-struct r600_transfer {
+struct si_transfer {
 	struct threaded_transfer	b;
-	struct r600_resource		*staging;
+	struct si_resource		*staging;
 	unsigned			offset;
 };
 
-struct r600_cmask_info {
-	uint64_t offset;
-	uint64_t base_address_reg;
-	uint32_t size;
-};
-
 struct si_texture {
-	struct r600_resource		buffer;
+	struct si_resource		buffer;
 
 	struct radeon_surf		surface;
 	uint64_t			size;
 	struct si_texture		*flushed_depth_texture;
 
-	/* Colorbuffer compression and fast clear. */
+	/* One texture allocation can contain these buffers:
+	 * - image (pixel data)
+	 * - FMASK buffer (MSAA compression)
+	 * - CMASK buffer (MSAA compression and/or legacy fast color clear)
+	 * - HTILE buffer (Z/S compression and fast Z/S clear)
+	 * - DCC buffer (color compression and new fast color clear)
+	 * - displayable DCC buffer (if the DCC buffer is not displayable)
+	 * - DCC retile mapping buffer (if the DCC buffer is not displayable)
+	 */
 	uint64_t			fmask_offset;
-	struct r600_cmask_info		cmask;
-	struct r600_resource		*cmask_buffer;
+	uint64_t			cmask_offset;
+	uint64_t			cmask_base_address_reg;
+	struct si_resource		*cmask_buffer;
 	uint64_t			dcc_offset; /* 0 = disabled */
+	uint64_t			display_dcc_offset;
+	uint64_t			dcc_retile_map_offset;
 	unsigned			cb_color_info; /* fast clear enable bit */
 	unsigned			color_clear_value[2];
 	unsigned			last_msaa_resolve_target_micro_mode;
 	unsigned			num_level0_transfers;
-	unsigned			num_color_samples;
 
 	/* Depth buffer compression and fast clear. */
 	uint64_t			htile_offset;
@@ -290,9 +337,9 @@ struct si_texture {
 	 * target == 2D and last_level == 0. If enabled, dcc_offset contains
 	 * the absolute GPUVM address, not the relative one.
 	 */
-	struct r600_resource		*dcc_separate_buffer;
+	struct si_resource		*dcc_separate_buffer;
 	/* When DCC is temporarily disabled, the separate buffer is here. */
-	struct r600_resource		*last_dcc_separate_buffer;
+	struct si_resource		*last_dcc_separate_buffer;
 	/* Estimate of how much this color buffer is written to in units of
 	 * full-screen draws: ps_invocations / (width * height)
 	 * Shader kills, late Z, and blending with trivial discards make it
@@ -323,7 +370,7 @@ struct si_surface {
 	unsigned cb_color_view;
 	unsigned cb_color_attrib;
 	unsigned cb_color_attrib2;	/* GFX9 and later */
-	unsigned cb_dcc_control;	/* VI and later */
+	unsigned cb_dcc_control;	/* GFX8 and later */
 	unsigned spi_shader_col_format:8;	/* no blending, no alpha-to-coverage. */
 	unsigned spi_shader_col_format_alpha:8;	/* alpha-to-coverage */
 	unsigned spi_shader_col_format_blend:8;	/* blending without alpha. */
@@ -408,6 +455,9 @@ struct si_screen {
 	uint64_t			debug_flags;
 	char				renderer_string[183];
 
+	unsigned			pa_sc_raster_config;
+	unsigned			pa_sc_raster_config_1;
+	unsigned			se_tile_repeat;
 	unsigned			gs_table_depth;
 	unsigned			tess_offchip_block_dw_size;
 	unsigned			tess_offchip_ring_size;
@@ -422,13 +472,19 @@ struct si_screen {
 	bool				has_out_of_order_rast;
 	bool				assume_no_z_fights;
 	bool				commutative_blend_add;
-	bool				clear_db_cache_before_clear;
+	bool				has_gfx9_scissor_bug;
 	bool				has_msaa_sample_loc_bug;
 	bool				has_ls_vgpr_init_bug;
+	bool				has_dcc_constant_encode;
 	bool				dpbb_allowed;
 	bool				dfsm_allowed;
 	bool				llvm_has_working_vgpr_indexing;
 
+	struct {
+#define OPT_BOOL(name, dflt, description) bool name:1;
+#include "si_debug_options.h"
+	} options;
+
 	/* Whether shaders are monolithic (1-part) or separate (3-part). */
 	bool				use_monolithic_shaders;
 	bool				record_llvm_ir;
@@ -474,6 +530,7 @@ struct si_screen {
 	 * the counter before drawing and re-emit the states accordingly.
 	 */
 	unsigned			dirty_tex_counter;
+	unsigned			dirty_buf_counter;
 
 	/* Atomically increment this counter when an existing texture's
 	 * metadata is enabled or disabled in a way that requires changing
@@ -520,12 +577,12 @@ struct si_screen {
 	/* Use at most 3 normal compiler threads on quadcore and better.
 	 * Hyperthreaded CPUs report the number of threads, but we want
 	 * the number of cores. We only need this many threads for shader-db. */
-	struct si_compiler		compiler[24]; /* used by the queue only */
+	struct ac_llvm_compiler		compiler[24]; /* used by the queue only */
 
 	struct util_queue		shader_compiler_queue_low_priority;
 	/* Use at most 2 low priority threads on quadcore and better.
 	 * We want to minimize the impact on multithreaded Mesa. */
-	struct si_compiler		compiler_lowp[10];
+	struct ac_llvm_compiler		compiler_lowp[10];
 };
 
 struct si_blend_color {
@@ -550,7 +607,7 @@ struct si_sampler_view {
 #define SI_SAMPLER_STATE_MAGIC 0x34f1c35a
 
 struct si_sampler_state {
-#ifdef DEBUG
+#ifndef NDEBUG
 	unsigned			magic;
 #endif
 	uint32_t			val[4];
@@ -597,10 +654,19 @@ struct si_framebuffer {
 	ubyte				color_is_int8;
 	ubyte				color_is_int10;
 	ubyte				dirty_cbufs;
+	ubyte				dcc_overwrite_combiner_watermark;
 	bool				dirty_zsbuf;
 	bool				any_dst_linear;
 	bool				CB_has_shader_readable_metadata;
 	bool				DB_has_shader_readable_metadata;
+	bool				all_DCC_pipe_aligned;
+};
+
+enum si_quant_mode {
+	/* The quantization modes we want to support. */
+	SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH,
+	SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH,
+	SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH,
 };
 
 struct si_signed_scissor {
@@ -608,18 +674,13 @@ struct si_signed_scissor {
 	int miny;
 	int maxx;
 	int maxy;
-};
-
-struct si_scissors {
-	unsigned			dirty_mask;
-	struct pipe_scissor_state	states[SI_MAX_VIEWPORTS];
+	enum si_quant_mode quant_mode;
 };
 
 struct si_viewports {
-	unsigned			dirty_mask;
-	unsigned			depth_range_dirty_mask;
 	struct pipe_viewport_state	states[SI_MAX_VIEWPORTS];
 	struct si_signed_scissor	as_scissor[SI_MAX_VIEWPORTS];
+	bool				y_inverted;
 };
 
 struct si_clip_state {
@@ -631,7 +692,7 @@ struct si_streamout_target {
 	struct pipe_stream_output_target b;
 
 	/* The buffer where BUFFER_FILLED_SIZE is stored. */
-	struct r600_resource	*buf_filled_size;
+	struct si_resource	*buf_filled_size;
 	unsigned		buf_filled_size_offset;
 	bool			buf_filled_size_valid;
 
@@ -725,14 +786,24 @@ struct si_saved_cs {
 	struct pipe_reference	reference;
 	struct si_context	*ctx;
 	struct radeon_saved_cs	gfx;
-	struct r600_resource	*trace_buf;
+	struct radeon_saved_cs	compute;
+	struct si_resource	*trace_buf;
 	unsigned		trace_id;
 
 	unsigned		gfx_last_dw;
+	unsigned		compute_last_dw;
 	bool			flushed;
 	int64_t			time_flush;
 };
 
+struct si_sdma_upload {
+	struct si_resource	*dst;
+	struct si_resource	*src;
+	unsigned		src_offset;
+	unsigned		dst_offset;
+	unsigned		size;
+};
+
 struct si_context {
 	struct pipe_context		b; /* base class */
 
@@ -741,11 +812,11 @@ struct si_context {
 
 	struct radeon_winsys		*ws;
 	struct radeon_winsys_ctx	*ctx;
-	struct radeon_cmdbuf		*gfx_cs;
+	struct radeon_cmdbuf		*gfx_cs; /* compute IB if graphics is disabled */
 	struct radeon_cmdbuf		*dma_cs;
 	struct pipe_fence_handle	*last_gfx_fence;
 	struct pipe_fence_handle	*last_sdma_fence;
-	struct r600_resource		*eop_bug_scratch;
+	struct si_resource		*eop_bug_scratch;
 	struct u_upload_mgr		*cached_gtt_allocator;
 	struct threaded_context		*tc;
 	struct u_suballocator		*allocator_zeroed_memory;
@@ -765,22 +836,31 @@ struct si_context {
 	void				*vs_blit_color;
 	void				*vs_blit_color_layered;
 	void				*vs_blit_texcoord;
+	void				*cs_clear_buffer;
+	void				*cs_copy_buffer;
+	void				*cs_copy_image;
+	void				*cs_copy_image_1d_array;
+	void				*cs_clear_render_target;
+	void				*cs_clear_render_target_1d_array;
+	void				*cs_dcc_retile;
 	struct si_screen		*screen;
 	struct pipe_debug_callback	debug;
-	struct si_compiler		compiler; /* only non-threaded compilation */
+	struct ac_llvm_compiler		compiler; /* only non-threaded compilation */
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
-	struct r600_resource		*wait_mem_scratch;
+	/* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */
+	struct si_resource		*wait_mem_scratch;
 	unsigned			wait_mem_number;
 	uint16_t			prefetch_L2_mask;
 
+	bool				has_graphics;
 	bool				gfx_flush_in_progress:1;
 	bool				gfx_last_ib_is_busy:1;
 	bool				compute_is_busy:1;
 
 	unsigned			num_gfx_cs_flushes;
 	unsigned			initial_gfx_cs_size;
-	unsigned			gpu_reset_counter;
 	unsigned			last_dirty_tex_counter;
+	unsigned			last_dirty_buf_counter;
 	unsigned			last_compressed_colortex_counter;
 	unsigned			last_num_draw_calls;
 	unsigned			flags; /* flush flags */
@@ -788,6 +868,31 @@ struct si_context {
 	uint64_t			vram;
 	uint64_t			gtt;
 
+	/* Compute-based primitive discard. */
+	unsigned			prim_discard_vertex_count_threshold;
+	struct pb_buffer		*gds;
+	struct pb_buffer		*gds_oa;
+	struct radeon_cmdbuf		*prim_discard_compute_cs;
+	unsigned			compute_gds_offset;
+	struct si_shader		*compute_ib_last_shader;
+	uint32_t			compute_rewind_va;
+	unsigned			compute_num_prims_in_batch;
+	bool				preserve_prim_restart_gds_at_flush;
+	/* index_ring is divided into 2 halves for double buffering. */
+	struct si_resource		*index_ring;
+	unsigned			index_ring_base; /* offset of a per-IB portion */
+	unsigned			index_ring_offset; /* offset within a per-IB portion */
+	unsigned			index_ring_size_per_ib; /* max available size per IB */
+	bool				prim_discard_compute_ib_initialized;
+	/* For tracking the last execution barrier - it can be either
+	 * a WRITE_DATA packet or a fence. */
+	uint32_t			*last_pkt3_write_data;
+	struct si_resource		*barrier_buf;
+	unsigned			barrier_buf_offset;
+	struct pipe_fence_handle	*last_ib_barrier_fence;
+	struct si_resource		*last_ib_barrier_buf;
+	unsigned			last_ib_barrier_buf_offset;
+
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
 	unsigned			dirty_atoms; /* mask */
@@ -805,9 +910,12 @@ struct si_context {
 	struct si_clip_state		clip_state;
 	struct si_shader_data		shader_pointers;
 	struct si_stencil_ref		stencil_ref;
-	struct si_scissors		scissors;
+	struct pipe_scissor_state	scissors[SI_MAX_VIEWPORTS];
 	struct si_streamout		streamout;
 	struct si_viewports		viewports;
+	unsigned			num_window_rectangles;
+	bool				window_rectangles_include;
+	struct pipe_scissor_state	window_rectangles[4];
 
 	/* Precomputed states. */
 	struct si_pm4_state		*init_config;
@@ -821,17 +929,19 @@ struct si_context {
 	struct si_shader_ctx_state	vs_shader;
 	struct si_shader_ctx_state	tcs_shader;
 	struct si_shader_ctx_state	tes_shader;
+	struct si_shader_ctx_state	cs_prim_discard_state;
 	struct si_cs_shader_state	cs_shader_state;
 
 	/* shader information */
 	struct si_vertex_elements	*vertex_elements;
 	unsigned			sprite_coord_enable;
+	unsigned			cs_max_waves_per_sh;
 	bool				flatshade;
 	bool				do_update_shaders;
 
 	/* vertex buffer descriptors */
 	uint32_t *vb_descriptors_gpu_list;
-	struct r600_resource *vb_descriptors_buffer;
+	struct si_resource *vb_descriptors_buffer;
 	unsigned vb_descriptors_offset;
 
 	/* shader descriptors */
@@ -843,23 +953,28 @@ struct si_context {
 	struct si_buffer_resources	const_and_shader_buffers[SI_NUM_SHADERS];
 	struct si_samplers		samplers[SI_NUM_SHADERS];
 	struct si_images		images[SI_NUM_SHADERS];
+	bool				bo_list_add_all_resident_resources;
+	bool				bo_list_add_all_gfx_resources;
+	bool				bo_list_add_all_compute_resources;
 
 	/* other shader resources */
-	struct pipe_constant_buffer	null_const_buf; /* used for set_constant_buffer(NULL) on CIK */
+	struct pipe_constant_buffer	null_const_buf; /* used for set_constant_buffer(NULL) on GFX7 */
 	struct pipe_resource		*esgs_ring;
 	struct pipe_resource		*gsvs_ring;
 	struct pipe_resource		*tess_rings;
 	union pipe_color_union		*border_color_table; /* in CPU memory, any endian */
-	struct r600_resource		*border_color_buffer;
+	struct si_resource		*border_color_buffer;
 	union pipe_color_union		*border_color_map; /* in VRAM (slow access), little endian */
 	unsigned			border_color_count;
 	unsigned			num_vs_blit_sgprs;
 	uint32_t			vs_blit_sh_data[SI_VS_BLIT_SGPRS_POS_TEXCOORD];
+	uint32_t			cs_user_data[4];
 
 	/* Vertex and index buffers. */
 	bool				vertex_buffers_dirty;
 	bool				vertex_buffer_pointer_dirty;
 	struct pipe_vertex_buffer	vertex_buffer[SI_NUM_VERTEX_BUFFERS];
+	uint16_t			vertex_buffer_unaligned; /* bitmask of buffers that are not dword-aligned */
 
 	/* MSAA config state. */
 	int				ps_iter_samples;
@@ -883,9 +998,11 @@ struct si_context {
 	/* Emitted draw state. */
 	bool			gs_tri_strip_adj_fix:1;
 	bool			ls_vgpr_fix:1;
+	bool			prim_discard_cs_instancing:1;
 	int			last_index_size;
 	int			last_base_vertex;
 	int			last_start_instance;
+	int			last_instance_count;
 	int			last_drawid;
 	int			last_sh_base_reg;
 	int			last_primitive_restart_en;
@@ -899,11 +1016,11 @@ struct si_context {
 	enum pipe_prim_type	current_rast_prim; /* primitive type after TES, GS */
 
 	/* Scratch buffer */
-	struct r600_resource	*scratch_buffer;
+	struct si_resource	*scratch_buffer;
 	unsigned		scratch_waves;
 	unsigned		spi_tmpring_size;
 
-	struct r600_resource	*compute_scratch_buffer;
+	struct si_resource	*compute_scratch_buffer;
 
 	/* Emitted derived tessellation state. */
 	/* Local shader (VS), or HS if LS-HS are merged. */
@@ -960,11 +1077,14 @@ struct si_context {
 	/* MSAA sample locations.
 	 * The first index is the sample index.
 	 * The second index is the coordinate: X, Y. */
-	float			sample_locations_1x[1][2];
-	float			sample_locations_2x[2][2];
-	float			sample_locations_4x[4][2];
-	float			sample_locations_8x[8][2];
-	float			sample_locations_16x[16][2];
+	struct {
+		float			x1[1][2];
+		float			x2[2][2];
+		float			x4[4][2];
+		float			x8[8][2];
+		float			x16[16][2];
+	} sample_positions;
+	struct pipe_resource *sample_pos_buffer;
 
 	/* Misc stats. */
 	unsigned			num_draw_calls;
@@ -986,11 +1106,16 @@ struct si_context {
 	unsigned			num_resident_handles;
 	uint64_t			num_alloc_tex_transfer_bytes;
 	unsigned			last_tex_ps_draw_ratio; /* for query */
+	unsigned			compute_num_verts_accepted;
+	unsigned			compute_num_verts_rejected;
+	unsigned			compute_num_verts_ineligible; /* due to low vertex count */
+	unsigned			context_roll;
 
 	/* Queries. */
 	/* Maintain the list of active queries for pausing between IBs. */
 	int				num_occlusion_queries;
 	int				num_perfect_occlusion_queries;
+	int				num_pipeline_stat_queries;
 	struct list_head		active_queries;
 	unsigned			num_cs_dw_queries_suspend;
 
@@ -1000,6 +1125,12 @@ struct si_context {
 	bool				render_cond_invert;
 	bool				render_cond_force_off; /* for u_blitter */
 
+	/* For uploading data via GTT and copying it to VRAM via SDMA on context flush. */
+	bool				sdma_uploads_in_progress;
+	struct si_sdma_upload		*sdma_uploads;
+	unsigned			num_sdma_uploads;
+	unsigned			max_sdma_uploads;
+
 	/* Statistics gathering for the DCC enablement heuristic. It can't be
 	 * in si_texture because si_texture can be shared by multiple
 	 * contexts. This is for back buffers only. We shouldn't get too many
@@ -1028,9 +1159,6 @@ struct si_context {
 			 unsigned src_level,
 			 const struct pipe_box *src_box);
 
-	void (*dma_clear_buffer)(struct si_context *sctx, struct pipe_resource *dst,
-				 uint64_t offset, uint64_t size, unsigned value);
-
 	struct si_tracked_regs			tracked_regs;
 };
 
@@ -1070,17 +1198,17 @@ bool si_rings_is_buffer_referenced(struct si_context *sctx,
 				   struct pb_buffer *buf,
 				   enum radeon_bo_usage usage);
 void *si_buffer_map_sync_with_rings(struct si_context *sctx,
-				    struct r600_resource *resource,
+				    struct si_resource *resource,
 				    unsigned usage);
 void si_init_resource_fields(struct si_screen *sscreen,
-			     struct r600_resource *res,
+			     struct si_resource *res,
 			     uint64_t size, unsigned alignment);
 bool si_alloc_resource(struct si_screen *sscreen,
-		       struct r600_resource *res);
+		       struct si_resource *res);
 struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
 						 unsigned flags, unsigned usage,
 						 unsigned size, unsigned alignment);
-struct r600_resource *si_aligned_buffer_create(struct pipe_screen *screen,
+struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen,
 					       unsigned flags, unsigned usage,
 					       unsigned size, unsigned alignment);
 void si_replace_buffer_storage(struct pipe_context *ctx,
@@ -1097,6 +1225,32 @@ void vi_dcc_clear_level(struct si_context *sctx,
 			unsigned level, unsigned clear_value);
 void si_init_clear_functions(struct si_context *sctx);
 
+/* si_compute_blit.c */
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy);
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher,
+		     bool force_cpdma);
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_compute_copy_image(struct si_context *sctx,
+			   struct pipe_resource *dst,
+			   unsigned dst_level,
+			   struct pipe_resource *src,
+			   unsigned src_level,
+			   unsigned dstx, unsigned dsty, unsigned dstz,
+			   const struct pipe_box *src_box);
+void si_compute_clear_render_target(struct pipe_context *ctx,
+                                    struct pipe_surface *dstsurf,
+                                    const union pipe_color_union *color,
+                                    unsigned dstx, unsigned dsty,
+                                    unsigned width, unsigned height,
+				    bool render_condition_enabled);
+void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
+void si_init_compute_blit_functions(struct si_context *sctx);
+
 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
@@ -1109,24 +1263,26 @@ void si_init_clear_functions(struct si_context *sctx);
 			   SI_CPDMA_SKIP_GFX_SYNC | \
 			   SI_CPDMA_SKIP_BO_LIST_UPDATE)
 
-enum si_coherency {
-	SI_COHERENCY_NONE, /* no cache flushes needed */
-	SI_COHERENCY_SHADER,
-	SI_COHERENCY_CB_META,
-};
-
 void si_cp_dma_wait_for_idle(struct si_context *sctx);
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher);
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
-		    unsigned user_flags);
+void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
+			    struct pipe_resource *dst, uint64_t offset,
+			    uint64_t size, unsigned value, unsigned user_flags,
+			    enum si_coherency coher, enum si_cache_policy cache_policy);
+void si_cp_dma_copy_buffer(struct si_context *sctx,
+			   struct pipe_resource *dst, struct pipe_resource *src,
+			   uint64_t dst_offset, uint64_t src_offset, unsigned size,
+			   unsigned user_flags, enum si_coherency coher,
+			   enum si_cache_policy cache_policy);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
-void si_init_cp_dma_functions(struct si_context *sctx);
+void si_test_gds(struct si_context *sctx);
+void si_cp_write_data(struct si_context *sctx, struct si_resource *buf,
+		      unsigned offset, unsigned size, unsigned dst_sel,
+		      unsigned engine, const void *data);
+void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs,
+		     unsigned dst_sel, struct si_resource *dst, unsigned dst_offset,
+		     unsigned src_sel, struct si_resource *src, unsigned src_offset);
 
 /* si_debug.c */
 void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
@@ -1140,35 +1296,38 @@ void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
 void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct si_context *sctx,
 			struct radeon_saved_cs *saved, enum ring_type ring);
-bool si_replace_shader(unsigned num, struct ac_shader_binary *binary);
+bool si_replace_shader(unsigned num, struct si_shader_binary *binary);
 
 /* si_dma.c */
 void si_init_dma_functions(struct si_context *sctx);
 
 /* si_dma_cs.c */
+void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst,
+			   uint64_t offset);
+void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+			  uint64_t offset, uint64_t size, unsigned clear_value);
 void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
-		       struct r600_resource *dst, struct r600_resource *src);
+		       struct si_resource *dst, struct si_resource *src);
 void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence);
 void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst,
 			    uint64_t offset, uint64_t size, unsigned value);
 
 /* si_fence.c */
-void si_gfx_write_event_eop(struct si_context *ctx,
-			    unsigned event, unsigned event_flags,
-			    unsigned data_sel,
-			    struct r600_resource *buf, uint64_t va,
-			    uint32_t new_fence, unsigned query_type);
-unsigned si_gfx_write_fence_dwords(struct si_screen *screen);
-void si_gfx_wait_fence(struct si_context *ctx,
-		       uint64_t va, uint32_t ref, uint32_t mask);
+void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
+		       unsigned event, unsigned event_flags,
+		       unsigned dst_sel, unsigned int_sel, unsigned data_sel,
+		       struct si_resource *buf, uint64_t va,
+		       uint32_t new_fence, unsigned query_type);
+unsigned si_cp_write_fence_dwords(struct si_screen *screen);
+void si_cp_wait_mem(struct si_context *ctx, struct radeon_cmdbuf *cs,
+		      uint64_t va, uint32_t ref, uint32_t mask, unsigned flags);
 void si_init_fence_functions(struct si_context *ctx);
 void si_init_screen_fence_functions(struct si_screen *screen);
 struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
 					  struct tc_unflushed_batch_token *tc_token);
 
 /* si_get.c */
-const char *si_get_family_name(const struct si_screen *sscreen);
 void si_init_screen_get_functions(struct si_screen *sscreen);
 
 /* si_gfx_cs.c */
@@ -1176,21 +1335,46 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence);
 void si_begin_new_gfx_cs(struct si_context *ctx);
 void si_need_gfx_cs_space(struct si_context *ctx);
+void si_unref_sdma_uploads(struct si_context *sctx);
 
-/* r600_gpu_load.c */
+/* si_gpu_load.c */
 void si_gpu_load_kill_thread(struct si_screen *sscreen);
 uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
 unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
 			uint64_t begin);
 
 /* si_compute.c */
+void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs);
+unsigned si_get_compute_resource_limits(struct si_screen *sscreen,
+					unsigned waves_per_threadgroup,
+					unsigned max_waves_per_sh,
+					unsigned threadgroups_per_cu);
 void si_init_compute_functions(struct si_context *sctx);
 
-/* r600_perfcounters.c */
-void si_perfcounters_destroy(struct si_screen *sscreen);
+/* si_compute_prim_discard.c */
+enum si_prim_discard_outcome {
+	SI_PRIM_DISCARD_ENABLED,
+	SI_PRIM_DISCARD_DISABLED,
+	SI_PRIM_DISCARD_DRAW_SPLIT,
+};
+
+void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
+enum si_prim_discard_outcome
+si_prepare_prim_discard_or_split_draw(struct si_context *sctx,
+				      const struct pipe_draw_info *info,
+				      bool primitive_restart);
+void si_compute_signal_gfx(struct si_context *sctx);
+void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
+					  const struct pipe_draw_info *info,
+					  unsigned index_size,
+					  unsigned base_vertex,
+					  uint64_t input_indexbuf_va,
+					  unsigned input_indexbuf_max_elements);
+void si_initialize_prim_discard_tunables(struct si_context *sctx);
 
 /* si_perfcounters.c */
 void si_init_perfcounters(struct si_screen *screen);
+void si_destroy_perfcounters(struct si_screen *screen);
 
 /* si_pipe.c */
 bool si_check_device_reset(struct si_context *sctx);
@@ -1201,9 +1385,26 @@ void si_init_query_functions(struct si_context *sctx);
 void si_suspend_queries(struct si_context *sctx);
 void si_resume_queries(struct si_context *sctx);
 
+/* si_shaderlib_tgsi.c */
+void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
+			unsigned num_layers);
+void *si_create_fixed_func_tcs(struct si_context *sctx);
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+				   unsigned num_dwords_per_thread,
+				   bool dst_stream_cache_policy, bool is_copy);
+void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
+void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
+void *si_clear_render_target_shader(struct pipe_context *ctx);
+void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx);
+void *si_create_dcc_retile_cs(struct pipe_context *ctx);
+void *si_create_query_result_cs(struct si_context *sctx);
+
 /* si_test_dma.c */
 void si_test_dma(struct si_screen *sscreen);
 
+/* si_test_clearbuffer.c */
+void si_test_dma_perf(struct si_screen *sscreen);
+
 /* si_uvd.c */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
 					       const struct pipe_video_codec *templ);
@@ -1267,13 +1468,13 @@ void si_init_context_texture_functions(struct si_context *sctx);
  * common helpers
  */
 
-static inline struct r600_resource *r600_resource(struct pipe_resource *r)
+static inline struct si_resource *si_resource(struct pipe_resource *r)
 {
-	return (struct r600_resource*)r;
+	return (struct si_resource*)r;
 }
 
 static inline void
-r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res)
+si_resource_reference(struct si_resource **ptr, struct si_resource *res)
 {
 	pipe_resource_reference((struct pipe_resource **)ptr,
 				(struct pipe_resource *)res);
@@ -1300,13 +1501,24 @@ si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil)
 		return tex->surface.u.legacy.tiling_index[level];
 }
 
+static inline unsigned
+si_get_minimum_num_gfx_cs_dwords(struct si_context *sctx)
+{
+	/* Don't count the needed CS space exactly and just use an upper bound.
+	 *
+	 * Also reserve space for stopping queries at the end of IB, because
+	 * the number of active queries is unlimited in theory.
+	 */
+	return 2048 + sctx->num_cs_dw_queries_suspend;
+}
+
 static inline void
 si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
 {
 	if (r) {
 		/* Add memory usage for need_gfx_cs_space */
-		sctx->vram += r600_resource(r)->vram_usage;
-		sctx->gtt += r600_resource(r)->gart_usage;
+		sctx->vram += si_resource(r)->vram_usage;
+		sctx->gtt += si_resource(r)->gart_usage;
 	}
 }
 
@@ -1314,6 +1526,7 @@ static inline void
 si_invalidate_draw_sh_constants(struct si_context *sctx)
 {
 	sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN;
+	sctx->last_instance_count = SI_INSTANCE_COUNT_UNKNOWN;
 }
 
 static inline unsigned
@@ -1409,7 +1622,7 @@ si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src)
 
 static inline void
 si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
-			   bool shaders_read_metadata)
+			   bool shaders_read_metadata, bool dcc_pipe_aligned)
 {
 	sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
 		       SI_CONTEXT_INV_VMEM_L1;
@@ -1419,12 +1632,13 @@ si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
 		 * L2 metadata must be flushed if shaders read metadata.
 		 * (DCC, CMASK).
 		 */
-		if (num_samples >= 2)
+		if (num_samples >= 2 ||
+		    (shaders_read_metadata && !dcc_pipe_aligned))
 			sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
 		else if (shaders_read_metadata)
 			sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
 	} else {
-		/* SI-CI-VI */
+		/* GFX6-GFX8 */
 		sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
 	}
 }
@@ -1446,7 +1660,7 @@ si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
 		else if (shaders_read_metadata)
 			sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
 	} else {
-		/* SI-CI-VI */
+		/* GFX6-GFX8 */
 		sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
 	}
 }
@@ -1551,15 +1765,15 @@ radeon_cs_memory_below_limit(struct si_screen *screen,
  */
 static inline void radeon_add_to_buffer_list(struct si_context *sctx,
 					     struct radeon_cmdbuf *cs,
-					     struct r600_resource *rbo,
+					     struct si_resource *bo,
 					     enum radeon_bo_usage usage,
 					     enum radeon_bo_priority priority)
 {
 	assert(usage);
 	sctx->ws->cs_add_buffer(
-		cs, rbo->buf,
+		cs, bo->buf,
 		(enum radeon_bo_usage)(usage | RADEON_USAGE_SYNCHRONIZED),
-		rbo->domains, priority);
+		bo->domains, priority);
 }
 
 /**
@@ -1581,18 +1795,23 @@ static inline void radeon_add_to_buffer_list(struct si_context *sctx,
  */
 static inline void
 radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
-					struct r600_resource *rbo,
+					struct si_resource *bo,
 					enum radeon_bo_usage usage,
 					enum radeon_bo_priority priority,
 					bool check_mem)
 {
 	if (check_mem &&
 	    !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs,
-					  sctx->vram + rbo->vram_usage,
-					  sctx->gtt + rbo->gart_usage))
+					  sctx->vram + bo->vram_usage,
+					  sctx->gtt + bo->gart_usage))
 		si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
-	radeon_add_to_buffer_list(sctx, sctx->gfx_cs, rbo, usage, priority);
+	radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority);
+}
+
+static inline bool si_compute_prim_discard_enabled(struct si_context *sctx)
+{
+	return sctx->prim_discard_vertex_count_threshold != UINT_MAX;
 }
 
 #define PRINT_ERR(fmt, args...) \