X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Ffreedreno%2Fvulkan%2Ftu_private.h;h=590ce8971085e2f3201eecea8ba17391f2f063f0;hb=aab3398b33779be8b7e4f70edf039fcf5ebd06e4;hp=1c8bb87e47b7bfaa208795d01fc9e13022b9fa7c;hpb=24af64baa531f9ac490bc5de433b2b2b52a3ccee;p=mesa.git diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index 1c8bb87e47b..590ce897108 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -40,11 +40,10 @@ #include #define VG(x) x #else -#define VG(x) +#define VG(x) ((void)0) #endif #include "c11/threads.h" -#include "compiler/shader_enums.h" #include "main/macros.h" #include "util/list.h" #include "util/macros.h" @@ -52,13 +51,14 @@ #include "vk_debug_report.h" #include "wsi_common.h" -#include "drm/msm_drm.h" +#include "drm-uapi/msm_drm.h" #include "ir3/ir3_compiler.h" #include "ir3/ir3_shader.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" #include "a6xx.xml.h" +#include "fdl/freedreno_layout.h" #include "tu_descriptor_set.h" #include "tu_extensions.h" @@ -77,6 +77,8 @@ typedef uint32_t xcb_window_t; #include "tu_entrypoints.h" +#include "vk_format.h" + #define MAX_VBS 32 #define MAX_VERTEX_ATTRIBS 32 #define MAX_RTS 8 @@ -90,85 +92,23 @@ typedef uint32_t xcb_window_t; #define MAX_DYNAMIC_STORAGE_BUFFERS 8 #define MAX_DYNAMIC_BUFFERS \ (MAX_DYNAMIC_UNIFORM_BUFFERS + MAX_DYNAMIC_STORAGE_BUFFERS) -#define MAX_SAMPLES_LOG2 4 -#define NUM_META_FS_KEYS 13 #define TU_MAX_DRM_DEVICES 8 #define MAX_VIEWS 8 - -#define NUM_DEPTH_CLEAR_PIPELINES 3 - -/* - * This is the point we switch from using CP to compute shader - * for certain buffer operations. +#define MAX_BIND_POINTS 2 /* compute + graphics */ +/* The Qualcomm driver exposes 0x20000058 */ +#define MAX_STORAGE_BUFFER_RANGE 0x20000000 +/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so + * expose the same maximum range. + * TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual + * range might be higher. */ -#define TU_BUFFER_OPS_CS_THRESHOLD 4096 +#define MAX_UNIFORM_BUFFER_RANGE 0x10000 -enum tu_mem_heap -{ - TU_MEM_HEAP_VRAM, - TU_MEM_HEAP_VRAM_CPU_ACCESS, - TU_MEM_HEAP_GTT, - TU_MEM_HEAP_COUNT -}; - -enum tu_mem_type -{ - TU_MEM_TYPE_VRAM, - TU_MEM_TYPE_GTT_WRITE_COMBINE, - TU_MEM_TYPE_VRAM_CPU_ACCESS, - TU_MEM_TYPE_GTT_CACHED, - TU_MEM_TYPE_COUNT -}; +#define A6XX_TEX_CONST_DWORDS 16 +#define A6XX_TEX_SAMP_DWORDS 4 #define tu_printflike(a, b) __attribute__((__format__(__printf__, a, b))) -static inline uint32_t -align_u32(uint32_t v, uint32_t a) -{ - assert(a != 0 && a == (a & -a)); - return (v + a - 1) & ~(a - 1); -} - -static inline uint32_t -align_u32_npot(uint32_t v, uint32_t a) -{ - return (v + a - 1) / a * a; -} - -static inline uint64_t -align_u64(uint64_t v, uint64_t a) -{ - assert(a != 0 && a == (a & -a)); - return (v + a - 1) & ~(a - 1); -} - -static inline int32_t -align_i32(int32_t v, int32_t a) -{ - assert(a != 0 && a == (a & -a)); - return (v + a - 1) & ~(a - 1); -} - -/** Alignment must be a power of 2. 
*/ -static inline bool -tu_is_aligned(uintmax_t n, uintmax_t a) -{ - assert(a == (a & -a)); - return (n & (a - 1)) == 0; -} - -static inline uint32_t -round_up_u32(uint32_t v, uint32_t a) -{ - return (v + a - 1) / a; -} - -static inline uint64_t -round_up_u64(uint64_t v, uint64_t a) -{ - return (v + a - 1) / a; -} - static inline uint32_t tu_minify(uint32_t n, uint32_t levels) { @@ -177,29 +117,6 @@ tu_minify(uint32_t n, uint32_t levels) else return MAX2(n >> levels, 1); } -static inline float -tu_clamp_f(float f, float min, float max) -{ - assert(min < max); - - if (f > max) - return max; - else if (f < min) - return min; - else - return f; -} - -static inline bool -tu_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) -{ - if (*inout_mask & clear_mask) { - *inout_mask &= ~clear_mask; - return true; - } else { - return false; - } -} #define for_each_bit(b, dword) \ for (uint32_t __dword = (dword); \ @@ -211,6 +128,8 @@ tu_clear_mask(uint32_t *inout_mask, uint32_t clear_mask) memcpy((dest), (src), (count) * sizeof(*(src))); \ }) +#define COND(bool, val) ((bool) ? (val) : 0) + /* Whenever we generate an error, pass it through this function. Useful for * debugging, where we can break on it. Only call at error site, not when * propagating errors. Might be useful to plug in a stack trace here. @@ -237,11 +156,7 @@ __tu_finishme(const char *file, int line, const char *format, ...) void tu_loge(const char *format, ...) tu_printflike(1, 2); void -tu_loge_v(const char *format, va_list va); -void tu_logi(const char *format, ...) tu_printflike(1, 2); -void -tu_logi_v(const char *format, va_list va); /** * Print a FINISHME message, including its source location. @@ -255,17 +170,6 @@ tu_logi_v(const char *format, va_list va); } \ } while (0) -/* A non-fatal assert. Useful for debugging. */ -#ifdef DEBUG -#define tu_assert(x) \ - ({ \ - if (unlikely(!(x))) \ - fprintf(stderr, "%s:%d ASSERT: %s\n", __FILE__, __LINE__, #x); \ - }) -#else -#define tu_assert(x) -#endif - /* Suppress -Wunused in stub functions */ #define tu_use_args(...) __tu_use_args(0, ##__VA_ARGS__) static inline void @@ -306,8 +210,20 @@ struct tu_physical_device unsigned gpu_id; uint32_t gmem_size; + uint64_t gmem_base; + uint32_t ccu_offset_gmem; + uint32_t ccu_offset_bypass; + /* alignment for size of tiles */ uint32_t tile_align_w; - uint32_t tile_align_h; +#define TILE_ALIGN_H 16 + /* gmem store/load granularity */ +#define GMEM_ALIGN_W 16 +#define GMEM_ALIGN_H 4 + + struct { + uint32_t PC_UNKNOWN_9805; + uint32_t SP_UNKNOWN_A0F8; + } magic; /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. 
@@ -322,6 +238,10 @@ enum tu_debug_flags TU_DEBUG_STARTUP = 1 << 0, TU_DEBUG_NIR = 1 << 1, TU_DEBUG_IR3 = 1 << 2, + TU_DEBUG_NOBIN = 1 << 3, + TU_DEBUG_SYSMEM = 1 << 4, + TU_DEBUG_FORCEBIN = 1 << 5, + TU_DEBUG_NOUBWC = 1 << 6, }; struct tu_instance @@ -374,39 +294,6 @@ struct tu_pipeline_key { }; -void -tu_pipeline_cache_init(struct tu_pipeline_cache *cache, - struct tu_device *device); -void -tu_pipeline_cache_finish(struct tu_pipeline_cache *cache); -void -tu_pipeline_cache_load(struct tu_pipeline_cache *cache, - const void *data, - size_t size); - -struct tu_shader_variant; - -bool -tu_create_shader_variants_from_pipeline_cache( - struct tu_device *device, - struct tu_pipeline_cache *cache, - const unsigned char *sha1, - struct tu_shader_variant **variants); - -void -tu_pipeline_cache_insert_shaders(struct tu_device *device, - struct tu_pipeline_cache *cache, - const unsigned char *sha1, - struct tu_shader_variant **variants, - const void *const *codes, - const unsigned *code_sizes); - -struct tu_meta_state -{ - VkAllocationCallbacks alloc; - - struct tu_pipeline_cache cache; -}; /* queue types */ #define TU_QUEUE_GENERAL 0 @@ -415,6 +302,7 @@ struct tu_meta_state struct tu_fence { + struct wsi_fence *fence_wsi; bool signaled; int fd; }; @@ -444,6 +332,14 @@ struct tu_queue struct tu_fence submit_fence; }; +struct tu_bo +{ + uint32_t gem_handle; + uint64_t size; + uint64_t iova; + void *map; +}; + struct tu_device { VK_LOADER_DATA _loader_data; @@ -452,8 +348,6 @@ struct tu_device struct tu_instance *instance; - struct tu_meta_state meta_state; - struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES]; int queue_count[TU_MAX_QUEUE_FAMILIES]; @@ -464,18 +358,25 @@ struct tu_device /* Backup in-memory cache to be used if the app doesn't provide one */ struct tu_pipeline_cache *mem_cache; - struct list_head shader_slabs; - mtx_t shader_slab_mutex; + struct tu_bo vsc_draw_strm; + struct tu_bo vsc_prim_strm; + uint32_t vsc_draw_strm_pitch; + uint32_t vsc_prim_strm_pitch; - struct tu_device_extension_table enabled_extensions; -}; +#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */ -struct tu_bo -{ - uint32_t gem_handle; - uint64_t size; - uint64_t iova; - void *map; + /* Currently the kernel driver uses a 32-bit GPU address space, but it + * should be impossible to go beyond 48 bits. + */ + struct { + struct tu_bo bo; + mtx_t construct_mtx; + bool initialized; + } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2]; + + struct tu_bo border_color; + + struct tu_device_extension_table enabled_extensions; }; VkResult @@ -492,6 +393,15 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo); VkResult tu_bo_map(struct tu_device *dev, struct tu_bo *bo); +/* Get a scratch bo for use inside a command buffer. This will always return + * the same bo given the same size or similar sizes, so only one scratch bo + * can be used at the same time. It's meant for short-lived things where we + * need to write to some piece of memory, read from it, and then immediately + * discard it. 
+ */ +VkResult +tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo); + struct tu_cs_entry { /* No ownership */ @@ -501,6 +411,11 @@ struct tu_cs_entry uint32_t offset; }; +struct ts_cs_memory { + uint32_t *map; + uint64_t iova; +}; + enum tu_cs_mode { @@ -541,6 +456,7 @@ struct tu_cs uint32_t *reserved_end; uint32_t *end; + struct tu_device *device; enum tu_cs_mode mode; uint32_t next_bo_size; @@ -551,6 +467,10 @@ struct tu_cs struct tu_bo **bos; uint32_t bo_count; uint32_t bo_capacity; + + /* state for cond_exec_start/cond_exec_end */ + uint32_t cond_flags; + uint32_t *cond_dwords; }; struct tu_device_memory @@ -576,11 +496,15 @@ struct tu_descriptor_range struct tu_descriptor_set { const struct tu_descriptor_set_layout *layout; + struct tu_descriptor_pool *pool; uint32_t size; uint64_t va; uint32_t *mapped_ptr; - struct tu_descriptor_range *dynamic_descriptors; + + uint32_t *dynamic_descriptors; + + struct tu_bo *buffers[0]; }; struct tu_push_descriptor_set @@ -598,7 +522,7 @@ struct tu_descriptor_pool_entry struct tu_descriptor_pool { - uint8_t *mapped_ptr; + struct tu_bo bo; uint64_t current_offset; uint64_t size; @@ -641,7 +565,6 @@ struct tu_descriptor_update_template_entry struct tu_descriptor_update_template { uint32_t entry_count; - VkPipelineBindPoint bind_point; struct tu_descriptor_update_template_entry entry[0]; }; @@ -656,6 +579,12 @@ struct tu_buffer VkDeviceSize bo_offset; }; +static inline uint64_t +tu_buffer_iova(struct tu_buffer *buffer) +{ + return buffer->bo->iova + buffer->bo_offset; +} + enum tu_dynamic_state_bits { TU_DYNAMIC_VIEWPORT = 1 << 0, @@ -668,7 +597,8 @@ enum tu_dynamic_state_bits TU_DYNAMIC_STENCIL_WRITE_MASK = 1 << 7, TU_DYNAMIC_STENCIL_REFERENCE = 1 << 8, TU_DYNAMIC_DISCARD_RECTANGLE = 1 << 9, - TU_DYNAMIC_ALL = (1 << 10) - 1, + TU_DYNAMIC_SAMPLE_LOCATIONS = 1 << 10, + TU_DYNAMIC_ALL = (1 << 11) - 1, }; struct tu_vertex_binding @@ -753,27 +683,14 @@ tu_get_debug_option_name(int id); const char * tu_get_perftest_option_name(int id); -/** - * Attachment state when recording a renderpass instance. - * - * The clear value is valid only if there exists a pending clear. 
- */ -struct tu_attachment_state -{ - VkImageAspectFlags pending_clear_aspects; - uint32_t cleared_views; - VkClearValue clear_value; - VkImageLayout current_layout; -}; - struct tu_descriptor_state { struct tu_descriptor_set *sets[MAX_SETS]; - uint32_t dirty; uint32_t valid; struct tu_push_descriptor_set push_set; bool push_dirty; - uint32_t dynamic_buffers[4 * MAX_DYNAMIC_BUFFERS]; + uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS]; + uint32_t input_attachments[MAX_RTS * A6XX_TEX_CONST_DWORDS]; }; struct tu_tile @@ -787,16 +704,12 @@ struct tu_tile struct tu_tiling_config { VkRect2D render_area; - uint32_t buffer_cpp[MAX_RTS + 2]; - uint32_t buffer_count; /* position and size of the first tile */ VkRect2D tile0; /* number of tiles */ VkExtent2D tile_count; - uint32_t gmem_offsets[MAX_RTS + 2]; - /* size of the first VSC pipe */ VkExtent2D pipe0; /* number of VSC pipes */ @@ -805,17 +718,146 @@ struct tu_tiling_config /* pipe register values */ uint32_t pipe_config[MAX_VSC_PIPES]; uint32_t pipe_sizes[MAX_VSC_PIPES]; + + /* Whether sysmem rendering must be used */ + bool force_sysmem; }; enum tu_cmd_dirty_bits { TU_CMD_DIRTY_PIPELINE = 1 << 0, - TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1, + TU_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1, + TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 2, + + TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3, + TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 4, + TU_CMD_DIRTY_SHADER_CONSTS = 1 << 5, + TU_CMD_DIRTY_STREAMOUT_BUFFERS = 1 << 6, TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16, TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17, TU_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK = 1 << 18, TU_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE = 1 << 19, + TU_CMD_DIRTY_DYNAMIC_VIEWPORT = 1 << 20, + TU_CMD_DIRTY_DYNAMIC_SCISSOR = 1 << 21, +}; + +struct tu_streamout_state { + uint16_t stride[IR3_MAX_SO_BUFFERS]; + uint32_t ncomp[IR3_MAX_SO_BUFFERS]; + uint32_t prog[IR3_MAX_SO_OUTPUTS * 2]; + uint32_t prog_count; + uint32_t vpc_so_buf_cntl; +}; + +/* There are only three cache domains we have to care about: the CCU, or + * color cache unit, which is used for color and depth/stencil attachments + * and copy/blit destinations, and is split conceptually into color and depth, + * and the universal cache or UCHE which is used for pretty much everything + * else, except for the CP (uncached) and host. We need to flush whenever data + * crosses these boundaries. + */ + +enum tu_cmd_access_mask { + TU_ACCESS_UCHE_READ = 1 << 0, + TU_ACCESS_UCHE_WRITE = 1 << 1, + TU_ACCESS_CCU_COLOR_READ = 1 << 2, + TU_ACCESS_CCU_COLOR_WRITE = 1 << 3, + TU_ACCESS_CCU_DEPTH_READ = 1 << 4, + TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5, + + /* Experiments have shown that while it's safe to avoid flushing the CCU + * after each blit/renderpass, it's not safe to assume that subsequent + * lookups with a different attachment state will hit unflushed cache + * entries. That is, the CCU needs to be flushed and possibly invalidated + * when accessing memory with a different attachment state. Writing to an + * attachment under the following conditions after clearing using the + * normal 2d engine path is known to have issues: + * + * - It isn't the 0'th layer. + * - There are more than one attachment, and this isn't the 0'th attachment + * (this seems to also depend on the cpp of the attachments). + * + * Our best guess is that the layer/MRT state is used when computing + * the location of a cache entry in CCU, to avoid conflicts. 
We assume that + * any access in a renderpass after or before an access by a transfer needs + * a flush/invalidate, and use the _INCOHERENT variants to represent access + * by a transfer. + */ + TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6, + TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7, + TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8, + TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9, + + TU_ACCESS_SYSMEM_READ = 1 << 10, + TU_ACCESS_SYSMEM_WRITE = 1 << 11, + + /* Set if a WFI is required due to data being read by the CP or the 2D + * engine. + */ + TU_ACCESS_WFI_READ = 1 << 12, + + TU_ACCESS_READ = + TU_ACCESS_UCHE_READ | + TU_ACCESS_CCU_COLOR_READ | + TU_ACCESS_CCU_DEPTH_READ | + TU_ACCESS_CCU_COLOR_INCOHERENT_READ | + TU_ACCESS_CCU_DEPTH_INCOHERENT_READ | + TU_ACCESS_SYSMEM_READ, + + TU_ACCESS_WRITE = + TU_ACCESS_UCHE_WRITE | + TU_ACCESS_CCU_COLOR_WRITE | + TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE | + TU_ACCESS_CCU_DEPTH_WRITE | + TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE | + TU_ACCESS_SYSMEM_WRITE, + + TU_ACCESS_ALL = + TU_ACCESS_READ | + TU_ACCESS_WRITE, +}; + +enum tu_cmd_flush_bits { + TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0, + TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1, + TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2, + TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3, + TU_CMD_FLAG_CACHE_FLUSH = 1 << 4, + TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5, + + TU_CMD_FLAG_ALL_FLUSH = + TU_CMD_FLAG_CCU_FLUSH_DEPTH | + TU_CMD_FLAG_CCU_FLUSH_COLOR | + TU_CMD_FLAG_CACHE_FLUSH, + + TU_CMD_FLAG_ALL_INVALIDATE = + TU_CMD_FLAG_CCU_INVALIDATE_DEPTH | + TU_CMD_FLAG_CCU_INVALIDATE_COLOR | + TU_CMD_FLAG_CACHE_INVALIDATE, + + TU_CMD_FLAG_WFI = 1 << 6, +}; + +/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty + * heavy, involving a CCU cache flush/invalidate and a WFI in order to change + * which part of the gmem is used by the CCU. Here we keep track of what the + * state of the CCU. + */ +enum tu_cmd_ccu_state { + TU_CMD_CCU_SYSMEM, + TU_CMD_CCU_GMEM, + TU_CMD_CCU_UNKNOWN, +}; + +struct tu_cache_state { + /* Caches which must be made available (flushed) eventually if there are + * any users outside that cache domain, and caches which must be + * invalidated eventually if there are any reads. + */ + enum tu_cmd_flush_bits pending_flush_bits; + /* Pending flushes */ + enum tu_cmd_flush_bits flush_bits; }; struct tu_cmd_state @@ -823,6 +865,7 @@ struct tu_cmd_state uint32_t dirty; struct tu_pipeline *pipeline; + struct tu_pipeline *compute_pipeline; /* Vertex buffers */ struct @@ -833,6 +876,17 @@ struct tu_cmd_state struct tu_dynamic_state dynamic; + /* Stream output buffers */ + struct + { + struct tu_buffer *buffers[IR3_MAX_SO_BUFFERS]; + VkDeviceSize offsets[IR3_MAX_SO_BUFFERS]; + VkDeviceSize sizes[IR3_MAX_SO_BUFFERS]; + } streamout_buf; + + uint8_t streamout_reset; + uint8_t streamout_enabled; + /* Index buffer */ struct tu_buffer *index_buffer; uint64_t index_offset; @@ -840,14 +894,23 @@ struct tu_cmd_state uint32_t max_index_count; uint64_t index_va; + /* Renderpasses are tricky, because we may need to flush differently if + * using sysmem vs. gmem and therefore we have to delay any flushing that + * happens before a renderpass. So we have to have two copies of the flush + * state, one for intra-renderpass flushes (i.e. renderpass dependencies) + * and one for outside a renderpass. 
+ */ + struct tu_cache_state cache; + struct tu_cache_state renderpass_cache; + + enum tu_cmd_ccu_state ccu_state; + const struct tu_render_pass *pass; const struct tu_subpass *subpass; const struct tu_framebuffer *framebuffer; - struct tu_attachment_state *attachments; struct tu_tiling_config tiling_config; - struct tu_cs_entry tile_load_ib; struct tu_cs_entry tile_store_ib; }; @@ -898,6 +961,28 @@ tu_bo_list_add(struct tu_bo_list *list, VkResult tu_bo_list_merge(struct tu_bo_list *list, const struct tu_bo_list *other); +/* This struct defines the layout of the scratch_bo */ +struct tu6_control +{ + uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */ + uint32_t _pad0; + volatile uint32_t vsc_overflow; + uint32_t _pad1; + /* flag set from cmdstream when VSC overflow detected: */ + uint32_t vsc_scratch; + uint32_t _pad2; + uint32_t _pad3; + uint32_t _pad4; + + /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ + struct { + uint32_t offset; + uint32_t pad[7]; + } flush_base[4]; +}; + +#define ctrl_offset(member) offsetof(struct tu6_control, member) + struct tu_cmd_buffer { VK_LOADER_DATA _loader_data; @@ -913,13 +998,14 @@ struct tu_cmd_buffer struct tu_cmd_state state; struct tu_vertex_binding vertex_bindings[MAX_VBS]; + uint32_t vertex_bindings_set; uint32_t queue_family_index; - uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE]; + uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4]; VkShaderStageFlags push_constant_stages; struct tu_descriptor_set meta_push_descriptors; - struct tu_descriptor_state descriptors[VK_PIPELINE_BIND_POINT_RANGE_SIZE]; + struct tu_descriptor_state descriptors[MAX_BIND_POINTS]; struct tu_cmd_buffer_upload upload; @@ -928,58 +1014,56 @@ struct tu_cmd_buffer struct tu_bo_list bo_list; struct tu_cs cs; struct tu_cs draw_cs; - struct tu_cs tile_cs; - - uint16_t marker_reg; - uint32_t marker_seqno; + struct tu_cs draw_epilogue_cs; + struct tu_cs sub_cs; struct tu_bo scratch_bo; - uint32_t scratch_seqno; - bool wait_for_idle; + struct tu_bo vsc_draw_strm; + struct tu_bo vsc_prim_strm; + uint32_t vsc_draw_strm_pitch; + uint32_t vsc_prim_strm_pitch; + bool use_vsc_data; +}; + +/* Temporary struct for tracking a register state to be written, used by + * a6xx-pack.h and tu_cs_emit_regs() + */ +struct tu_reg_value { + uint32_t reg; + uint64_t value; + bool is_address; + struct tu_bo *bo; + bool bo_write; + uint32_t bo_offset; + uint32_t bo_shift; }; + +void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer, + struct tu_cs *cs); + +void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer, + struct tu_cs *cs, + enum tu_cmd_ccu_state ccu_state); + void tu6_emit_event_write(struct tu_cmd_buffer *cmd, struct tu_cs *cs, - enum vgt_event_type event, - bool need_seqno); + enum vgt_event_type event); -bool -tu_get_memory_fd(struct tu_device *device, - struct tu_device_memory *memory, - int *pFD); - -/* - * Takes x,y,z as exact numbers of invocations, instead of blocks. - * - * Limitations: Can't call normal dispatch functions without binding or - * rebinding - * the compute pipeline. 
- */ -void -tu_unaligned_dispatch(struct tu_cmd_buffer *cmd_buffer, - uint32_t x, - uint32_t y, - uint32_t z); +static inline struct tu_descriptor_state * +tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer, + VkPipelineBindPoint bind_point) +{ + return &cmd_buffer->descriptors[bind_point]; +} struct tu_event { - uint64_t *map; + struct tu_bo bo; }; -struct tu_shader_module; - -#define TU_HASH_SHADER_IS_GEOM_COPY_SHADER (1 << 0) -#define TU_HASH_SHADER_SISCHED (1 << 1) -#define TU_HASH_SHADER_UNSAFE_MATH (1 << 2) -void -tu_hash_shaders(unsigned char *hash, - const VkPipelineShaderStageCreateInfo **stages, - const struct tu_pipeline_layout *layout, - const struct tu_pipeline_key *key, - uint32_t flags); - static inline gl_shader_stage vk_to_mesa_shader_stage(VkShaderStageFlagBits vk_stage) { @@ -1000,6 +1084,11 @@ mesa_to_vk_shader_stage(gl_shader_stage mesa_stage) __tmp = (gl_shader_stage)((stage_bits) &TU_STAGE_MASK); \ stage = __builtin_ffs(__tmp) - 1, __tmp; __tmp &= ~(1 << (stage))) +uint32_t +tu6_stage2opcode(gl_shader_stage type); +enum a6xx_state_block +tu6_stage2shadersb(gl_shader_stage type); + struct tu_shader_module { unsigned char sha1[20]; @@ -1008,33 +1097,26 @@ struct tu_shader_module const uint32_t *code[0]; }; -struct tu_shader_compile_options +struct tu_push_constant_range { - struct ir3_shader_key key; - - bool optimize; - bool include_binning_pass; + uint32_t lo; + uint32_t count; }; struct tu_shader { - struct ir3_shader ir3_shader; + struct ir3_shader *ir3_shader; - /* This may be true for vertex shaders. When true, variants[1] is the - * binning variant and binning_binary is non-NULL. - */ - bool has_binning_pass; - - void *binary; - void *binning_binary; - - struct ir3_shader_variant variants[0]; + struct tu_push_constant_range push_consts; + unsigned attachment_idx[MAX_RTS]; + uint8_t active_desc_sets; }; struct tu_shader * tu_shader_create(struct tu_device *dev, gl_shader_stage stage, const VkPipelineShaderStageCreateInfo *stage_info, + struct tu_pipeline_layout *layout, const VkAllocationCallbacks *alloc); void @@ -1042,17 +1124,15 @@ tu_shader_destroy(struct tu_device *dev, struct tu_shader *shader, const VkAllocationCallbacks *alloc); -void -tu_shader_compile_options_init( - struct tu_shader_compile_options *options, - const VkGraphicsPipelineCreateInfo *pipeline_info); +struct tu_program_descriptor_linkage +{ + struct ir3_ubo_analysis_state ubo_state; + struct ir3_const_state const_state; -VkResult -tu_shader_compile(struct tu_device *dev, - struct tu_shader *shader, - const struct tu_shader *next_stage, - const struct tu_shader_compile_options *options, - const VkAllocationCallbacks *alloc); + uint32_t constlen; + + struct tu_push_constant_range push_consts; +}; struct tu_pipeline { @@ -1064,28 +1144,30 @@ struct tu_pipeline bool need_indirect_descriptor_sets; VkShaderStageFlags active_stages; + uint32_t active_desc_sets; + + struct tu_streamout_state streamout; struct { struct tu_bo binary_bo; struct tu_cs_entry state_ib; struct tu_cs_entry binning_state_ib; + + struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES]; + unsigned input_attachment_idx[MAX_RTS]; } program; struct { - uint8_t bindings[MAX_VERTEX_ATTRIBS]; - uint16_t strides[MAX_VERTEX_ATTRIBS]; - uint16_t offsets[MAX_VERTEX_ATTRIBS]; - uint32_t count; - - uint8_t binning_bindings[MAX_VERTEX_ATTRIBS]; - uint16_t binning_strides[MAX_VERTEX_ATTRIBS]; - uint16_t binning_offsets[MAX_VERTEX_ATTRIBS]; - uint32_t binning_count; + struct tu_cs_entry state_ib; + } load_state; + struct + { 
struct tu_cs_entry state_ib; struct tu_cs_entry binning_state_ib; + uint32_t bindings_used; } vi; struct @@ -1114,6 +1196,11 @@ struct tu_pipeline { struct tu_cs_entry state_ib; } blend; + + struct + { + uint32_t local_size[3]; + } compute; }; void @@ -1122,6 +1209,9 @@ tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport); void tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissor); +void +tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc); + void tu6_emit_gras_su_cntl(struct tu_cs *cs, uint32_t gras_su_cntl, @@ -1147,51 +1237,93 @@ tu6_emit_stencil_reference(struct tu_cs *cs, uint32_t front, uint32_t back); void tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4]); -struct tu_userdata_info * -tu_lookup_user_sgpr(struct tu_pipeline *pipeline, - gl_shader_stage stage, - int idx); +void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples); -struct tu_shader_variant * -tu_get_shader(struct tu_pipeline *pipeline, gl_shader_stage stage); +void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2); -struct tu_graphics_pipeline_create_info -{ - bool use_rectlist; - bool db_depth_clear; - bool db_stencil_clear; - bool db_depth_disable_expclear; - bool db_stencil_disable_expclear; - bool db_flush_depth_inplace; - bool db_flush_stencil_inplace; - bool db_resummarize; - uint32_t custom_blend_mode; +void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1); + +void +tu6_emit_xs_config(struct tu_cs *cs, + gl_shader_stage stage, + const struct ir3_shader_variant *xs, + uint64_t binary_iova); + +void +tu6_emit_vpc(struct tu_cs *cs, + const struct ir3_shader_variant *vs, + const struct ir3_shader_variant *gs, + const struct ir3_shader_variant *fs, + struct tu_streamout_state *tf); + +void +tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs); + +struct tu_image_view; + +void +tu_resolve_sysmem(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + struct tu_image_view *src, + struct tu_image_view *dst, + uint32_t layers, + const VkRect2D *rect); + +void +tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + const VkRenderPassBeginInfo *info); + +void +tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + const VkRenderPassBeginInfo *info); + +void +tu_load_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + bool force_load); + +/* expose this function to be able to emit load without checking LOAD_OP */ +void +tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a); + +/* note: gmem store can also resolve */ +void +tu_store_gmem_attachment(struct tu_cmd_buffer *cmd, + struct tu_cs *cs, + uint32_t a, + uint32_t gmem_a); + +enum tu_supported_formats { + FMT_VERTEX = 1, + FMT_TEXTURE = 2, + FMT_COLOR = 4, }; struct tu_native_format { - int vtx; /* VFMTn_xxx or -1 */ - int tex; /* TFMTn_xxx or -1 */ - int rb; /* RBn_xxx or -1 */ - int swap; /* enum a3xx_color_swap */ - bool present; /* internal only; always true to external users */ + enum a6xx_format fmt : 8; + enum a3xx_color_swap swap : 8; + enum a6xx_tile_mode tile_mode : 8; + enum tu_supported_formats supported : 8; }; -const struct tu_native_format * -tu6_get_native_format(VkFormat format); +struct tu_native_format tu6_format_vtx(VkFormat format); +struct tu_native_format tu6_format_color(VkFormat format, enum a6xx_tile_mode tile_mode); +struct tu_native_format 
tu6_format_texture(VkFormat format, enum a6xx_tile_mode tile_mode); -int -tu_pack_clear_value(const VkClearValue *val, - VkFormat format, - uint32_t buf[4]); -enum a6xx_2d_ifmt tu6_rb_fmt_to_ifmt(enum a6xx_color_fmt fmt); - -struct tu_image_level +static inline enum a6xx_format +tu6_base_format(VkFormat format) { - VkDeviceSize offset; - VkDeviceSize size; - uint32_t pitch; -}; + /* note: tu6_format_color doesn't care about tiling for .fmt field */ + return tu6_format_color(format, TILE6_LINEAR).fmt; +} + +enum a6xx_depth_format tu6_pipe2depth(VkFormat format); struct tu_image { @@ -1207,14 +1339,9 @@ struct tu_image VkExtent3D extent; uint32_t level_count; uint32_t layer_count; + VkSampleCountFlagBits samples; - VkDeviceSize size; - uint32_t alignment; - - /* memory layout */ - VkDeviceSize layer_size; - struct tu_image_level levels[15]; - unsigned tile_mode; + struct fdl_layout layout; unsigned queue_family_mask; bool exclusive; @@ -1224,15 +1351,10 @@ struct tu_image VkDeviceMemory owned_memory; /* Set when bound */ - const struct tu_bo *bo; + struct tu_bo *bo; VkDeviceSize bo_offset; }; -unsigned -tu_image_queue_family_mask(const struct tu_image *image, - uint32_t family, - uint32_t queue_family); - static inline uint32_t tu_get_layerCount(const struct tu_image *image, const VkImageSubresourceRange *range) @@ -1251,43 +1373,81 @@ tu_get_levelCount(const struct tu_image *image, : range->levelCount; } +enum a3xx_msaa_samples +tu_msaa_samples(uint32_t samples); +enum a6xx_tex_fetchsize +tu6_fetchsize(VkFormat format); + struct tu_image_view { struct tu_image *image; /**< VkImageViewCreateInfo::image */ - VkImageViewType type; - VkImageAspectFlags aspect_mask; - VkFormat vk_format; - uint32_t base_layer; - uint32_t layer_count; - uint32_t base_mip; - uint32_t level_count; - VkExtent3D extent; /**< Extent of VkImageViewCreateInfo::baseMipLevel. */ + uint64_t base_addr; + uint64_t ubwc_addr; + uint32_t layer_size; + uint32_t ubwc_layer_size; + + /* used to determine if fast gmem store path can be used */ + VkExtent2D extent; + bool need_y2_align; - uint32_t descriptor[16]; + bool ubwc_enabled; + + uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; /* Descriptor for use as a storage image as opposed to a sampled image. * This has a few differences for cube maps (e.g. type). 
*/ - uint32_t storage_descriptor[16]; + uint32_t storage_descriptor[A6XX_TEX_CONST_DWORDS]; + + /* pre-filled register values */ + uint32_t PITCH; + uint32_t FLAG_BUFFER_PITCH; + + uint32_t RB_MRT_BUF_INFO; + uint32_t SP_FS_MRT_REG; + + uint32_t SP_PS_2D_SRC_INFO; + uint32_t SP_PS_2D_SRC_SIZE; + + uint32_t RB_2D_DST_INFO; + + uint32_t RB_BLIT_DST_INFO; }; -struct tu_sampler -{ +struct tu_sampler_ycbcr_conversion { + VkFormat format; + VkSamplerYcbcrModelConversion ycbcr_model; + VkSamplerYcbcrRange ycbcr_range; + VkComponentMapping components; + VkChromaLocation chroma_offsets[2]; + VkFilter chroma_filter; }; -struct tu_image_create_info -{ - const VkImageCreateInfo *vk_info; - bool scanout; - bool no_metadata_planes; +struct tu_sampler { + uint32_t descriptor[A6XX_TEX_SAMP_DWORDS]; + struct tu_sampler_ycbcr_conversion *ycbcr_sampler; }; +void +tu_cs_image_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); + +void +tu_cs_image_ref_2d(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer, bool src); + +void +tu_cs_image_flag_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer); + +enum a6xx_tex_filter +tu6_tex_filter(VkFilter filter, unsigned aniso); + VkResult tu_image_create(VkDevice _device, - const struct tu_image_create_info *info, + const VkImageCreateInfo *pCreateInfo, const VkAllocationCallbacks *alloc, - VkImage *pImage); + VkImage *pImage, + uint64_t modifier, + const VkSubresourceLayout *plane_layouts); VkResult tu_image_from_gralloc(VkDevice device_h, @@ -1298,52 +1458,19 @@ tu_image_from_gralloc(VkDevice device_h, void tu_image_view_init(struct tu_image_view *view, - struct tu_device *device, const VkImageViewCreateInfo *pCreateInfo); struct tu_buffer_view { - VkFormat vk_format; - uint64_t range; /**< VkBufferViewCreateInfo::range */ - uint32_t state[4]; + uint32_t descriptor[A6XX_TEX_CONST_DWORDS]; + + struct tu_buffer *buffer; }; void tu_buffer_view_init(struct tu_buffer_view *view, struct tu_device *device, const VkBufferViewCreateInfo *pCreateInfo); -static inline struct VkExtent3D -tu_sanitize_image_extent(const VkImageType imageType, - const struct VkExtent3D imageExtent) -{ - switch (imageType) { - case VK_IMAGE_TYPE_1D: - return (VkExtent3D) { imageExtent.width, 1, 1 }; - case VK_IMAGE_TYPE_2D: - return (VkExtent3D) { imageExtent.width, imageExtent.height, 1 }; - case VK_IMAGE_TYPE_3D: - return imageExtent; - default: - unreachable("invalid image type"); - } -} - -static inline struct VkOffset3D -tu_sanitize_image_offset(const VkImageType imageType, - const struct VkOffset3D imageOffset) -{ - switch (imageType) { - case VK_IMAGE_TYPE_1D: - return (VkOffset3D) { imageOffset.x, 0, 0 }; - case VK_IMAGE_TYPE_2D: - return (VkOffset3D) { imageOffset.x, imageOffset.y, 0 }; - case VK_IMAGE_TYPE_3D: - return imageOffset; - default: - unreachable("invalid image type"); - } -} - struct tu_attachment_info { struct tu_image_view *attachment; @@ -1359,17 +1486,13 @@ struct tu_framebuffer struct tu_attachment_info attachments[0]; }; -struct tu_subpass_barrier -{ +struct tu_subpass_barrier { VkPipelineStageFlags src_stage_mask; VkAccessFlags src_access_mask; VkAccessFlags dst_access_mask; + bool incoherent_ccu_color, incoherent_ccu_depth; }; -void -tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer, - const struct tu_subpass_barrier *barrier); - struct tu_subpass_attachment { uint32_t attachment; @@ -1385,49 +1508,45 @@ struct tu_subpass struct tu_subpass_attachment *resolve_attachments; struct tu_subpass_attachment 
depth_stencil_attachment; - /** Subpass has at least one resolve attachment */ - bool has_resolve; + VkSampleCountFlagBits samples; + bool has_external_src, has_external_dst; - struct tu_subpass_barrier start_barrier; + uint32_t srgb_cntl; - uint32_t view_mask; - VkSampleCountFlagBits max_sample_count; + struct tu_subpass_barrier start_barrier; }; struct tu_render_pass_attachment { VkFormat format; uint32_t samples; - VkAttachmentLoadOp load_op; - VkAttachmentLoadOp stencil_load_op; - VkImageLayout initial_layout; - VkImageLayout final_layout; - uint32_t view_mask; + uint32_t cpp; + VkImageAspectFlags clear_mask; + bool load; + bool store; + VkImageLayout initial_layout, final_layout; + int32_t gmem_offset; }; struct tu_render_pass { uint32_t attachment_count; uint32_t subpass_count; + uint32_t gmem_pixels; + uint32_t tile_align_w; struct tu_subpass_attachment *subpass_attachments; struct tu_render_pass_attachment *attachments; struct tu_subpass_barrier end_barrier; struct tu_subpass subpasses[0]; }; -VkResult -tu_device_init_meta(struct tu_device *device); -void -tu_device_finish_meta(struct tu_device *device); - struct tu_query_pool { + VkQueryType type; uint32_t stride; - uint32_t availability_offset; uint64_t size; - char *ptr; - VkQueryType type; - uint32_t pipeline_stats_mask; + uint32_t pipeline_statistics; + struct tu_bo bo; }; struct tu_semaphore @@ -1459,20 +1578,15 @@ tu_update_descriptor_set_with_template( VkDescriptorUpdateTemplate descriptorUpdateTemplate, const void *pData); -void -tu_meta_push_descriptor_set(struct tu_cmd_buffer *cmd_buffer, - VkPipelineBindPoint pipelineBindPoint, - VkPipelineLayout _layout, - uint32_t set, - uint32_t descriptorWriteCount, - const VkWriteDescriptorSet *pDescriptorWrites); - int tu_drm_get_gpu_id(const struct tu_physical_device *dev, uint32_t *id); int tu_drm_get_gmem_size(const struct tu_physical_device *dev, uint32_t *size); +int +tu_drm_get_gmem_base(const struct tu_physical_device *dev, uint64_t *base); + int tu_drm_submitqueue_new(const struct tu_device *dev, int priority, @@ -1550,6 +1664,7 @@ TU_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, VkPipelineLayout) TU_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, VkQueryPool) TU_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, VkRenderPass) TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, VkSampler) +TU_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, VkSamplerYcbcrConversion) TU_DEFINE_NONDISP_HANDLE_CASTS(tu_shader_module, VkShaderModule) TU_DEFINE_NONDISP_HANDLE_CASTS(tu_semaphore, VkSemaphore)
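
The two-level flush tracking introduced above (struct tu_cache_state, enum tu_cmd_access_mask, enum tu_cmd_flush_bits) splits barrier handling into flushes that are merely pending and flushes that are actually due. A minimal sketch of how such a scheme can be driven is shown below; the helper name and the exact access-to-flush mapping are illustrative assumptions, not the driver's real barrier code.

/* Sketch only: uses the types and enum values declared in this header,
 * but example_flush_for_access and its mapping are hypothetical.
 */
static void
example_flush_for_access(struct tu_cache_state *cache,
                         enum tu_cmd_access_mask src,
                         enum tu_cmd_access_mask dst)
{
   /* Writes leave dirty data in a cache domain: record that it must be
    * flushed eventually, but emit nothing yet.
    */
   if (src & (TU_ACCESS_CCU_COLOR_WRITE | TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE))
      cache->pending_flush_bits |= TU_CMD_FLAG_CCU_FLUSH_COLOR;
   if (src & (TU_ACCESS_CCU_DEPTH_WRITE | TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE))
      cache->pending_flush_bits |= TU_CMD_FLAG_CCU_FLUSH_DEPTH;
   if (src & TU_ACCESS_UCHE_WRITE)
      cache->pending_flush_bits |= TU_CMD_FLAG_CACHE_FLUSH;

   /* A consumer outside the writing domain makes the pending flushes due:
    * move them into flush_bits so they are emitted before the dependent work.
    */
   if (dst & (TU_ACCESS_UCHE_READ | TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
      cache->flush_bits |= cache->pending_flush_bits;
      cache->pending_flush_bits = 0;
   }

   /* Invalidate the consumer's cache so it observes the new data, and add
    * a WFI when the CP or 2D engine reads the result directly.
    */
   if (dst & TU_ACCESS_UCHE_READ)
      cache->flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
   if (dst & TU_ACCESS_WFI_READ)
      cache->flush_bits |= TU_CMD_FLAG_WFI;
}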
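
tu_get_scratch_bo() and the per-device scratch_bos[] array added above (one slot per size class from a page up to 48 bits) suggest a lazily created buffer per power-of-two size class. The index computation below is a sketch under that assumption; the actual allocation path and locking via construct_mtx are not shown.

/* Sketch only: maps a requested size to a scratch_bos[] bucket, assuming
 * one bucket per power-of-two size starting at 1 << MIN_SCRATCH_BO_SIZE_LOG2.
 */
static unsigned
example_scratch_bucket(uint64_t size)
{
   unsigned log2_size = MIN_SCRATCH_BO_SIZE_LOG2;
   while ((1ull << log2_size) < size)
      log2_size++;
   return log2_size - MIN_SCRATCH_BO_SIZE_LOG2;
}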