X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeon%2Fradeon_winsys.h;h=525c28182ed17e9497bc7765adf6c6799340d77d;hb=a5e7c12cedb8a91236dd3caf99133f86349702a9;hp=806ea6378c3791d29c892219ad9415441a3873ae;hpb=89ba076de4c8cfa171365700e6a3b017d5e3eeff;p=mesa.git diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 806ea6378c3..525c28182ed 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -1,6 +1,8 @@ /* * Copyright 2008 Corbin Simpson * Copyright 2010 Marek Olšák + * Copyright 2018 Advanced Micro Devices, Inc. + * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -26,11 +28,17 @@ /* The public winsys interface header for the radeon driver. */ +/* Whether the next IB can start immediately and not wait for draws and + * dispatches from the current IB to finish. */ +#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW (1u << 31) + +#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \ + (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW) + #include "pipebuffer/pb_buffer.h" -#define RADEON_FLUSH_ASYNC (1 << 0) -#define RADEON_FLUSH_KEEP_TILING_FLAGS (1 << 1) -#define RADEON_FLUSH_END_OF_FRAME (1 << 2) +#include "amd/common/ac_gpu_info.h" +#include "amd/common/ac_surface.h" /* Tiling flags. 
*/ enum radeon_bo_layout { @@ -44,251 +52,157 @@ enum radeon_bo_layout { enum radeon_bo_domain { /* bitfield */ RADEON_DOMAIN_GTT = 2, RADEON_DOMAIN_VRAM = 4, - RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT + RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT, + RADEON_DOMAIN_GDS = 8, + RADEON_DOMAIN_OA = 16, }; enum radeon_bo_flag { /* bitfield */ RADEON_FLAG_GTT_WC = (1 << 0), - RADEON_FLAG_CPU_ACCESS = (1 << 1), - RADEON_FLAG_NO_CPU_ACCESS = (1 << 2), + RADEON_FLAG_NO_CPU_ACCESS = (1 << 1), + RADEON_FLAG_NO_SUBALLOC = (1 << 2), + RADEON_FLAG_SPARSE = (1 << 3), + RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4), + RADEON_FLAG_READ_ONLY = (1 << 5), + RADEON_FLAG_32BIT = (1 << 6), +}; + +enum radeon_dependency_flag { + /* Add the dependency to the parallel compute IB only. */ + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0, + + /* Instead of waiting for a job to finish execution, the dependency will + * be signaled when the job starts execution. + */ + RADEON_DEPENDENCY_START_FENCE = 1 << 1, }; enum radeon_bo_usage { /* bitfield */ RADEON_USAGE_READ = 2, RADEON_USAGE_WRITE = 4, - RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE -}; + RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE, -enum radeon_family { - CHIP_UNKNOWN = 0, - CHIP_R300, /* R3xx-based cores. */ - CHIP_R350, - CHIP_RV350, - CHIP_RV370, - CHIP_RV380, - CHIP_RS400, - CHIP_RC410, - CHIP_RS480, - CHIP_R420, /* R4xx-based cores. */ - CHIP_R423, - CHIP_R430, - CHIP_R480, - CHIP_R481, - CHIP_RV410, - CHIP_RS600, - CHIP_RS690, - CHIP_RS740, - CHIP_RV515, /* R5xx-based cores. 
*/ - CHIP_R520, - CHIP_RV530, - CHIP_R580, - CHIP_RV560, - CHIP_RV570, - CHIP_R600, - CHIP_RV610, - CHIP_RV630, - CHIP_RV670, - CHIP_RV620, - CHIP_RV635, - CHIP_RS780, - CHIP_RS880, - CHIP_RV770, - CHIP_RV730, - CHIP_RV710, - CHIP_RV740, - CHIP_CEDAR, - CHIP_REDWOOD, - CHIP_JUNIPER, - CHIP_CYPRESS, - CHIP_HEMLOCK, - CHIP_PALM, - CHIP_SUMO, - CHIP_SUMO2, - CHIP_BARTS, - CHIP_TURKS, - CHIP_CAICOS, - CHIP_CAYMAN, - CHIP_ARUBA, - CHIP_TAHITI, - CHIP_PITCAIRN, - CHIP_VERDE, - CHIP_OLAND, - CHIP_HAINAN, - CHIP_BONAIRE, - CHIP_KAVERI, - CHIP_KABINI, - CHIP_HAWAII, - CHIP_MULLINS, - CHIP_TONGA, - CHIP_ICELAND, - CHIP_CARRIZO, - CHIP_FIJI, - CHIP_STONEY, - CHIP_POLARIS10, - CHIP_POLARIS11, - CHIP_LAST, + /* The winsys ensures that the CS submission will be scheduled after + * previously flushed CSs referencing this BO in a conflicting way. + */ + RADEON_USAGE_SYNCHRONIZED = 8 }; -enum chip_class { - CLASS_UNKNOWN = 0, - R300, - R400, - R500, - R600, - R700, - EVERGREEN, - CAYMAN, - SI, - CIK, - VI, +enum radeon_transfer_flags { + /* Indicates that the caller will unmap the buffer. + * + * Not unmapping buffers is an important performance optimization for + * OpenGL (avoids kernel overhead for frequently mapped buffers). 
+ */ + RADEON_TRANSFER_TEMPORARY = (PIPE_TRANSFER_DRV_PRV << 0), }; +#define RADEON_SPARSE_PAGE_SIZE (64 * 1024) + enum ring_type { RING_GFX = 0, RING_COMPUTE, RING_DMA, RING_UVD, RING_VCE, + RING_UVD_ENC, + RING_VCN_DEC, + RING_VCN_ENC, + RING_VCN_JPEG, RING_LAST, }; enum radeon_value_id { RADEON_REQUESTED_VRAM_MEMORY, RADEON_REQUESTED_GTT_MEMORY, + RADEON_MAPPED_VRAM, + RADEON_MAPPED_GTT, RADEON_BUFFER_WAIT_TIME_NS, + RADEON_NUM_MAPPED_BUFFERS, RADEON_TIMESTAMP, - RADEON_NUM_CS_FLUSHES, + RADEON_NUM_GFX_IBS, + RADEON_NUM_SDMA_IBS, + RADEON_GFX_BO_LIST_COUNTER, /* number of BOs submitted in gfx IBs */ + RADEON_GFX_IB_SIZE_COUNTER, RADEON_NUM_BYTES_MOVED, + RADEON_NUM_EVICTIONS, + RADEON_NUM_VRAM_CPU_PAGE_FAULTS, RADEON_VRAM_USAGE, + RADEON_VRAM_VIS_USAGE, RADEON_GTT_USAGE, RADEON_GPU_TEMPERATURE, /* DRM 2.42.0 */ RADEON_CURRENT_SCLK, RADEON_CURRENT_MCLK, - RADEON_GPU_RESET_COUNTER, /* DRM 2.43.0 */ + RADEON_CS_THREAD_TIME, }; -/* Each group of four has the same priority. */ enum radeon_bo_priority { + /* Each group of two has the same priority. */ RADEON_PRIO_FENCE = 0, RADEON_PRIO_TRACE, - RADEON_PRIO_SO_FILLED_SIZE, + + RADEON_PRIO_SO_FILLED_SIZE = 2, RADEON_PRIO_QUERY, RADEON_PRIO_IB1 = 4, /* main IB submitted to the kernel */ RADEON_PRIO_IB2, /* IB executed with INDIRECT_BUFFER */ - RADEON_PRIO_DRAW_INDIRECT, + + RADEON_PRIO_DRAW_INDIRECT = 6, RADEON_PRIO_INDEX_BUFFER, RADEON_PRIO_CP_DMA = 8, + RADEON_PRIO_BORDER_COLORS, - RADEON_PRIO_VCE = 12, - RADEON_PRIO_UVD, - RADEON_PRIO_SDMA_BUFFER, - RADEON_PRIO_SDMA_TEXTURE, - - RADEON_PRIO_USER_SHADER = 16, - RADEON_PRIO_INTERNAL_SHADER, /* fetch shader, etc. 
*/ - - /* gap: 20 */ - - RADEON_PRIO_CONST_BUFFER = 24, + RADEON_PRIO_CONST_BUFFER = 10, RADEON_PRIO_DESCRIPTORS, - RADEON_PRIO_BORDER_COLORS, - RADEON_PRIO_SAMPLER_BUFFER = 28, + RADEON_PRIO_SAMPLER_BUFFER = 12, RADEON_PRIO_VERTEX_BUFFER, - RADEON_PRIO_SHADER_RW_BUFFER = 32, - RADEON_PRIO_RINGS_STREAMOUT, - RADEON_PRIO_SCRATCH_BUFFER, + RADEON_PRIO_SHADER_RW_BUFFER = 14, RADEON_PRIO_COMPUTE_GLOBAL, - RADEON_PRIO_SAMPLER_TEXTURE = 36, + RADEON_PRIO_SAMPLER_TEXTURE = 16, RADEON_PRIO_SHADER_RW_IMAGE, - RADEON_PRIO_SAMPLER_TEXTURE_MSAA = 40, + RADEON_PRIO_SAMPLER_TEXTURE_MSAA = 18, + RADEON_PRIO_COLOR_BUFFER, + + RADEON_PRIO_DEPTH_BUFFER = 20, - RADEON_PRIO_COLOR_BUFFER = 44, + RADEON_PRIO_COLOR_BUFFER_MSAA = 22, - RADEON_PRIO_DEPTH_BUFFER = 48, + RADEON_PRIO_DEPTH_BUFFER_MSAA = 24, - RADEON_PRIO_COLOR_BUFFER_MSAA = 52, + RADEON_PRIO_SEPARATE_META = 26, + RADEON_PRIO_SHADER_BINARY, /* the hw can't hide instruction cache misses */ - RADEON_PRIO_DEPTH_BUFFER_MSAA = 56, + RADEON_PRIO_SHADER_RINGS = 28, - RADEON_PRIO_CMASK = 60, - RADEON_PRIO_DCC, - RADEON_PRIO_HTILE, - /* 63 is the maximum value */ + RADEON_PRIO_SCRATCH_BUFFER = 30, + /* 31 is the maximum value */ }; struct winsys_handle; struct radeon_winsys_ctx; -struct radeon_winsys_cs_chunk { +struct radeon_cmdbuf_chunk { unsigned cdw; /* Number of used dwords. */ unsigned max_dw; /* Maximum number of dwords. */ uint32_t *buf; /* The base pointer of the chunk. */ }; -struct radeon_winsys_cs { - struct radeon_winsys_cs_chunk current; - struct radeon_winsys_cs_chunk *prev; +struct radeon_cmdbuf { + struct radeon_cmdbuf_chunk current; + struct radeon_cmdbuf_chunk *prev; unsigned num_prev; /* Number of previous chunks. */ unsigned max_prev; /* Space in array pointed to by prev. */ unsigned prev_dw; /* Total number of dwords in previous chunks. */ -}; -struct radeon_info { - /* PCI info: domain:bus:dev:func */ - uint32_t pci_domain; - uint32_t pci_bus; - uint32_t pci_dev; - uint32_t pci_func; - - /* Device info. 
*/ - uint32_t pci_id; - enum radeon_family family; - enum chip_class chip_class; - uint32_t gart_page_size; - uint64_t gart_size; - uint64_t vram_size; - bool has_dedicated_vram; - boolean has_virtual_memory; - bool gfx_ib_pad_with_type2; - boolean has_sdma; - boolean has_uvd; - uint32_t vce_fw_version; - uint32_t vce_harvest_config; - uint32_t clock_crystal_freq; - - /* Kernel info. */ - uint32_t drm_major; /* version */ - uint32_t drm_minor; - uint32_t drm_patchlevel; - boolean has_userptr; - - /* Shader cores. */ - uint32_t r600_max_quad_pipes; /* wave size / 16 */ - uint32_t max_shader_clock; - uint32_t num_good_compute_units; - uint32_t max_se; /* shader engines */ - uint32_t max_sh_per_se; /* shader arrays per shader engine */ - - /* Render backends (color + depth blocks). */ - uint32_t r300_num_gb_pipes; - uint32_t r300_num_z_pipes; - uint32_t r600_gb_backend_map; /* R600 harvest config */ - boolean r600_gb_backend_map_valid; - uint32_t r600_num_banks; - uint32_t num_render_backends; - uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */ - uint32_t pipe_interleave_bytes; - uint32_t enabled_rb_mask; /* GCN harvest config */ - - /* Tile modes. */ - uint32_t si_tile_mode_array[32]; - uint32_t cik_macrotile_mode_array[16]; + /* Memory usage of the buffer list. These are always 0 for preamble IBs. */ + uint64_t used_vram; + uint64_t used_gart; + uint64_t gpu_address; }; /* Tiling info for display code, DRI sharing, and other data. */ @@ -296,16 +210,31 @@ struct radeon_bo_metadata { /* Tiling flags describing the texture layout for display code * and DRI sharing. 
*/ - enum radeon_bo_layout microtile; - enum radeon_bo_layout macrotile; - unsigned pipe_config; - unsigned bankw; - unsigned bankh; - unsigned tile_split; - unsigned mtilea; - unsigned num_banks; - unsigned stride; - bool scanout; + union { + struct { + enum radeon_bo_layout microtile; + enum radeon_bo_layout macrotile; + unsigned pipe_config; + unsigned bankw; + unsigned bankh; + unsigned tile_split; + unsigned mtilea; + unsigned num_banks; + unsigned stride; + bool scanout; + } legacy; + + struct { + /* surface flags */ + unsigned swizzle_mode:5; + + /* DCC flags */ + /* [31:8]: max offset = 4GB - 256; 0 = DCC disabled */ + unsigned dcc_offset_256B:24; + unsigned dcc_pitch_max:14; /* (mip chain pitch - 1) for DCN */ + unsigned dcc_independent_64B:1; + } gfx9; + } u; /* Additional metadata associated with the buffer, in bytes. * The maximum size is 64 * 4. This is opaque for the winsys & kernel. @@ -320,90 +249,10 @@ enum radeon_feature_id { RADEON_FID_R300_CMASK_ACCESS, }; -#define RADEON_SURF_MAX_LEVEL 32 - -#define RADEON_SURF_TYPE_MASK 0xFF -#define RADEON_SURF_TYPE_SHIFT 0 -#define RADEON_SURF_TYPE_1D 0 -#define RADEON_SURF_TYPE_2D 1 -#define RADEON_SURF_TYPE_3D 2 -#define RADEON_SURF_TYPE_CUBEMAP 3 -#define RADEON_SURF_TYPE_1D_ARRAY 4 -#define RADEON_SURF_TYPE_2D_ARRAY 5 -#define RADEON_SURF_MODE_MASK 0xFF -#define RADEON_SURF_MODE_SHIFT 8 -#define RADEON_SURF_MODE_LINEAR_ALIGNED 1 -#define RADEON_SURF_MODE_1D 2 -#define RADEON_SURF_MODE_2D 3 -#define RADEON_SURF_SCANOUT (1 << 16) -#define RADEON_SURF_ZBUFFER (1 << 17) -#define RADEON_SURF_SBUFFER (1 << 18) -#define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER) -#define RADEON_SURF_HAS_SBUFFER_MIPTREE (1 << 19) -#define RADEON_SURF_HAS_TILE_MODE_INDEX (1 << 20) -#define RADEON_SURF_FMASK (1 << 21) - -#define RADEON_SURF_GET(v, field) (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK) -#define RADEON_SURF_SET(v, field) (((v) & RADEON_SURF_ ## field ## _MASK) << 
RADEON_SURF_ ## field ## _SHIFT) -#define RADEON_SURF_CLR(v, field) ((v) & ~(RADEON_SURF_ ## field ## _MASK << RADEON_SURF_ ## field ## _SHIFT)) - -struct radeon_surf_level { - uint64_t offset; - uint64_t slice_size; - uint32_t npix_x; - uint32_t npix_y; - uint32_t npix_z; - uint32_t nblk_x; - uint32_t nblk_y; - uint32_t nblk_z; - uint32_t pitch_bytes; - uint32_t mode; - uint64_t dcc_offset; -}; - -struct radeon_surf { - /* These are inputs to the calculator. */ - uint32_t npix_x; - uint32_t npix_y; - uint32_t npix_z; - uint32_t blk_w; - uint32_t blk_h; - uint32_t blk_d; - uint32_t array_size; - uint32_t last_level; - uint32_t bpe; - uint32_t nsamples; - uint32_t flags; - - /* These are return values. Some of them can be set by the caller, but - * they will be treated as hints (e.g. bankw, bankh) and might be - * changed by the calculator. - */ - uint64_t bo_size; - uint64_t bo_alignment; - /* This applies to EG and later. */ - uint32_t bankw; - uint32_t bankh; - uint32_t mtilea; - uint32_t tile_split; - uint32_t stencil_tile_split; - uint64_t stencil_offset; - struct radeon_surf_level level[RADEON_SURF_MAX_LEVEL]; - struct radeon_surf_level stencil_level[RADEON_SURF_MAX_LEVEL]; - uint32_t tiling_index[RADEON_SURF_MAX_LEVEL]; - uint32_t stencil_tiling_index[RADEON_SURF_MAX_LEVEL]; - uint32_t pipe_config; - uint32_t num_banks; - uint32_t macro_tile_index; - - uint64_t dcc_size; - uint64_t dcc_alignment; -}; - struct radeon_bo_list_item { - struct pb_buffer *buf; + uint64_t bo_size; uint64_t vm_address; - uint64_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */ + uint32_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */ }; struct radeon_winsys { @@ -436,6 +285,14 @@ struct radeon_winsys { void (*query_info)(struct radeon_winsys *ws, struct radeon_info *info); + /** + * A hint for the winsys that it should pin its execution threads to + * a group of cores sharing a specific L3 cache if the CPU has multiple + * L3 caches. 
This is needed for good multithreading performance on + * AMD Zen CPUs. + */ + void (*pin_threads_to_L3_cache)(struct radeon_winsys *ws, unsigned cache); + /************************************************************************** * Buffer management. Buffer attributes are mostly fixed over its lifetime. * @@ -464,13 +321,16 @@ struct radeon_winsys { * Map the entire data store of a buffer object into the client's address * space. * + * Callers are expected to unmap buffers again if and only if the + * RADEON_TRANSFER_TEMPORARY flag is set in \p usage. + * * \param buf A winsys buffer object to map. * \param cs A command stream to flush if the buffer is referenced by it. - * \param usage A bitmask of the PIPE_TRANSFER_* flags. + * \param usage A bitmask of the PIPE_TRANSFER_* and RADEON_TRANSFER_* flags. * \return The pointer at the beginning of the buffer. */ void *(*buffer_map)(struct pb_buffer *buf, - struct radeon_winsys_cs *cs, + struct radeon_cmdbuf *cs, enum pipe_transfer_usage usage); /** @@ -522,6 +382,7 @@ struct radeon_winsys { */ struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws, struct winsys_handle *whandle, + unsigned vm_alignment, unsigned *stride, unsigned *offset); /** @@ -543,28 +404,62 @@ struct radeon_winsys { */ bool (*buffer_is_user_ptr)(struct pb_buffer *buf); + /** Whether the buffer was suballocated. */ + bool (*buffer_is_suballocated)(struct pb_buffer *buf); + /** * Get a winsys handle from a winsys buffer. The internal structure * of the handle is platform-specific and only a winsys should access it. * + * \param ws The winsys instance for which the handle is to be valid * \param buf A winsys buffer object to get the handle from. * \param whandle A winsys handle pointer. * \param stride A stride of the buffer in bytes, for texturing. - * \return TRUE on success. + * \return true on success. 
+ */ + bool (*buffer_get_handle)(struct radeon_winsys *ws, + struct pb_buffer *buf, + unsigned stride, unsigned offset, + unsigned slice_size, + struct winsys_handle *whandle); + + /** + * Change the commitment of a (64KB-page aligned) region of the given + * sparse buffer. + * + * \warning There is no automatic synchronization with command submission. + * + * \note Only implemented by the amdgpu winsys. + * + * \return false on out of memory or other failure, true on success. */ - boolean (*buffer_get_handle)(struct pb_buffer *buf, - unsigned stride, unsigned offset, - unsigned slice_size, - struct winsys_handle *whandle); + bool (*buffer_commit)(struct pb_buffer *buf, + uint64_t offset, uint64_t size, + bool commit); /** * Return the virtual address of a buffer. * + * When virtual memory is not in use, this is the offset relative to the + * relocation base (non-zero for sub-allocated buffers). + * * \param buf A winsys buffer object * \return virtual address */ uint64_t (*buffer_get_virtual_address)(struct pb_buffer *buf); + /** + * Return the offset of this buffer relative to the relocation base. + * This is only non-zero for sub-allocated buffers. + * + * This is only supported in the radeon winsys, since amdgpu uses virtual + * addresses in submissions even for the video engines. + * + * \param buf A winsys buffer object + * \return the offset for relocations + */ + unsigned (*buffer_get_reloc_offset)(struct pb_buffer *buf); + /** * Query the initial placement of the buffer from the kernel driver. */ @@ -601,48 +496,36 @@ struct radeon_winsys { * \param flush Flush callback function associated with the command stream. * \param user User pointer that will be passed to the flush callback. 
*/ - struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx, - enum ring_type ring_type, - void (*flush)(void *ctx, unsigned flags, - struct pipe_fence_handle **fence), - void *flush_ctx); + struct radeon_cmdbuf *(*cs_create)(struct radeon_winsys_ctx *ctx, + enum ring_type ring_type, + void (*flush)(void *ctx, unsigned flags, + struct pipe_fence_handle **fence), + void *flush_ctx, + bool stop_exec_on_failure); /** - * Add a constant engine IB to a graphics CS. This makes the graphics CS - * from "cs_create" a group of two IBs that share a buffer list and are - * flushed together. + * Add a parallel compute IB to a gfx IB. It will share the buffer list + * and fence dependencies with the gfx IB. The gfx flush call will submit + * both IBs at the same time. * - * The returned constant CS is only a stream for writing packets to the new - * IB. Calling other winsys functions with it is not allowed, not even - * "cs_destroy". + * The compute IB doesn't have an output fence, so the primary IB has + * to use a wait packet for synchronization. * - * In order to add buffers and check memory usage, use the graphics CS. - * In order to flush it, use the graphics CS, which will flush both IBs. - * Destroying the graphics CS will destroy both of them. + * The returned IB is only a stream for writing packets to the new + * IB. Calling other winsys functions with it is not allowed, not even + * "cs_destroy". Use the gfx IB instead. * - * \param cs The graphics CS from "cs_create" that will hold the buffer - * list and will be used for flushing. + * \param cs Gfx IB */ - struct radeon_winsys_cs *(*cs_add_const_ib)(struct radeon_winsys_cs *cs); + struct radeon_cmdbuf *(*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *cs, + bool uses_gds_ordered_append); - /** - * Add a constant engine preamble IB to a graphics CS. This add an extra IB - * in similar manner to cs_add_const_ib. This should always be called after - * cs_add_const_ib. 
- * - * The returned IB is a constant engine IB that only gets flushed if the - * context changed. - * - * \param cs The graphics CS from "cs_create" that will hold the buffer - * list and will be used for flushing. - */ - struct radeon_winsys_cs *(*cs_add_const_preamble_ib)(struct radeon_winsys_cs *cs); /** * Destroy a command stream. * * \param cs A command stream to destroy. */ - void (*cs_destroy)(struct radeon_winsys_cs *cs); + void (*cs_destroy)(struct radeon_cmdbuf *cs); /** * Add a buffer. Each buffer used by a CS must be added using this function. @@ -655,7 +538,7 @@ struct radeon_winsys { * placed in the requested domain. 15 is the maximum. * \return Buffer index. */ - unsigned (*cs_add_buffer)(struct radeon_winsys_cs *cs, + unsigned (*cs_add_buffer)(struct radeon_cmdbuf *cs, struct pb_buffer *buf, enum radeon_bo_usage usage, enum radeon_bo_domain domain, @@ -664,22 +547,25 @@ struct radeon_winsys { /** * Return the index of an already-added buffer. * + * Not supported on amdgpu. Drivers with GPUVM should not care about + * buffer indices. + * * \param cs Command stream * \param buf Buffer * \return The buffer index, or -1 if the buffer has not been added. */ - int (*cs_lookup_buffer)(struct radeon_winsys_cs *cs, + int (*cs_lookup_buffer)(struct radeon_cmdbuf *cs, struct pb_buffer *buf); /** - * Return TRUE if there is enough memory in VRAM and GTT for the buffers + * Return true if there is enough memory in VRAM and GTT for the buffers * added so far. If the validation fails, all buffers which have * been added since the last call of cs_validate will be removed and * the CS will be flushed (provided there are still any buffers). * * \param cs A command stream to validate. */ - boolean (*cs_validate)(struct radeon_winsys_cs *cs); + bool (*cs_validate)(struct radeon_cmdbuf *cs); /** * Check whether the given number of dwords is available in the IB. @@ -687,52 +573,58 @@ struct radeon_winsys { * * \param cs A command stream. 
* \param dw Number of CS dwords requested by the caller. + * \param force_chaining Chain the IB into a new buffer now to discard + * the CP prefetch cache (to emulate PKT3_REWIND) + * \return true if there is enough space */ - bool (*cs_check_space)(struct radeon_winsys_cs *cs, unsigned dw); - - /** - * Return TRUE if there is enough memory in VRAM and GTT for the buffers - * added so far. - * - * \param cs A command stream to validate. - * \param vram VRAM memory size pending to be use - * \param gtt GTT memory size pending to be use - */ - boolean (*cs_memory_below_limit)(struct radeon_winsys_cs *cs, uint64_t vram, uint64_t gtt); - - uint64_t (*cs_query_memory_usage)(struct radeon_winsys_cs *cs); + bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw, + bool force_chaining); /** * Return the buffer list. * + * This is the buffer list as passed to the kernel, i.e. it only contains + * the parent buffers of sub-allocated buffers. + * * \param cs Command stream * \param list Returned buffer list. Set to NULL to query the count only. * \return The buffer count. */ - unsigned (*cs_get_buffer_list)(struct radeon_winsys_cs *cs, + unsigned (*cs_get_buffer_list)(struct radeon_cmdbuf *cs, struct radeon_bo_list_item *list); /** * Flush a command stream. * * \param cs A command stream to flush. - * \param flags, RADEON_FLUSH_ASYNC or 0. + * \param flags, PIPE_FLUSH_* flags. * \param fence Pointer to a fence. If non-NULL, a fence is inserted * after the CS and is returned through this parameter. + * \return Negative POSIX error code or 0 for success. + * Asynchronous submissions never return an error. + */ + int (*cs_flush)(struct radeon_cmdbuf *cs, + unsigned flags, + struct pipe_fence_handle **fence); + + /** + * Create a fence before the CS is flushed. + * The user must flush manually to complete the initialization of the fence. + * + * The fence must not be used for anything except \ref cs_add_fence_dependency + * before the flush. 
*/ - void (*cs_flush)(struct radeon_winsys_cs *cs, - unsigned flags, - struct pipe_fence_handle **fence); + struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_cmdbuf *cs); /** - * Return TRUE if a buffer is referenced by a command stream. + * Return true if a buffer is referenced by a command stream. * * \param cs A command stream. * \param buf A winsys buffer. */ - boolean (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs, - struct pb_buffer *buf, - enum radeon_bo_usage usage); + bool (*cs_is_buffer_referenced)(struct radeon_cmdbuf *cs, + struct pb_buffer *buf, + enum radeon_bo_usage usage); /** * Request access to a feature for a command stream. @@ -741,15 +633,31 @@ struct radeon_winsys { * \param fid Feature ID, one of RADEON_FID_* * \param enable Whether to enable or disable the feature. */ - boolean (*cs_request_feature)(struct radeon_winsys_cs *cs, - enum radeon_feature_id fid, - boolean enable); + bool (*cs_request_feature)(struct radeon_cmdbuf *cs, + enum radeon_feature_id fid, + bool enable); /** * Make sure all asynchronous flush of the cs have completed * * \param cs A command stream. */ - void (*cs_sync_flush)(struct radeon_winsys_cs *cs); + void (*cs_sync_flush)(struct radeon_cmdbuf *cs); + + /** + * Add a fence dependency to the CS, so that the CS will wait for + * the fence before execution. + * + * \param dependency_flags Bitmask of RADEON_DEPENDENCY_* + */ + void (*cs_add_fence_dependency)(struct radeon_cmdbuf *cs, + struct pipe_fence_handle *fence, + unsigned dependency_flags); + + /** + * Signal a syncobj when the CS finishes execution. + */ + void (*cs_add_syncobj_signal)(struct radeon_cmdbuf *cs, + struct pipe_fence_handle *fence); /** * Wait for the fence and return true if the fence has been signalled. @@ -768,21 +676,42 @@ struct radeon_winsys { struct pipe_fence_handle *src); /** - * Initialize surface - * - * \param ws The winsys this function is called from. 
- * \param surf Surface structure ptr + * Create a new fence object corresponding to the given syncobj fd. */ - int (*surface_init)(struct radeon_winsys *ws, - struct radeon_surf *surf); + struct pipe_fence_handle *(*fence_import_syncobj)(struct radeon_winsys *ws, + int fd); /** - * Find best values for a surface + * Create a new fence object corresponding to the given sync_file. + */ + struct pipe_fence_handle *(*fence_import_sync_file)(struct radeon_winsys *ws, + int fd); + + /** + * Return a sync_file FD corresponding to the given fence object. + */ + int (*fence_export_sync_file)(struct radeon_winsys *ws, + struct pipe_fence_handle *fence); + + /** + * Return a sync file FD that is already signalled. + */ + int (*export_signalled_sync_file)(struct radeon_winsys *ws); + + /** + * Initialize surface * * \param ws The winsys this function is called from. - * \param surf Surface structure ptr + * \param tex Input texture description + * \param flags Bitmask of RADEON_SURF_* flags + * \param bpe Bytes per pixel, it can be different for Z buffers. + * \param mode Preferred tile mode. 
(linear, 1D, or 2D) + * \param surf Output structure */ - int (*surface_best)(struct radeon_winsys *ws, + int (*surface_init)(struct radeon_winsys *ws, + const struct pipe_resource *tex, + unsigned flags, unsigned bpe, + enum radeon_surf_mode mode, struct radeon_surf *surf); uint64_t (*query_value)(struct radeon_winsys *ws, @@ -792,21 +721,165 @@ struct radeon_winsys { unsigned num_registers, uint32_t *out); }; -static inline bool radeon_emitted(struct radeon_winsys_cs *cs, unsigned num_dw) +static inline bool radeon_emitted(struct radeon_cmdbuf *cs, unsigned num_dw) { return cs && (cs->prev_dw + cs->current.cdw > num_dw); } -static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value) +static inline void radeon_emit(struct radeon_cmdbuf *cs, uint32_t value) { cs->current.buf[cs->current.cdw++] = value; } -static inline void radeon_emit_array(struct radeon_winsys_cs *cs, +static inline void radeon_emit_array(struct radeon_cmdbuf *cs, const uint32_t *values, unsigned count) { memcpy(cs->current.buf + cs->current.cdw, values, count * 4); cs->current.cdw += count; } +enum radeon_heap { + RADEON_HEAP_VRAM_NO_CPU_ACCESS, + RADEON_HEAP_VRAM_READ_ONLY, + RADEON_HEAP_VRAM_READ_ONLY_32BIT, + RADEON_HEAP_VRAM_32BIT, + RADEON_HEAP_VRAM, + RADEON_HEAP_GTT_WC, + RADEON_HEAP_GTT_WC_READ_ONLY, + RADEON_HEAP_GTT_WC_READ_ONLY_32BIT, + RADEON_HEAP_GTT_WC_32BIT, + RADEON_HEAP_GTT, + RADEON_MAX_SLAB_HEAPS, + RADEON_MAX_CACHED_HEAPS = RADEON_MAX_SLAB_HEAPS, +}; + +static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap heap) +{ + switch (heap) { + case RADEON_HEAP_VRAM_NO_CPU_ACCESS: + case RADEON_HEAP_VRAM_READ_ONLY: + case RADEON_HEAP_VRAM_READ_ONLY_32BIT: + case RADEON_HEAP_VRAM_32BIT: + case RADEON_HEAP_VRAM: + return RADEON_DOMAIN_VRAM; + case RADEON_HEAP_GTT_WC: + case RADEON_HEAP_GTT_WC_READ_ONLY: + case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT: + case RADEON_HEAP_GTT_WC_32BIT: + case RADEON_HEAP_GTT: + return RADEON_DOMAIN_GTT; + default: + 
assert(0); + return (enum radeon_bo_domain)0; + } +} + +static inline unsigned radeon_flags_from_heap(enum radeon_heap heap) +{ + unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING | + (heap != RADEON_HEAP_GTT ? RADEON_FLAG_GTT_WC : 0); + + switch (heap) { + case RADEON_HEAP_VRAM_NO_CPU_ACCESS: + return flags | + RADEON_FLAG_NO_CPU_ACCESS; + + case RADEON_HEAP_VRAM_READ_ONLY: + case RADEON_HEAP_GTT_WC_READ_ONLY: + return flags | + RADEON_FLAG_READ_ONLY; + + case RADEON_HEAP_VRAM_READ_ONLY_32BIT: + case RADEON_HEAP_GTT_WC_READ_ONLY_32BIT: + return flags | + RADEON_FLAG_READ_ONLY | + RADEON_FLAG_32BIT; + + case RADEON_HEAP_VRAM_32BIT: + case RADEON_HEAP_GTT_WC_32BIT: + return flags | + RADEON_FLAG_32BIT; + + case RADEON_HEAP_VRAM: + case RADEON_HEAP_GTT_WC: + case RADEON_HEAP_GTT: + default: + return flags; + } +} + +/* Return the heap index for winsys allocators, or -1 on failure. */ +static inline int radeon_get_heap_index(enum radeon_bo_domain domain, + enum radeon_bo_flag flags) +{ + /* VRAM implies WC (write combining) */ + assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC); + /* NO_CPU_ACCESS implies VRAM only. */ + assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM); + + /* Resources with interprocess sharing don't use any winsys allocators. */ + if (!(flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)) + return -1; + + /* Unsupported flags: NO_SUBALLOC, SPARSE. 
*/ + if (flags & ~(RADEON_FLAG_GTT_WC | + RADEON_FLAG_NO_CPU_ACCESS | + RADEON_FLAG_NO_INTERPROCESS_SHARING | + RADEON_FLAG_READ_ONLY | + RADEON_FLAG_32BIT)) + return -1; + + switch (domain) { + case RADEON_DOMAIN_VRAM: + switch (flags & (RADEON_FLAG_NO_CPU_ACCESS | + RADEON_FLAG_READ_ONLY | + RADEON_FLAG_32BIT)) { + case RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT: + case RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_READ_ONLY: + assert(!"NO_CPU_ACCESS | READ_ONLY doesn't make sense"); + return -1; + case RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_32BIT: + assert(!"NO_CPU_ACCESS with 32BIT is disallowed"); + return -1; + case RADEON_FLAG_NO_CPU_ACCESS: + return RADEON_HEAP_VRAM_NO_CPU_ACCESS; + case RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT: + return RADEON_HEAP_VRAM_READ_ONLY_32BIT; + case RADEON_FLAG_READ_ONLY: + return RADEON_HEAP_VRAM_READ_ONLY; + case RADEON_FLAG_32BIT: + return RADEON_HEAP_VRAM_32BIT; + case 0: + return RADEON_HEAP_VRAM; + } + break; + case RADEON_DOMAIN_GTT: + switch (flags & (RADEON_FLAG_GTT_WC | + RADEON_FLAG_READ_ONLY | + RADEON_FLAG_32BIT)) { + case RADEON_FLAG_GTT_WC | RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT: + return RADEON_HEAP_GTT_WC_READ_ONLY_32BIT; + case RADEON_FLAG_GTT_WC | RADEON_FLAG_READ_ONLY: + return RADEON_HEAP_GTT_WC_READ_ONLY; + case RADEON_FLAG_GTT_WC | RADEON_FLAG_32BIT: + return RADEON_HEAP_GTT_WC_32BIT; + case RADEON_FLAG_GTT_WC: + return RADEON_HEAP_GTT_WC; + case RADEON_FLAG_READ_ONLY | RADEON_FLAG_32BIT: + case RADEON_FLAG_READ_ONLY: + assert(!"READ_ONLY without WC is disallowed"); + return -1; + case RADEON_FLAG_32BIT: + assert(!"32BIT without WC is disallowed"); + return -1; + case 0: + return RADEON_HEAP_GTT; + } + break; + default: + break; + } + return -1; +} + #endif