#include "radeonsi/si_pipe.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
+#include "util/u_transfer.h"
#include <inttypes.h>
#include <stdio.h>
}
void *si_buffer_map_sync_with_rings(struct si_context *sctx,
- struct r600_resource *resource,
+ struct si_resource *resource,
unsigned usage)
{
enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
}
void si_init_resource_fields(struct si_screen *sscreen,
- struct r600_resource *res,
+ struct si_resource *res,
uint64_t size, unsigned alignment)
{
- struct r600_texture *rtex = (struct r600_texture*)res;
+ struct si_texture *tex = (struct si_texture*)res;
res->bo_size = size;
res->bo_alignment = alignment;
/* Older kernels didn't always flush the HDP cache before
* CS execution
*/
- if (sscreen->info.drm_major == 2 &&
- sscreen->info.drm_minor < 40) {
+ if (!sscreen->info.kernel_flushes_hdp_before_ib) {
res->domains = RADEON_DOMAIN_GTT;
res->flags |= RADEON_FLAG_GTT_WC;
break;
}
if (res->b.b.target == PIPE_BUFFER &&
- res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
- PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
+ res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
/* Use GTT for all persistent mappings with older
* kernels, because they didn't always flush the HDP
* cache before CS execution.
* Write-combined CPU mappings are fine, the kernel
* ensures all CPU writes finish before the GPU
* executes a command stream.
+ *
+ * radeon doesn't have good BO move throttling, so put all
+ * persistent buffers into GTT to prevent VRAM CPU page faults.
*/
- if (sscreen->info.drm_major == 2 &&
- sscreen->info.drm_minor < 40)
+ if (!sscreen->info.kernel_flushes_hdp_before_ib ||
+ !sscreen->info.is_amdgpu)
res->domains = RADEON_DOMAIN_GTT;
}
/* Tiled textures are unmappable. Always put them in VRAM. */
- if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
+ if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) ||
res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) {
res->domains = RADEON_DOMAIN_VRAM;
		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
			      RADEON_FLAG_GTT_WC;
}
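
/* Example (illustration only, not part of the change): the request that ends
 * up in the placement logic above is GL's ARB_buffer_storage path. A minimal
 * sketch assuming a generic Gallium screen/context pair ("screen" and "ctx"
 * are placeholders): */
struct pipe_resource templ = {0};
templ.target = PIPE_BUFFER;
templ.format = PIPE_FORMAT_R8_UNORM;
templ.usage = PIPE_USAGE_DYNAMIC;
templ.width0 = 64 * 1024;
templ.height0 = templ.depth0 = templ.array_size = 1;
templ.flags = PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
	      PIPE_RESOURCE_FLAG_MAP_COHERENT;

struct pipe_resource *pbuf = screen->resource_create(screen, &templ);
struct pipe_transfer *xfer;
void *ptr = pipe_buffer_map(ctx, pbuf,
			    PIPE_TRANSFER_WRITE |
			    PIPE_TRANSFER_PERSISTENT |
			    PIPE_TRANSFER_COHERENT, &xfer);
/* ptr stays valid across command streams; on radeon (or pre-HDP-flush
 * kernels) the buffer sits in GTT, so CPU writes don't depend on HDP
 * flushes or BO moves. */
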
bool si_alloc_resource(struct si_screen *sscreen,
- struct r600_resource *res)
+ struct si_resource *res)
{
struct pb_buffer *old_buf, *new_buf;
res->gpu_address, res->gpu_address + res->buf->size,
res->buf->size);
}
+
+ if (res->b.b.flags & SI_RESOURCE_FLAG_CLEAR)
+ si_screen_clear_buffer(sscreen, &res->b.b, 0, res->bo_size, 0);
+
return true;
}
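
/* Example (illustration only): with the new SI_RESOURCE_FLAG_CLEAR handling,
 * an internal allocation can ask for zeroed storage instead of issuing a
 * separate clear at every call site. Hypothetical call site; "sscreen" is
 * the si_screen as above: */
struct si_resource *zeroed =
	si_aligned_buffer_create(&sscreen->b, SI_RESOURCE_FLAG_CLEAR,
				 PIPE_USAGE_DEFAULT, 4096, 256);
/* si_alloc_resource() invoked si_screen_clear_buffer(..., 0) above, so the
 * caller can assume the contents are zero. */
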
static void si_buffer_destroy(struct pipe_screen *screen,
struct pipe_resource *buf)
{
- struct r600_resource *rbuffer = r600_resource(buf);
+ struct si_resource *buffer = si_resource(buf);
threaded_resource_deinit(buf);
- util_range_destroy(&rbuffer->valid_buffer_range);
- pb_reference(&rbuffer->buf, NULL);
- FREE(rbuffer);
+ util_range_destroy(&buffer->valid_buffer_range);
+ pb_reference(&buffer->buf, NULL);
+ FREE(buffer);
}
/* Reallocate the buffer and update all resource bindings where the buffer is
 * bound, including all resource descriptors. */
static bool
si_invalidate_buffer(struct si_context *sctx,
- struct r600_resource *rbuffer)
+ struct si_resource *buf)
{
/* Shared buffers can't be reallocated. */
- if (rbuffer->b.is_shared)
+ if (buf->b.is_shared)
return false;
/* Sparse buffers can't be reallocated. */
- if (rbuffer->flags & RADEON_FLAG_SPARSE)
+ if (buf->flags & RADEON_FLAG_SPARSE)
return false;
/* In AMD_pinned_memory, the user pointer association only gets
* broken when the buffer is explicitly re-allocated.
*/
- if (rbuffer->b.is_user_ptr)
+ if (buf->b.is_user_ptr)
return false;
/* Check if mapping this buffer would cause waiting for the GPU. */
- if (si_rings_is_buffer_referenced(sctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
- !sctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
- uint64_t old_va = rbuffer->gpu_address;
-
+ if (si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
/* Reallocate the buffer in the same pipe_resource. */
- si_alloc_resource(sctx->screen, rbuffer);
- si_rebind_buffer(sctx, &rbuffer->b.b, old_va);
+ si_alloc_resource(sctx->screen, buf);
+ si_rebind_buffer(sctx, &buf->b.b);
} else {
- util_range_set_empty(&rbuffer->valid_buffer_range);
+ util_range_set_empty(&buf->valid_buffer_range);
}
return true;
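
/* Example (illustration only): this is the buffer-orphaning path. A
 * glBufferData(target, size, NULL, ...) from GL reaches the driver roughly
 * as the following ("ctx" and "buf" are placeholders): */
ctx->invalidate_resource(ctx, &buf->b.b);
/* If the BO is busy, si_invalidate_buffer() gives the pipe_resource a fresh
 * BO and si_rebind_buffer() updates every binding; if it is idle, only
 * valid_buffer_range is reset and no allocation happens. */
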
static void si_replace_buffer_storage(struct pipe_context *ctx,
				      struct pipe_resource *dst,
				      struct pipe_resource *src)
{
struct si_context *sctx = (struct si_context*)ctx;
- struct r600_resource *rdst = r600_resource(dst);
- struct r600_resource *rsrc = r600_resource(src);
- uint64_t old_gpu_address = rdst->gpu_address;
-
- pb_reference(&rdst->buf, rsrc->buf);
- rdst->gpu_address = rsrc->gpu_address;
- rdst->b.b.bind = rsrc->b.b.bind;
- rdst->b.max_forced_staging_uploads = rsrc->b.max_forced_staging_uploads;
- rdst->max_forced_staging_uploads = rsrc->max_forced_staging_uploads;
- rdst->flags = rsrc->flags;
-
- assert(rdst->vram_usage == rsrc->vram_usage);
- assert(rdst->gart_usage == rsrc->gart_usage);
- assert(rdst->bo_size == rsrc->bo_size);
- assert(rdst->bo_alignment == rsrc->bo_alignment);
- assert(rdst->domains == rsrc->domains);
-
- si_rebind_buffer(sctx, dst, old_gpu_address);
+ struct si_resource *sdst = si_resource(dst);
+ struct si_resource *ssrc = si_resource(src);
+
+ pb_reference(&sdst->buf, ssrc->buf);
+ sdst->gpu_address = ssrc->gpu_address;
+ sdst->b.b.bind = ssrc->b.b.bind;
+ sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
+ sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
+ sdst->flags = ssrc->flags;
+
+ assert(sdst->vram_usage == ssrc->vram_usage);
+ assert(sdst->gart_usage == ssrc->gart_usage);
+ assert(sdst->bo_size == ssrc->bo_size);
+ assert(sdst->bo_alignment == ssrc->bo_alignment);
+ assert(sdst->domains == ssrc->domains);
+
+ si_rebind_buffer(sctx, dst);
}
static void si_invalidate_resource(struct pipe_context *ctx,
struct pipe_resource *resource)
{
struct si_context *sctx = (struct si_context*)ctx;
- struct r600_resource *rbuffer = r600_resource(resource);
+ struct si_resource *buf = si_resource(resource);
	/* We currently only do anything here for buffers */
if (resource->target == PIPE_BUFFER)
- (void)si_invalidate_buffer(sctx, rbuffer);
+ (void)si_invalidate_buffer(sctx, buf);
}
static void *si_buffer_get_transfer(struct pipe_context *ctx,
				    struct pipe_resource *resource,
				    unsigned usage,
const struct pipe_box *box,
struct pipe_transfer **ptransfer,
- void *data, struct r600_resource *staging,
+ void *data, struct si_resource *staging,
unsigned offset)
{
struct si_context *sctx = (struct si_context*)ctx;
- struct r600_transfer *transfer;
+ struct si_transfer *transfer;
if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
transfer = slab_alloc(&sctx->pool_transfers_unsync);
struct pipe_transfer **ptransfer)
{
struct si_context *sctx = (struct si_context*)ctx;
- struct r600_resource *rbuffer = r600_resource(resource);
+ struct si_resource *buf = si_resource(resource);
uint8_t *data;
assert(box->x + box->width <= resource->width0);
	/* From GL_AMD_pinned_memory issues:
	 *
	 *     4) Is glMapBuffer on a shared buffer guaranteed to return the
	 *        same system address which was specified at creation time?
	 *
	 *        RESOLVED: NO. The GL implementation might return a different
	 *        virtual mapping of that memory, although the same physical
	 *        page will be used.
	 *
	 * So don't ever use staging buffers.
	 */
- if (rbuffer->b.is_user_ptr)
+ if (buf->b.is_user_ptr)
usage |= PIPE_TRANSFER_PERSISTENT;
	/* See if the buffer range being mapped has never been initialized,
	 * in which case it can be mapped unsynchronized. */
if (!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)) &&
usage & PIPE_TRANSFER_WRITE &&
- !rbuffer->b.is_shared &&
- !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
+ !buf->b.is_shared &&
+ !util_ranges_intersect(&buf->valid_buffer_range, box->x, box->x + box->width)) {
usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
}
!(usage & PIPE_TRANSFER_PERSISTENT) &&
/* Try not to decrement the counter if it's not positive. Still racy,
* but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
- rbuffer->max_forced_staging_uploads > 0 &&
- p_atomic_dec_return(&rbuffer->max_forced_staging_uploads) >= 0) {
+ buf->max_forced_staging_uploads > 0 &&
+ p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
PIPE_TRANSFER_UNSYNCHRONIZED);
usage |= PIPE_TRANSFER_DISCARD_RANGE;
TC_TRANSFER_MAP_NO_INVALIDATE))) {
assert(usage & PIPE_TRANSFER_WRITE);
- if (si_invalidate_buffer(sctx, rbuffer)) {
+ if (si_invalidate_buffer(sctx, buf)) {
/* At this point, the buffer is always idle. */
usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
} else {
}
}
- if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+ if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
+ buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+ usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
+ PIPE_TRANSFER_PERSISTENT);
+ usage |= PIPE_TRANSFER_DISCARD_RANGE;
+ force_discard_range = true;
+ }
+
+ if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
PIPE_TRANSFER_PERSISTENT))) ||
- (rbuffer->flags & RADEON_FLAG_SPARSE))) {
+ (buf->flags & RADEON_FLAG_SPARSE))) {
assert(usage & PIPE_TRANSFER_WRITE);
/* Check if mapping this buffer would cause waiting for the GPU.
*/
- if (rbuffer->flags & RADEON_FLAG_SPARSE ||
+ if (buf->flags & RADEON_FLAG_SPARSE ||
force_discard_range ||
- si_rings_is_buffer_referenced(sctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
- !sctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
+ si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
+ !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
/* Do a wait-free write-only transfer using a temporary buffer. */
+ struct u_upload_mgr *uploader;
+ struct si_resource *staging = NULL;
unsigned offset;
- struct r600_resource *staging = NULL;
- u_upload_alloc(ctx->stream_uploader, 0,
+ /* If we are not called from the driver thread, we have
+ * to use the uploader from u_threaded_context, which is
+ * local to the calling thread.
+ */
+ if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+ uploader = sctx->tc->base.stream_uploader;
+ else
+ uploader = sctx->b.stream_uploader;
+
+ u_upload_alloc(uploader, 0,
box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
sctx->screen->info.tcc_cache_line_size,
			       &offset, (struct pipe_resource**)&staging,
			       (void**)&data);
data += box->x % SI_MAP_BUFFER_ALIGNMENT;
return si_buffer_get_transfer(ctx, resource, usage, box,
ptransfer, data, staging, offset);
- } else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
+ } else if (buf->flags & RADEON_FLAG_SPARSE) {
return NULL;
}
} else {
/* Use a staging buffer in cached GTT for reads. */
else if (((usage & PIPE_TRANSFER_READ) &&
!(usage & PIPE_TRANSFER_PERSISTENT) &&
- (rbuffer->domains & RADEON_DOMAIN_VRAM ||
- rbuffer->flags & RADEON_FLAG_GTT_WC)) ||
- (rbuffer->flags & RADEON_FLAG_SPARSE)) {
- struct r600_resource *staging;
+ (buf->domains & RADEON_DOMAIN_VRAM ||
+ buf->flags & RADEON_FLAG_GTT_WC)) ||
+ (buf->flags & RADEON_FLAG_SPARSE)) {
+ struct si_resource *staging;
assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
- staging = (struct r600_resource*) pipe_buffer_create(
+ staging = si_resource(pipe_buffer_create(
ctx->screen, 0, PIPE_USAGE_STAGING,
- box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT));
+ box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT)));
if (staging) {
/* Copy the VRAM buffer to the staging buffer. */
			sctx->dma_copy(ctx, &staging->b.b, 0,
				       box->x % SI_MAP_BUFFER_ALIGNMENT,
				       0, 0, resource, 0, box);
data = si_buffer_map_sync_with_rings(sctx, staging,
usage & ~PIPE_TRANSFER_UNSYNCHRONIZED);
if (!data) {
- r600_resource_reference(&staging, NULL);
+ si_resource_reference(&staging, NULL);
return NULL;
}
data += box->x % SI_MAP_BUFFER_ALIGNMENT;
return si_buffer_get_transfer(ctx, resource, usage, box,
ptransfer, data, staging, 0);
- } else if (rbuffer->flags & RADEON_FLAG_SPARSE) {
+ } else if (buf->flags & RADEON_FLAG_SPARSE) {
return NULL;
}
}
- data = si_buffer_map_sync_with_rings(sctx, rbuffer, usage);
+ data = si_buffer_map_sync_with_rings(sctx, buf, usage);
if (!data) {
return NULL;
}
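
/* Example (illustration only): a readback that takes the staging path above.
 * Mapping a VRAM (or GTT-WC) buffer for reading triggers the dma_copy()
 * into a cached GTT staging buffer, so the CPU never reads through an
 * uncached mapping ("ctx" and "vram_buf" are placeholders): */
struct pipe_transfer *xfer;
const uint8_t *readback = pipe_buffer_map_range(ctx, vram_buf, 0, 4096,
						PIPE_TRANSFER_READ, &xfer);
if (readback) {
	/* ... consume the data ... */
	pipe_buffer_unmap(ctx, xfer);
}
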
static void si_buffer_do_flush_region(struct pipe_context *ctx,
				      struct pipe_transfer *transfer,
				      const struct pipe_box *box)
{
-	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
-	struct r600_resource *rbuffer = r600_resource(transfer->resource);
-
-	if (rtransfer->staging) {
-		struct pipe_resource *dst, *src;
-		unsigned soffset;
-		struct pipe_box dma_box;
-
-		dst = transfer->resource;
-		src = &rtransfer->staging->b.b;
-		soffset = rtransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT;
-
-		u_box_1d(soffset, box->width, &dma_box);
+	struct si_context *sctx = (struct si_context*)ctx;
+	struct si_transfer *stransfer = (struct si_transfer*)transfer;
+	struct si_resource *buf = si_resource(transfer->resource);
+
+	if (stransfer->staging) {
+		unsigned src_offset = stransfer->offset +
+				      transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
+				      (box->x - transfer->box.x);
+
+		if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+			/* This should be true for all uploaders. */
+			assert(transfer->box.x == 0);
+
+			/* Find a previous upload and extend its range. The last
+			 * upload is likely to be at the end of the list.
+			 */
+			for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
+				struct si_sdma_upload *up = &sctx->sdma_uploads[i];
+
+				if (up->dst != buf)
+					continue;
+
+				assert(up->src == stransfer->staging);
+				assert(box->x > up->dst_offset);
+				up->size = box->x + box->width - up->dst_offset;
+				return;
+			}
+
+			/* Enlarge the array if it's full. */
+			if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
+				unsigned size;
+
+				sctx->max_sdma_uploads += 4;
+				size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
+				sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
+			}
+
+			/* Add a new upload. */
+			struct si_sdma_upload *up =
+				&sctx->sdma_uploads[sctx->num_sdma_uploads++];
+			up->dst = up->src = NULL;
+			si_resource_reference(&up->dst, buf);
+			si_resource_reference(&up->src, stransfer->staging);
+			up->dst_offset = box->x;
+			up->src_offset = src_offset;
+			up->size = box->width;
+			return;
+		}
+
 		/* Copy the staging buffer into the original one. */
-		ctx->resource_copy_region(ctx, dst, 0, box->x, 0, 0, src, 0, &dma_box);
+		si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
+			       box->x, src_offset, box->width);
 	}
- util_range_add(&rbuffer->valid_buffer_range, box->x,
+ util_range_add(&buf->valid_buffer_range, box->x,
box->x + box->width);
}
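
/* Example (illustration only): the calling pattern that drives
 * si_buffer_do_flush_region(). The caller maps once with
 * PIPE_TRANSFER_FLUSH_EXPLICIT, writes some subranges, and flushes each one;
 * with SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA set, each flush is
 * recorded in sctx->sdma_uploads instead of copied right away ("ctx", "buf"
 * and "src0" are placeholders): */
struct pipe_transfer *xfer;
uint8_t *map = pipe_buffer_map(ctx, buf,
			       PIPE_TRANSFER_WRITE |
			       PIPE_TRANSFER_FLUSH_EXPLICIT, &xfer);
memcpy(map, src0, 256);

struct pipe_box box;
u_box_1d(0, 256, &box);
ctx->transfer_flush_region(ctx, xfer, &box);
pipe_buffer_unmap(ctx, xfer);
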
static void si_buffer_transfer_unmap(struct pipe_context *ctx,
				     struct pipe_transfer *transfer)
{
struct si_context *sctx = (struct si_context*)ctx;
- struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+ struct si_transfer *stransfer = (struct si_transfer*)transfer;
if (transfer->usage & PIPE_TRANSFER_WRITE &&
!(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
si_buffer_do_flush_region(ctx, transfer, &transfer->box);
- r600_resource_reference(&rtransfer->staging, NULL);
- assert(rtransfer->b.staging == NULL); /* for threaded context only */
+ si_resource_reference(&stransfer->staging, NULL);
+ assert(stransfer->b.staging == NULL); /* for threaded context only */
pipe_resource_reference(&transfer->resource, NULL);
	/* Don't use pool_transfers_unsync. We are always in the driver
	 * thread. */
si_buffer_transfer_unmap, /* transfer_unmap */
};
-static struct r600_resource *
+static struct si_resource *
si_alloc_buffer_struct(struct pipe_screen *screen,
const struct pipe_resource *templ)
{
- struct r600_resource *rbuffer;
+ struct si_resource *buf;
- rbuffer = MALLOC_STRUCT(r600_resource);
+ buf = MALLOC_STRUCT(si_resource);
- rbuffer->b.b = *templ;
- rbuffer->b.b.next = NULL;
- pipe_reference_init(&rbuffer->b.b.reference, 1);
- rbuffer->b.b.screen = screen;
+ buf->b.b = *templ;
+ buf->b.b.next = NULL;
+ pipe_reference_init(&buf->b.b.reference, 1);
+ buf->b.b.screen = screen;
- rbuffer->b.vtbl = &si_buffer_vtbl;
- threaded_resource_init(&rbuffer->b.b);
+ buf->b.vtbl = &si_buffer_vtbl;
+ threaded_resource_init(&buf->b.b);
- rbuffer->buf = NULL;
- rbuffer->bind_history = 0;
- rbuffer->TC_L2_dirty = false;
- util_range_init(&rbuffer->valid_buffer_range);
- return rbuffer;
+ buf->buf = NULL;
+ buf->bind_history = 0;
+ buf->TC_L2_dirty = false;
+ util_range_init(&buf->valid_buffer_range);
+ return buf;
}
static struct pipe_resource *si_buffer_create(struct pipe_screen *screen,
					      const struct pipe_resource *templ,
					      unsigned alignment)
{
struct si_screen *sscreen = (struct si_screen*)screen;
- struct r600_resource *rbuffer = si_alloc_buffer_struct(screen, templ);
+ struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
- rbuffer->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE;
+ buf->b.b.flags |= SI_RESOURCE_FLAG_UNMAPPABLE;
- si_init_resource_fields(sscreen, rbuffer, templ->width0, alignment);
+ si_init_resource_fields(sscreen, buf, templ->width0, alignment);
if (templ->flags & PIPE_RESOURCE_FLAG_SPARSE)
- rbuffer->flags |= RADEON_FLAG_SPARSE;
+ buf->flags |= RADEON_FLAG_SPARSE;
- if (!si_alloc_resource(sscreen, rbuffer)) {
- FREE(rbuffer);
+ if (!si_alloc_resource(sscreen, buf)) {
+ FREE(buf);
return NULL;
}
- return &rbuffer->b.b;
+ return &buf->b.b;
}
-struct pipe_resource *si_aligned_buffer_create(struct pipe_screen *screen,
- unsigned flags,
- unsigned usage,
- unsigned size,
- unsigned alignment)
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
+ unsigned flags, unsigned usage,
+ unsigned size, unsigned alignment)
{
struct pipe_resource buffer;
return si_buffer_create(screen, &buffer, alignment);
}
+struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen,
+ unsigned flags, unsigned usage,
+ unsigned size, unsigned alignment)
+{
+ return si_resource(pipe_aligned_buffer_create(screen, flags, usage,
+ size, alignment));
+}
+
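
/* Example (illustration only): driver-internal callers now get a typed
 * pointer without casting, while pipe_aligned_buffer_create() keeps serving
 * code that wants a plain pipe_resource. Hypothetical call site: */
struct si_resource *scratch =
	si_aligned_buffer_create(screen, SI_RESOURCE_FLAG_UNMAPPABLE,
				 PIPE_USAGE_DEFAULT, 64 * 1024, 256);
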
static struct pipe_resource *
si_buffer_from_user_memory(struct pipe_screen *screen,
			   const struct pipe_resource *templ,
			   void *user_memory)
{
struct si_screen *sscreen = (struct si_screen*)screen;
struct radeon_winsys *ws = sscreen->ws;
- struct r600_resource *rbuffer = si_alloc_buffer_struct(screen, templ);
+ struct si_resource *buf = si_alloc_buffer_struct(screen, templ);
- rbuffer->domains = RADEON_DOMAIN_GTT;
- rbuffer->flags = 0;
- rbuffer->b.is_user_ptr = true;
- util_range_add(&rbuffer->valid_buffer_range, 0, templ->width0);
- util_range_add(&rbuffer->b.valid_buffer_range, 0, templ->width0);
+ buf->domains = RADEON_DOMAIN_GTT;
+ buf->flags = 0;
+ buf->b.is_user_ptr = true;
+ util_range_add(&buf->valid_buffer_range, 0, templ->width0);
+ util_range_add(&buf->b.valid_buffer_range, 0, templ->width0);
/* Convert a user pointer to a buffer. */
- rbuffer->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
- if (!rbuffer->buf) {
- FREE(rbuffer);
+ buf->buf = ws->buffer_from_ptr(ws, user_memory, templ->width0);
+ if (!buf->buf) {
+ FREE(buf);
return NULL;
}
- rbuffer->gpu_address = ws->buffer_get_virtual_address(rbuffer->buf);
- rbuffer->vram_usage = 0;
- rbuffer->gart_usage = templ->width0;
+ buf->gpu_address = ws->buffer_get_virtual_address(buf->buf);
+ buf->vram_usage = 0;
+ buf->gart_usage = templ->width0;
- return &rbuffer->b.b;
+ return &buf->b.b;
}
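
/* Example (illustration only): the GL_AMD_pinned_memory entry point. The
 * state tracker hands page-aligned client memory to
 * resource_from_user_memory(); no copy is made, which is why such buffers
 * are marked is_user_ptr and never reallocated by si_invalidate_buffer()
 * ("screen" is a placeholder; assumes a 4 KiB page size): */
void *user_mem = aligned_alloc(4096, 1024 * 1024);

struct pipe_resource templ = {0};
templ.target = PIPE_BUFFER;
templ.format = PIPE_FORMAT_R8_UNORM;
templ.usage = PIPE_USAGE_STREAM;
templ.width0 = 1024 * 1024;
templ.height0 = templ.depth0 = templ.array_size = 1;

struct pipe_resource *pinned =
	screen->resource_from_user_memory(screen, &templ, user_mem);
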
static struct pipe_resource *si_resource_create(struct pipe_screen *screen,
static bool si_resource_commit(struct pipe_context *pctx,
			       struct pipe_resource *resource,
			       unsigned start, unsigned count,
			       bool commit)
{
struct si_context *ctx = (struct si_context *)pctx;
- struct r600_resource *res = r600_resource(resource);
+ struct si_resource *res = si_resource(resource);
/*
* Since buffer commitment changes cannot be pipelined, we need to