From: Marek Olšák Date: Tue, 15 Feb 2011 04:43:44 +0000 (+0100) Subject: r300g: offload the CS ioctl to another thread X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b9e2cde6006b557a3a23a82384899f4d5a5ac7b8;p=mesa.git r300g: offload the CS ioctl to another thread This is a multi-threading optimization which hides the kernel overhead behind a thread. It improves performance in CPU-limited apps by 2-15%. Of course you must have at least 2 cores for it to make any difference. It can be disabled with: export RADEON_THREAD=0 --- diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c index bfc15ceae72..c4bb332aec3 100644 --- a/src/gallium/drivers/r300/r300_flush.c +++ b/src/gallium/drivers/r300/r300_flush.c @@ -94,6 +94,10 @@ static void r300_flush(struct pipe_context* pipe, r300->rws->cs_flush(r300->cs); } } + + if (flags & PIPE_FLUSH_FRAME) { + r300->rws->cs_sync_flush(r300->cs); + } } void r300_init_flush_functions(struct r300_context* r300) diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h index bf1dd5c980e..6733253ccc9 100644 --- a/src/gallium/drivers/r300/r300_winsys.h +++ b/src/gallium/drivers/r300/r300_winsys.h @@ -45,8 +45,8 @@ struct r300_winsys_screen; struct r300_winsys_cs_handle; /* for write_reloc etc. */ struct r300_winsys_cs { - unsigned cdw; /* Number of used dwords. */ - uint32_t buf[R300_MAX_CMDBUF_DWORDS]; /* The command buffer. */ + unsigned cdw; /* Number of used dwords. */ + uint32_t *buf; /* The command buffer. */ }; enum r300_value_id { @@ -268,6 +268,13 @@ struct r300_winsys_screen { */ void (*cs_flush)(struct r300_winsys_cs *cs); + /** + * Wait until the last flush is completed. + * + * \param cs A command stream. + */ + void (*cs_sync_flush)(struct r300_winsys_cs *cs); + /** * Set a flush callback which is called from winsys when flush is * required. 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index f3c4002883d..afb8131acbe 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -116,6 +116,10 @@ static void radeon_bo_wait(struct r300_winsys_bo *_buf) struct radeon_bo *bo = get_radeon_bo(pb_buffer(_buf)); struct drm_radeon_gem_wait_idle args = {}; + while (p_atomic_read(&bo->num_active_ioctls)) { + sched_yield(); + } + args.handle = bo->handle; while (drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_WAIT_IDLE, &args, sizeof(args)) == -EBUSY); @@ -126,6 +130,10 @@ static boolean radeon_bo_is_busy(struct r300_winsys_bo *_buf) struct radeon_bo *bo = get_radeon_bo(pb_buffer(_buf)); struct drm_radeon_gem_busy args = {}; + if (p_atomic_read(&bo->num_active_ioctls)) { + return TRUE; + } + args.handle = bo->handle; return drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_BUSY, &args, sizeof(args)) != 0; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h index d877512be58..a26866b7e75 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h @@ -59,6 +59,10 @@ struct radeon_bo { /* how many command streams is this bo referenced in? */ int num_cs_references; + /* how many command streams, which are being emitted in a separate + * thread, is this bo referenced in? 
*/ + int num_active_ioctls; + boolean flinked; uint32_t flink; }; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c index 5b2a17c856e..b4f5c9f6a88 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c @@ -73,6 +73,63 @@ #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t)) +static boolean radeon_init_cs_context(struct radeon_cs_context *csc, int fd) +{ + csc->fd = fd; + csc->nrelocs = 512; + csc->relocs_bo = (struct radeon_bo**) + CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*)); + if (!csc->relocs_bo) { + return FALSE; + } + + csc->relocs = (struct drm_radeon_cs_reloc*) + CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc)); + if (!csc->relocs) { + FREE(csc->relocs_bo); + return FALSE; + } + + csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB; + csc->chunks[0].length_dw = 0; + csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf; + csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS; + csc->chunks[1].length_dw = 0; + csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs; + + csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0]; + csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1]; + + csc->cs.num_chunks = 2; + csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array; + return TRUE; +} + +static void radeon_cs_context_cleanup(struct radeon_cs_context *csc) +{ + unsigned i; + + for (i = 0; i < csc->crelocs; i++) { + radeon_bo_unref(csc->relocs_bo[i]); + p_atomic_dec(&csc->relocs_bo[i]->num_cs_references); + csc->relocs_bo[i] = NULL; + } + + csc->crelocs = 0; + csc->chunks[0].length_dw = 0; + csc->chunks[1].length_dw = 0; + csc->used_gart = 0; + csc->used_vram = 0; + memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added)); +} + +static void radeon_destroy_cs_context(struct radeon_cs_context *csc) +{ + radeon_cs_context_cleanup(csc); + FREE(csc->relocs_bo); + FREE(csc->relocs); +} + static struct 
r300_winsys_cs *radeon_drm_cs_create(struct r300_winsys_screen *rws) { struct radeon_drm_winsys *ws = radeon_drm_winsys(rws); @@ -84,35 +141,29 @@ static struct r300_winsys_cs *radeon_drm_cs_create(struct r300_winsys_screen *rw } cs->ws = ws; - cs->nrelocs = 256; - cs->relocs_bo = (struct radeon_bo**) - CALLOC(1, cs->nrelocs * sizeof(struct radeon_bo*)); - if (!cs->relocs_bo) { + + if (!radeon_init_cs_context(&cs->csc1, cs->ws->fd)) { FREE(cs); return NULL; } - - cs->relocs = (struct drm_radeon_cs_reloc*) - CALLOC(1, cs->nrelocs * sizeof(struct drm_radeon_cs_reloc)); - if (!cs->relocs) { - FREE(cs->relocs_bo); + if (!radeon_init_cs_context(&cs->csc2, cs->ws->fd)) { + radeon_destroy_cs_context(&cs->csc1); FREE(cs); return NULL; } - cs->chunks[0].chunk_id = RADEON_CHUNK_ID_IB; - cs->chunks[0].length_dw = 0; - cs->chunks[0].chunk_data = (uint64_t)(uintptr_t)cs->base.buf; - cs->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS; - cs->chunks[1].length_dw = 0; - cs->chunks[1].chunk_data = (uint64_t)(uintptr_t)cs->relocs; + /* Set the first command buffer as current. 
*/ + cs->csc = &cs->csc1; + cs->cst = &cs->csc2; + cs->base.buf = cs->csc->buf; + p_atomic_inc(&ws->num_cs); return &cs->base; } #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value) -static inline void update_domains(struct drm_radeon_cs_reloc *reloc, +static INLINE void update_domains(struct drm_radeon_cs_reloc *reloc, enum r300_buffer_domain rd, enum r300_buffer_domain wd, enum r300_buffer_domain *added_domains) @@ -131,22 +182,22 @@ static inline void update_domains(struct drm_radeon_cs_reloc *reloc, } } -int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo) +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo) { struct drm_radeon_cs_reloc *reloc; unsigned i; - unsigned hash = bo->handle & (sizeof(cs->is_handle_added)-1); + unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1); - if (cs->is_handle_added[hash]) { - reloc = cs->relocs_hashlist[hash]; + if (csc->is_handle_added[hash]) { + reloc = csc->relocs_hashlist[hash]; if (reloc->handle == bo->handle) { - return cs->reloc_indices_hashlist[hash]; + return csc->reloc_indices_hashlist[hash]; } /* Hash collision, look for the BO in the list of relocs linearly. */ - for (i = cs->crelocs; i != 0;) { + for (i = csc->crelocs; i != 0;) { --i; - reloc = &cs->relocs[i]; + reloc = &csc->relocs[i]; if (reloc->handle == bo->handle) { /* Put this reloc in the hash list. * This will prevent additional hash collisions if there are @@ -157,8 +208,8 @@ int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo) * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC * will collide here: ^ and here: ^, * meaning that we should get very few collisions in the end. 
*/ - cs->relocs_hashlist[hash] = reloc; - cs->reloc_indices_hashlist[hash] = i; + csc->relocs_hashlist[hash] = reloc; + csc->reloc_indices_hashlist[hash] = i; /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/ return i; } @@ -168,7 +219,7 @@ int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo) return -1; } -static void radeon_add_reloc(struct radeon_drm_cs *cs, +static void radeon_add_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo, enum r300_buffer_domain rd, enum r300_buffer_domain wd, @@ -176,24 +227,24 @@ static void radeon_add_reloc(struct radeon_drm_cs *cs, { struct drm_radeon_cs_reloc *reloc; unsigned i; - unsigned hash = bo->handle & (sizeof(cs->is_handle_added)-1); + unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1); - if (cs->is_handle_added[hash]) { - reloc = cs->relocs_hashlist[hash]; + if (csc->is_handle_added[hash]) { + reloc = csc->relocs_hashlist[hash]; if (reloc->handle == bo->handle) { update_domains(reloc, rd, wd, added_domains); return; } /* Hash collision, look for the BO in the list of relocs linearly. */ - for (i = cs->crelocs; i != 0;) { + for (i = csc->crelocs; i != 0;) { --i; - reloc = &cs->relocs[i]; + reloc = &csc->relocs[i]; if (reloc->handle == bo->handle) { update_domains(reloc, rd, wd, added_domains); - cs->relocs_hashlist[hash] = reloc; - cs->reloc_indices_hashlist[hash] = i; + csc->relocs_hashlist[hash] = reloc; + csc->reloc_indices_hashlist[hash] = i; /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/ return; } @@ -201,35 +252,35 @@ static void radeon_add_reloc(struct radeon_drm_cs *cs, } /* New relocation, check if the backing array is large enough. 
*/ - if (cs->crelocs >= cs->nrelocs) { + if (csc->crelocs >= csc->nrelocs) { uint32_t size; - cs->nrelocs += 10; + csc->nrelocs += 10; - size = cs->nrelocs * sizeof(struct radeon_bo*); - cs->relocs_bo = (struct radeon_bo**)realloc(cs->relocs_bo, size); + size = csc->nrelocs * sizeof(struct radeon_bo*); + csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size); - size = cs->nrelocs * sizeof(struct drm_radeon_cs_reloc); - cs->relocs = (struct drm_radeon_cs_reloc*)realloc(cs->relocs, size); + size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc); + csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size); - cs->chunks[1].chunk_data = (uint64_t)(uintptr_t)cs->relocs; + csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs; } /* Initialize the new relocation. */ radeon_bo_ref(bo); p_atomic_inc(&bo->num_cs_references); - cs->relocs_bo[cs->crelocs] = bo; - reloc = &cs->relocs[cs->crelocs]; + csc->relocs_bo[csc->crelocs] = bo; + reloc = &csc->relocs[csc->crelocs]; reloc->handle = bo->handle; reloc->read_domains = rd; reloc->write_domain = wd; reloc->flags = 0; - cs->is_handle_added[hash] = TRUE; - cs->relocs_hashlist[hash] = reloc; - cs->reloc_indices_hashlist[hash] = cs->crelocs; + csc->is_handle_added[hash] = TRUE; + csc->relocs_hashlist[hash] = reloc; + csc->reloc_indices_hashlist[hash] = csc->crelocs; - cs->chunks[1].length_dw += RELOC_DWORDS; - cs->crelocs++; + csc->chunks[1].length_dw += RELOC_DWORDS; + csc->crelocs++; *added_domains = rd | wd; } @@ -243,23 +294,23 @@ static void radeon_drm_cs_add_reloc(struct r300_winsys_cs *rcs, struct radeon_bo *bo = (struct radeon_bo*)buf; enum r300_buffer_domain added_domains; - radeon_add_reloc(cs, bo, rd, wd, &added_domains); + radeon_add_reloc(cs->csc, bo, rd, wd, &added_domains); if (!added_domains) return; if (added_domains & R300_DOMAIN_GTT) - cs->used_gart += bo->size; + cs->csc->used_gart += bo->size; if (added_domains & R300_DOMAIN_VRAM) - cs->used_vram += bo->size; + cs->csc->used_vram 
+= bo->size; } static boolean radeon_drm_cs_validate(struct r300_winsys_cs *rcs) { struct radeon_drm_cs *cs = radeon_drm_cs(rcs); - return cs->used_gart < cs->ws->gart_size * 0.8 && - cs->used_vram < cs->ws->vram_size * 0.8; + return cs->csc->used_gart < cs->ws->gart_size * 0.8 && + cs->csc->used_vram < cs->ws->vram_size * 0.8; } static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs, @@ -268,7 +319,7 @@ static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs, struct radeon_drm_cs *cs = radeon_drm_cs(rcs); struct radeon_bo *bo = (struct radeon_bo*)buf; - unsigned index = radeon_get_reloc(cs, bo); + unsigned index = radeon_get_reloc(cs->csc, bo); if (index == -1) { fprintf(stderr, "r300: Cannot get a relocation in %s.\n", __func__); @@ -279,63 +330,89 @@ static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs, OUT_CS(&cs->base, index * RELOC_DWORDS); } -static void radeon_drm_cs_emit(struct r300_winsys_cs *rcs) +static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_async, param) { - struct radeon_drm_cs *cs = radeon_drm_cs(rcs); - uint64_t chunk_array[2]; + struct radeon_cs_context *csc = (struct radeon_cs_context*)param; unsigned i; - int r; - if (cs->base.cdw) { - /* Prepare the arguments. */ - cs->chunks[0].length_dw = cs->base.cdw; - - chunk_array[0] = (uint64_t)(uintptr_t)&cs->chunks[0]; - chunk_array[1] = (uint64_t)(uintptr_t)&cs->chunks[1]; - - cs->cs.num_chunks = 2; - cs->cs.chunks = (uint64_t)(uintptr_t)chunk_array; - - /* Emit. 
*/ - r = drmCommandWriteRead(cs->ws->fd, DRM_RADEON_CS, - &cs->cs, sizeof(struct drm_radeon_cs)); - if (r) { - if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) { - fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n"); - fprintf(stderr, "VENDORID:DEVICEID 0x%04X:0x%04X\n", 0x1002, - cs->ws->pci_id); - for (i = 0; i < cs->base.cdw; i++) { - fprintf(stderr, "0x%08X\n", cs->base.buf[i]); - } - } else { - fprintf(stderr, "radeon: The kernel rejected CS, " - "see dmesg for more information.\n"); + if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS, + &csc->cs, sizeof(struct drm_radeon_cs))) { + if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) { + unsigned i; + + fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n"); + for (i = 0; i < csc->chunks[0].length_dw; i++) { + fprintf(stderr, "0x%08X\n", csc->buf[i]); } + } else { + fprintf(stderr, "radeon: The kernel rejected CS, " + "see dmesg for more information.\n"); } } - /* Unreference buffers, cleanup. */ - for (i = 0; i < cs->crelocs; i++) { - radeon_bo_unref(cs->relocs_bo[i]); - p_atomic_dec(&cs->relocs_bo[i]->num_cs_references); - cs->relocs_bo[i] = NULL; + for (i = 0; i < csc->crelocs; i++) + p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls); + return NULL; +} + +static void radeon_drm_cs_sync_flush(struct r300_winsys_cs *rcs) +{ + struct radeon_drm_cs *cs = radeon_drm_cs(rcs); + + /* Wait for any pending ioctl to complete. */ + if (cs->thread) { + pipe_thread_wait(cs->thread); + cs->thread = 0; } +} + +DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE) + +static void radeon_drm_cs_emit(struct r300_winsys_cs *rcs) +{ + struct radeon_drm_cs *cs = radeon_drm_cs(rcs); + struct radeon_cs_context *tmp; + + radeon_drm_cs_sync_flush(rcs); + + /* If the CS is not empty, emit it in a newly-spawned thread. 
*/ + if (cs->base.cdw) { + unsigned i, crelocs = cs->csc->crelocs; + + cs->csc->chunks[0].length_dw = cs->base.cdw; + + for (i = 0; i < crelocs; i++) + p_atomic_inc(&cs->csc->relocs_bo[i]->num_active_ioctls); + + if (debug_get_option_thread()) { + cs->thread = pipe_thread_create(radeon_drm_cs_emit_async, cs->csc); + assert(cs->thread); + } else { + radeon_drm_cs_emit_async(cs->csc); + } + } + + /* Flip command streams. */ + tmp = cs->csc; + cs->csc = cs->cst; + cs->cst = tmp; + + /* Prepare a new CS. */ + radeon_cs_context_cleanup(cs->csc); + cs->base.buf = cs->csc->buf; cs->base.cdw = 0; - cs->crelocs = 0; - cs->chunks[0].length_dw = 0; - cs->chunks[1].length_dw = 0; - cs->used_gart = 0; - cs->used_vram = 0; - memset(cs->is_handle_added, 0, sizeof(cs->is_handle_added)); } static void radeon_drm_cs_destroy(struct r300_winsys_cs *rcs) { struct radeon_drm_cs *cs = radeon_drm_cs(rcs); + radeon_drm_cs_sync_flush(rcs); + radeon_cs_context_cleanup(&cs->csc1); + radeon_cs_context_cleanup(&cs->csc2); p_atomic_dec(&cs->ws->num_cs); - FREE(cs->relocs_bo); - FREE(cs->relocs); + radeon_destroy_cs_context(&cs->csc1); + radeon_destroy_cs_context(&cs->csc2); FREE(cs); } @@ -364,6 +441,7 @@ void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws) ws->base.cs_validate = radeon_drm_cs_validate; ws->base.cs_write_reloc = radeon_drm_cs_write_reloc; ws->base.cs_flush = radeon_drm_cs_emit; + ws->base.cs_sync_flush = radeon_drm_cs_sync_flush; ws->base.cs_set_flush = radeon_drm_cs_set_flush; ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced; } diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h index 0183b877a3e..486fd237fc9 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h @@ -30,34 +30,53 @@ #include "radeon_drm_bo.h" #include <radeon_drm.h> -struct radeon_drm_cs { - struct r300_winsys_cs base; - - /* The winsys. 
*/ - struct radeon_drm_winsys *ws; +struct radeon_cs_context { + uint32_t buf[R300_MAX_CMDBUF_DWORDS]; - /* Flush CS. */ - void (*flush_cs)(void *); - void *flush_data; + int fd; + struct drm_radeon_cs cs; + struct drm_radeon_cs_chunk chunks[2]; + uint64_t chunk_array[2]; /* Relocs. */ - unsigned crelocs; unsigned nrelocs; - struct drm_radeon_cs_reloc *relocs; + unsigned crelocs; struct radeon_bo **relocs_bo; - struct drm_radeon_cs cs; - struct drm_radeon_cs_chunk chunks[2]; - - unsigned used_vram; - unsigned used_gart; + struct drm_radeon_cs_reloc *relocs; /* 0 = BO not added, 1 = BO added */ char is_handle_added[256]; struct drm_radeon_cs_reloc *relocs_hashlist[256]; unsigned reloc_indices_hashlist[256]; + + unsigned used_vram; + unsigned used_gart; +}; + +struct radeon_drm_cs { + struct r300_winsys_cs base; + + /* We flip between these two CS. While one is being consumed + * by the kernel in another thread, the other one is being filled + * by the pipe driver. */ + struct radeon_cs_context csc1; + struct radeon_cs_context csc2; + /* The currently-used CS. */ + struct radeon_cs_context *csc; + /* The CS being currently-owned by the other thread. */ + struct radeon_cs_context *cst; + + /* The winsys. */ + struct radeon_drm_winsys *ws; + + /* Flush CS. */ + void (*flush_cs)(void *); + void *flush_data; + + pipe_thread thread; }; -int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo); +int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo); static INLINE struct radeon_drm_cs * radeon_drm_cs(struct r300_winsys_cs *base) @@ -69,7 +88,7 @@ static INLINE boolean radeon_bo_is_referenced_by_cs(struct radeon_drm_cs *cs, struct radeon_bo *bo) { return bo->num_cs_references == bo->rws->num_cs || - (bo->num_cs_references && radeon_get_reloc(cs, bo) != -1); + (bo->num_cs_references && radeon_get_reloc(cs->csc, bo) != -1); } static INLINE boolean radeon_bo_is_referenced_by_any_cs(struct radeon_bo *bo)