From: Eric Anholt Date: Fri, 4 Oct 2013 03:03:41 +0000 (-0700) Subject: i965: Try to avoid stalls on the GPU when doing glBufferSubData(). X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=c298f5ff5681dd7c3cf0bf7c37a6f22430deeb91;p=mesa.git i965: Try to avoid stalls on the GPU when doing glBufferSubData(). On DOTA2, framerate on dota2-de1.dem in windowed mode on my laptop improves by 7.69854% +/- 0.909163% (n=3). In a microbenchmark hitting this code path (wall time of piglit vbo-subdata-many), runtime decreases from 0.8 to 0.05 seconds. v2: Use out of range start/end instead of separate bool for the active flag (suggestion by Jordan), fix double-upload in the stalling path. Reviewed-by: Jordan Justen --- diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c index e85ad69719a..4da1b7eb410 100644 --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c @@ -472,12 +472,30 @@ static void brw_prepare_vertices(struct brw_context *brw) struct brw_vertex_buffer *buffer = &brw->vb.buffers[j]; /* Named buffer object: Just reference its contents directly. 
*/ - buffer->bo = intel_bufferobj_buffer(brw, intel_buffer, INTEL_READ); - drm_intel_bo_reference(buffer->bo); buffer->offset = (uintptr_t)glarray->Ptr; buffer->stride = glarray->StrideB; buffer->step_rate = glarray->InstanceDivisor; + uint32_t offset, size; + if (glarray->InstanceDivisor) { + offset = buffer->offset; + size = (buffer->stride * ((brw->num_instances / + glarray->InstanceDivisor) - 1) + + glarray->_ElementSize); + } else { + if (min_index == -1) { + offset = 0; + size = intel_buffer->Base.Size; + } else { + offset = buffer->offset + min_index * buffer->stride; + size = (buffer->stride * (max_index - min_index) + + glarray->_ElementSize); + } + } + buffer->bo = intel_bufferobj_buffer(brw, intel_buffer, + offset, size); + drm_intel_bo_reference(buffer->bo); + input->buffer = j++; input->offset = 0; } @@ -850,7 +868,7 @@ static void brw_upload_indices(struct brw_context *brw) brw->ib.start_vertex_offset = offset / ib_type_size; bo = intel_bufferobj_buffer(brw, intel_buffer_object(bufferobj), - INTEL_READ); + offset, ib_size); drm_intel_bo_reference(bo); } } diff --git a/src/mesa/drivers/dri/i965/brw_object_purgeable.c b/src/mesa/drivers/dri/i965/brw_object_purgeable.c index 46304167b22..2567de75900 100644 --- a/src/mesa/drivers/dri/i965/brw_object_purgeable.c +++ b/src/mesa/drivers/dri/i965/brw_object_purgeable.c @@ -62,10 +62,7 @@ intel_buffer_object_purgeable(struct gl_context * ctx, return GL_RELEASED_APPLE; } else { /* XXX Create the buffer and madvise(MADV_DONTNEED)? 
*/ - struct brw_context *brw = brw_context(ctx); - drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_obj, INTEL_READ); - - return intel_buffer_purgeable(bo); + return intel_buffer_purgeable(intel_obj->buffer); } } diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index a78ebf00405..744821b846a 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -239,8 +239,8 @@ brw_update_buffer_texture_surface(struct gl_context *ctx, int texel_size = _mesa_get_format_bytes(format); if (intel_obj) { - bo = intel_obj->buffer; size = MIN2(size, intel_obj->Base.Size); + bo = intel_bufferobj_buffer(brw, intel_obj, tObj->BufferOffset, size); } if (brw_format == 0 && format != MESA_FORMAT_RGBA_FLOAT32) { @@ -345,11 +345,13 @@ brw_update_sol_surface(struct brw_context *brw, unsigned stride_dwords, unsigned offset_dwords) { struct intel_buffer_object *intel_bo = intel_buffer_object(buffer_obj); - drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_bo, INTEL_WRITE_PART); + uint32_t offset_bytes = 4 * offset_dwords; + drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_bo, + offset_bytes, + buffer_obj->Size - offset_bytes); uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE, 6 * 4, 32, out_offset); uint32_t pitch_minus_1 = 4*stride_dwords - 1; - uint32_t offset_bytes = 4 * offset_dwords; size_t size_dwords = buffer_obj->Size / 4; uint32_t buffer_size_minus_1, width, height, depth, surface_format; @@ -824,7 +826,10 @@ brw_upload_ubo_surfaces(struct brw_context *brw, binding = &ctx->UniformBufferBindings[shader->UniformBlocks[i].Binding]; intel_bo = intel_buffer_object(binding->BufferObject); - drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_bo, INTEL_READ); + drm_intel_bo *bo = + intel_bufferobj_buffer(brw, intel_bo, + binding->Offset, + binding->BufferObject->Size - binding->Offset); /* Because behavior for referencing outside of the 
binding's size in the * glBindBufferRange case is undefined, we can just bind the whole buffer diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c index fc69bfc261e..ec46b56e78d 100644 --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c @@ -73,12 +73,12 @@ upload_3dstate_so_buffers(struct brw_context *brw) continue; } - bo = intel_bufferobj_buffer(brw, bufferobj, INTEL_WRITE_PART); stride = linked_xfb_info->BufferStride[i] * 4; start = xfb_obj->Offset[i]; assert(start % 4 == 0); end = ALIGN(start + xfb_obj->Size[i], 4); + bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start); assert(end <= bo->size); BEGIN_BATCH(4); diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c index 9880c14dd1b..1f0c75fc2ef 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c @@ -278,8 +278,8 @@ gen7_update_buffer_texture_surface(struct gl_context *ctx, drm_intel_bo *bo = NULL; if (intel_obj) { - bo = intel_obj->buffer; size = MIN2(size, intel_obj->Base.Size); + bo = intel_bufferobj_buffer(brw, intel_obj, tObj->BufferOffset, size); } gl_format format = tObj->_BufferObjectFormat; diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c index 8a2472766db..779198f9e51 100644 --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c @@ -44,6 +44,21 @@ static GLboolean intel_bufferobj_unmap(struct gl_context * ctx, struct gl_buffer_object *obj); +static void +intel_bufferobj_mark_gpu_usage(struct intel_buffer_object *intel_obj, + uint32_t offset, uint32_t size) +{ + intel_obj->gpu_active_start = MIN2(intel_obj->gpu_active_start, offset); + intel_obj->gpu_active_end = MAX2(intel_obj->gpu_active_end, offset + size); +} + +static void 
+intel_bufferobj_mark_inactive(struct intel_buffer_object *intel_obj) +{ + intel_obj->gpu_active_start = ~0; + intel_obj->gpu_active_end = 0; +} + /** Allocates a new drm_intel_bo to store the data for the buffer object. */ static void intel_bufferobj_alloc_buffer(struct brw_context *brw, @@ -55,6 +70,8 @@ intel_bufferobj_alloc_buffer(struct brw_context *brw, /* the buffer might be bound as a uniform buffer, need to update it */ brw->state.dirty.brw |= BRW_NEW_UNIFORM_BUFFER; + + intel_bufferobj_mark_inactive(intel_obj); } static void @@ -179,20 +196,44 @@ intel_bufferobj_subdata(struct gl_context * ctx, assert(intel_obj); + /* See if we can unsynchronized write the data into the user's BO. This + * avoids GPU stalls in unfortunately common user patterns (uploading + * sequentially into a BO, with draw calls in between each upload). + * + * Once we've hit this path, we mark this GL BO as preferring stalling to + * blits, so that we can hopefully hit this path again in the future + * (otherwise, an app that might occasionally stall but mostly not will end + * up with blitting all the time, at the cost of bandwidth) + */ + if (brw->has_llc) { + if (offset + size <= intel_obj->gpu_active_start || + intel_obj->gpu_active_end <= offset) { + drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer); + memcpy(intel_obj->buffer->virtual + offset, data, size); + drm_intel_bo_unmap(intel_obj->buffer); + + if (intel_obj->gpu_active_end > intel_obj->gpu_active_start) + intel_obj->prefer_stall_to_blit = true; + return; + } + } + busy = drm_intel_bo_busy(intel_obj->buffer) || drm_intel_bo_references(brw->batch.bo, intel_obj->buffer); if (busy) { if (size == intel_obj->Base.Size) { - /* Replace the current busy bo with fresh data. */ + /* Replace the current busy bo so the subdata doesn't stall. 
*/ drm_intel_bo_unreference(intel_obj->buffer); intel_bufferobj_alloc_buffer(brw, intel_obj); - drm_intel_bo_subdata(intel_obj->buffer, 0, size, data); - } else { - perf_debug("Using a blit copy to avoid stalling on %ldb " - "glBufferSubData() to a busy buffer object.\n", - (long)size); + } else if (!intel_obj->prefer_stall_to_blit) { + perf_debug("Using a blit copy to avoid stalling on " + "glBufferSubData(%ld, %ld) (%ldkb) to a busy " + "(%d-%d) buffer object.\n", + (long)offset, (long)offset + size, (long)(size/1024), + intel_obj->gpu_active_start, + intel_obj->gpu_active_end); drm_intel_bo *temp_bo = drm_intel_bo_alloc(brw->bufmgr, "subdata temp", size, 64); @@ -204,10 +245,20 @@ intel_bufferobj_subdata(struct gl_context * ctx, size); drm_intel_bo_unreference(temp_bo); + return; + } else { + perf_debug("Stalling on glBufferSubData(%ld, %ld) (%ldkb) to a busy " + "(%d-%d) buffer object. Use glMapBufferRange() to " + "avoid this.\n", + (long)offset, (long)offset + size, (long)(size/1024), + intel_obj->gpu_active_start, + intel_obj->gpu_active_end); + intel_batchbuffer_flush(brw); } - } else { - drm_intel_bo_subdata(intel_obj->buffer, offset, size, data); } + + drm_intel_bo_subdata(intel_obj->buffer, offset, size, data); + intel_bufferobj_mark_inactive(intel_obj); } @@ -231,6 +282,8 @@ intel_bufferobj_get_subdata(struct gl_context * ctx, intel_batchbuffer_flush(brw); } drm_intel_bo_get_subdata(intel_obj->buffer, offset, size, data); + + intel_bufferobj_mark_inactive(intel_obj); } @@ -328,8 +381,10 @@ intel_bufferobj_map_range(struct gl_context * ctx, drm_intel_gem_bo_map_unsynchronized(intel_obj->buffer); else if (!(access & GL_MAP_READ_BIT)) { drm_intel_gem_bo_map_gtt(intel_obj->buffer); + intel_bufferobj_mark_inactive(intel_obj); } else { drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0); + intel_bufferobj_mark_inactive(intel_obj); } obj->Pointer = intel_obj->buffer->virtual + offset; @@ -375,6 +430,7 @@ 
intel_bufferobj_flush_mapped_range(struct gl_context *ctx, intel_obj->buffer, obj->Offset + offset, temp_bo, 0, length); + intel_bufferobj_mark_gpu_usage(intel_obj, obj->Offset + offset, length); drm_intel_bo_unreference(temp_bo); } @@ -409,6 +465,7 @@ intel_bufferobj_unmap(struct gl_context * ctx, struct gl_buffer_object *obj) intel_obj->buffer, obj->Offset, intel_obj->range_map_bo, 0, obj->Length); + intel_bufferobj_mark_gpu_usage(intel_obj, obj->Offset, obj->Length); /* Since we've emitted some blits to buffers that will (likely) be used * in rendering operations in other cache domains in this batch, emit a @@ -429,10 +486,17 @@ intel_bufferobj_unmap(struct gl_context * ctx, struct gl_buffer_object *obj) return true; } +/** + * Gets a pointer to the object's BO, and marks the given range as being used + * on the GPU. + * + * Anywhere that uses buffer objects in the pipeline should be using this to + * mark the range of the buffer that is being accessed by the pipeline. + */ drm_intel_bo * intel_bufferobj_buffer(struct brw_context *brw, struct intel_buffer_object *intel_obj, - GLuint flag) + uint32_t offset, uint32_t size) { /* This is needed so that things like transform feedback and texture buffer * objects that need a BO but don't want to check that they exist for @@ -441,6 +505,8 @@ intel_bufferobj_buffer(struct brw_context *brw, if (intel_obj->buffer == NULL) intel_bufferobj_alloc_buffer(brw, intel_obj); + intel_bufferobj_mark_gpu_usage(intel_obj, offset, size); + return intel_obj->buffer; } @@ -466,8 +532,8 @@ intel_bufferobj_copy_subdata(struct gl_context *ctx, if (size == 0) return; - dst_bo = intel_bufferobj_buffer(brw, intel_dst, INTEL_WRITE_PART); - src_bo = intel_bufferobj_buffer(brw, intel_src, INTEL_READ); + dst_bo = intel_bufferobj_buffer(brw, intel_dst, write_offset, size); + src_bo = intel_bufferobj_buffer(brw, intel_src, read_offset, size); intel_emit_linear_blit(brw, dst_bo, write_offset, diff --git 
a/src/mesa/drivers/dri/i965/intel_buffer_objects.h b/src/mesa/drivers/dri/i965/intel_buffer_objects.h index cf01e2da1b5..49aa3aa53b5 100644 --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.h +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.h @@ -45,14 +45,45 @@ struct intel_buffer_object drm_intel_bo *range_map_bo; void *range_map_buffer; unsigned int range_map_offset; + + /** @{ + * Tracking for what range of the BO may currently be in use by the GPU. + * + * Users often want to either glBufferSubData() or glMapBufferRange() a + * buffer object where some subset of it is busy on the GPU, without either + * stalling or doing an extra blit (since our blits are extra expensive, + * given that we have to reupload most of the 3D state when switching + * rings). We wish they'd just use glMapBufferRange() with the + * UNSYNC|INVALIDATE_RANGE flag or the INVALIDATE_BUFFER flag, but lots + * don't. + * + * To work around apps, we track what range of the BO we might have used on + * the GPU as vertex data, transform feedback output, buffer textures, etc., + * and just do glBufferSubData() with an unsynchronized map when they're + * outside of that range. + * + * If gpu_active_start > gpu_active_end, then the GPU is not currently + * accessing the BO (and we can map it without synchronization). + */ + uint32_t gpu_active_start; + uint32_t gpu_active_end; + + /** + * If we've avoided stalls/blits using the active tracking, flag the buffer + * for (occasional) stalling in the future to avoid getting stuck in a + * cycle of blitting on buffer wraparound. 
+ */ + bool prefer_stall_to_blit; + /** @} */ }; /* Get the bm buffer associated with a GL bufferobject: */ drm_intel_bo *intel_bufferobj_buffer(struct brw_context *brw, - struct intel_buffer_object *obj, - GLuint flag); + struct intel_buffer_object *obj, + uint32_t offset, + uint32_t size); void intel_upload_data(struct brw_context *brw, const void *ptr, GLuint size, GLuint align, diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c index d9f48dd9628..08cb7624bb1 100644 --- a/src/mesa/drivers/dri/i965/intel_pixel_read.c +++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c @@ -79,7 +79,6 @@ do_blit_readpixels(struct gl_context * ctx, struct intel_buffer_object *dst = intel_buffer_object(pack->BufferObj); GLuint dst_offset; drm_intel_bo *dst_buffer; - bool all; GLint dst_x, dst_y; GLuint dirty; @@ -127,12 +126,9 @@ do_blit_readpixels(struct gl_context * ctx, intel_prepare_render(brw); brw->front_buffer_dirty = dirty; - all = (width * height * irb->mt->cpp == dst->Base.Size && - x == 0 && dst_offset == 0); - dst_buffer = intel_bufferobj_buffer(brw, dst, - all ? 
INTEL_WRITE_FULL : - INTEL_WRITE_PART); + dst_offset, width * height * + irb->mt->cpp); struct intel_mipmap_tree *pbo_mt = intel_miptree_create_for_bo(brw, diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c index c5d99e19df5..cc50f84ea32 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_image.c +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c @@ -126,13 +126,14 @@ try_pbo_upload(struct gl_context *ctx, return false; } - src_buffer = intel_bufferobj_buffer(brw, pbo, INTEL_READ); - /* note: potential 64-bit ptr to 32-bit int cast */ - src_offset = (GLuint) (unsigned long) pixels; - int src_stride = _mesa_image_row_stride(unpack, image->Width, format, type); + /* note: potential 64-bit ptr to 32-bit int cast */ + src_offset = (GLuint) (unsigned long) pixels; + src_buffer = intel_bufferobj_buffer(brw, pbo, + src_offset, src_stride * image->Height); + struct intel_mipmap_tree *pbo_mt = intel_miptree_create_for_bo(brw, src_buffer,