i965: Massively simplify the intel_upload implementation.
authorEric Anholt <eric@anholt.net>
Thu, 24 Jan 2013 01:05:10 +0000 (17:05 -0800)
committerEric Anholt <eric@anholt.net>
Wed, 26 Mar 2014 20:13:26 +0000 (13:13 -0700)
The implementation kept a page-sized area for uploading data, and
uploaded chunks from that to a 64kb-sized streamed buffer.  This wasted
cache footprint (and extra state tracking to do so) when we want to just
write our data into the buffer immediately.

Instead, build it around an interface like brw_state_batch() that just
gets you a pointer to BO memory to upload your stuff immediately.

Improves OpenArena on HSW by 1.62209% +/- 0.355299% (n=61) and on BYT by
1.7916% +/- 0.415743% (n=31).

v2: Rebase on Mesa master, drop old prototypes.  Re-do performance
    comparison on a kernel that doesn't punish CPU efficiency
    improvements.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/mesa/drivers/dri/i965/brw_context.h
src/mesa/drivers/dri/i965/brw_draw_upload.c
src/mesa/drivers/dri/i965/intel_buffer_objects.h
src/mesa/drivers/dri/i965/intel_upload.c

index 04af5d04d3a1bd7885d7a97a932266cb115d58dc..e1196824571624c6989d9e90a1af8151561196b8 100644 (file)
@@ -1041,10 +1041,7 @@ struct brw_context
 
    struct {
       drm_intel_bo *bo;
-      GLuint offset;
-      uint32_t buffer_len;
-      uint32_t buffer_offset;
-      char buffer[4096];
+      uint32_t next_offset;
    } upload;
 
    /**
index e261163e765e2ee3d08fb6fd17f15a22f76ee348..a579025508c4d9c298831b3609c85cf3a382aa09 100644 (file)
@@ -381,21 +381,17 @@ copy_array_to_vbo_array(struct brw_context *brw,
    const unsigned char *src = element->glarray->Ptr + min * src_stride;
    int count = max - min + 1;
    GLuint size = count * dst_stride;
+   uint8_t *dst = intel_upload_space(brw, size, dst_stride,
+                                     &buffer->bo, &buffer->offset);
 
    if (dst_stride == src_stride) {
-      intel_upload_data(brw, src, size, dst_stride,
-                       &buffer->bo, &buffer->offset);
+      memcpy(dst, src, size);
    } else {
-      char * const map = intel_upload_map(brw, size, dst_stride);
-      char *dst = map;
-
       while (count--) {
         memcpy(dst, src, dst_stride);
         src += src_stride;
         dst += dst_stride;
       }
-      intel_upload_unmap(brw, map, size, dst_stride,
-                        &buffer->bo, &buffer->offset);
    }
    buffer->stride = dst_stride;
 }
index b27d25fbebfe23d07e37fecb86e5d3cddca7a662..5eaf9dce29f37886780709b4bded46bc2f57606a 100644 (file)
@@ -90,16 +90,17 @@ drm_intel_bo *intel_bufferobj_buffer(struct brw_context *brw,
                                      uint32_t size);
 
 void intel_upload_data(struct brw_context *brw,
-                      const void *ptr, GLuint size, GLuint align,
-                      drm_intel_bo **return_bo,
-                      GLuint *return_offset);
-
-void *intel_upload_map(struct brw_context *brw,
-                      GLuint size, GLuint align);
-void intel_upload_unmap(struct brw_context *brw,
-                       const void *ptr, GLuint size, GLuint align,
-                       drm_intel_bo **return_bo,
-                       GLuint *return_offset);
+                       const void *data,
+                       uint32_t size,
+                       uint32_t alignment,
+                       drm_intel_bo **out_bo,
+                       uint32_t *out_offset);
+
+void *intel_upload_space(struct brw_context *brw,
+                         uint32_t size,
+                         uint32_t alignment,
+                         drm_intel_bo **out_bo,
+                         uint32_t *out_offset);
 
 void intel_upload_finish(struct brw_context *brw);
 
index ec3109bd4411d01bc4c530556cf4918e9d51accb..bb3f615d987587d657b52a7029fe6b051384f6f8 100644 (file)
@@ -57,127 +57,84 @@ intel_upload_finish(struct brw_context *brw)
    if (!brw->upload.bo)
       return;
 
-   if (brw->upload.buffer_len) {
-      drm_intel_bo_subdata(brw->upload.bo,
-                           brw->upload.buffer_offset,
-                           brw->upload.buffer_len,
-                           brw->upload.buffer);
-      brw->upload.buffer_len = 0;
-   }
-
+   drm_intel_bo_unmap(brw->upload.bo);
    drm_intel_bo_unreference(brw->upload.bo);
    brw->upload.bo = NULL;
+   brw->upload.next_offset = 0;
 }
 
-static void
-wrap_buffers(struct brw_context *brw, GLuint size)
-{
-   intel_upload_finish(brw);
-
-   if (size < INTEL_UPLOAD_SIZE)
-      size = INTEL_UPLOAD_SIZE;
-
-   brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "upload", size, 0);
-   brw->upload.offset = 0;
-}
-
-void
-intel_upload_data(struct brw_context *brw,
-                  const void *ptr, GLuint size, GLuint align,
-                  drm_intel_bo **return_bo,
-                  GLuint *return_offset)
-{
-   GLuint base, delta;
-
-   base = ALIGN_NPOT(brw->upload.offset, align);
-   if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) {
-      wrap_buffers(brw, size);
-      base = 0;
-   }
-
-   drm_intel_bo_reference(brw->upload.bo);
-   *return_bo = brw->upload.bo;
-   *return_offset = base;
-
-   delta = base - brw->upload.offset;
-   if (brw->upload.buffer_len &&
-       brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) {
-      drm_intel_bo_subdata(brw->upload.bo,
-                           brw->upload.buffer_offset,
-                           brw->upload.buffer_len,
-                           brw->upload.buffer);
-      brw->upload.buffer_len = 0;
-   }
-
-   if (size < sizeof(brw->upload.buffer)) {
-      if (brw->upload.buffer_len == 0)
-         brw->upload.buffer_offset = base;
-      else
-         brw->upload.buffer_len += delta;
-
-      memcpy(brw->upload.buffer + brw->upload.buffer_len, ptr, size);
-      brw->upload.buffer_len += size;
-   } else {
-      drm_intel_bo_subdata(brw->upload.bo, base, size, ptr);
-   }
-
-   brw->upload.offset = base + size;
-}
-
+/**
+ * Interface for getting memory for uploading streamed data to the GPU
+ *
+ * In most cases, streamed data (for GPU state structures, for example) is
+ * uploaded through brw_state_batch(), since that interface allows relocations
+ * from the streamed space returned to other BOs.  However, that interface has
+ * the restriction that the amount of space allocated has to be "small" (see
+ * estimated_max_prim_size in brw_draw.c).
+ *
+ * This interface, on the other hand, is able to handle arbitrary sized
+ * allocation requests, though it will batch small allocations into the same
+ * BO for efficiency and reduced memory footprint.
+ *
+ * \note The returned pointer is valid only until intel_upload_finish(), which
+ * will happen at batch flush or the next
+ * intel_upload_space()/intel_upload_data().
+ *
+ * \param out_bo Pointer to a BO, which must point to a valid BO or NULL on
+ * entry, and will have a reference to the new BO containing the state on
+ * return.
+ *
+ * \param out_offset Offset within the buffer object that the data will land.
+ */
 void *
-intel_upload_map(struct brw_context *brw, GLuint size, GLuint align)
+intel_upload_space(struct brw_context *brw,
+                   uint32_t size,
+                   uint32_t alignment,
+                   drm_intel_bo **out_bo,
+                   uint32_t *out_offset)
 {
-   GLuint base, delta;
-   char *ptr;
+   uint32_t offset;
 
-   base = ALIGN_NPOT(brw->upload.offset, align);
-   if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) {
-      wrap_buffers(brw, size);
-      base = 0;
+   offset = ALIGN_NPOT(brw->upload.next_offset, alignment);
+   if (brw->upload.bo && offset + size > brw->upload.bo->size) {
+      intel_upload_finish(brw);
+      offset = 0;
    }
 
-   delta = base - brw->upload.offset;
-   if (brw->upload.buffer_len &&
-       brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) {
-      drm_intel_bo_subdata(brw->upload.bo,
-                           brw->upload.buffer_offset,
-                           brw->upload.buffer_len,
-                           brw->upload.buffer);
-      brw->upload.buffer_len = 0;
+   if (!brw->upload.bo) {
+      brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "streamed data",
+                                          MAX2(INTEL_UPLOAD_SIZE, size), 4096);
+      if (brw->has_llc)
+         drm_intel_bo_map(brw->upload.bo, true);
+      else
+         drm_intel_gem_bo_map_gtt(brw->upload.bo);
    }
 
-   if (size <= sizeof(brw->upload.buffer)) {
-      if (brw->upload.buffer_len == 0)
-         brw->upload.buffer_offset = base;
-      else
-         brw->upload.buffer_len += delta;
+   brw->upload.next_offset = offset + size;
 
-      ptr = brw->upload.buffer + brw->upload.buffer_len;
-      brw->upload.buffer_len += size;
-   } else {
-      ptr = malloc(size);
+   *out_offset = offset;
+   if (*out_bo != brw->upload.bo) {
+      drm_intel_bo_unreference(*out_bo);
+      *out_bo = brw->upload.bo;
+      drm_intel_bo_reference(brw->upload.bo);
    }
 
-   return ptr;
+   return brw->upload.bo->virtual + offset;
 }
 
+/**
+ * Handy interface to upload some data to temporary GPU memory quickly.
+ *
+ * References to this memory should not be retained across batch flushes.
+ */
 void
-intel_upload_unmap(struct brw_context *brw,
-                   const void *ptr, GLuint size, GLuint align,
-                   drm_intel_bo **return_bo,
-                   GLuint *return_offset)
+intel_upload_data(struct brw_context *brw,
+                  const void *data,
+                  uint32_t size,
+                  uint32_t alignment,
+                  drm_intel_bo **out_bo,
+                  uint32_t *out_offset)
 {
-   GLuint base;
-
-   base = ALIGN_NPOT(brw->upload.offset, align);
-   if (size > sizeof(brw->upload.buffer)) {
-      drm_intel_bo_subdata(brw->upload.bo, base, size, ptr);
-      free((void*)ptr);
-   }
-
-   drm_intel_bo_reference(brw->upload.bo);
-   *return_bo = brw->upload.bo;
-   *return_offset = base;
-
-   brw->upload.offset = base + size;
+   void *dst = intel_upload_space(brw, size, alignment, out_bo, out_offset);
+   memcpy(dst, data, size);
 }