llvmpipe: reduce alignment requirement for resources from 64x64 to 4x4
author Roland Scheidegger <sroland@vmware.com>
Thu, 30 May 2013 00:05:01 +0000 (02:05 +0200)
committer Roland Scheidegger <sroland@vmware.com>
Fri, 31 May 2013 18:21:05 +0000 (20:21 +0200)
The overallocation was very bad, especially for things like 1d array
textures, which got blown up by a factor of 64. (Even ordinary smallish
2d textures benefit a lot from this: a mipmapped 64x64 rgba8 texture
previously used 7*16kB = 112kB, whereas it now uses ~22kB.)
4x4 is chosen because this is the size the jit functions run on, so
making it smaller is going to be a bit more complicated.
It is actually not strictly 4x4 pixels: since we want to avoid situations
where different threads are rendering to the same cacheline, we keep
cacheline size alignment in the x direction (often 64 bytes).
To make this work, introduce new task width/height parameters and make
sure clears don't clear the whole tile if it's a partial tile. Likewise,
the rasterizer may produce fragments outside the 4x4 blocks present in a
tile, so don't call the jit function for those fragments.
This does not yet fix rendering to buffers (which cannot have any y
alignment at all), and 1d/1d array textures are still overallocated by a
factor of 4.

v2: replace magic number 4 with LP_RASTER_BLOCK_SIZE, fix size of buffers
allocated (needed in case we render to them).

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
src/gallium/drivers/llvmpipe/lp_rast.c
src/gallium/drivers/llvmpipe/lp_rast.h
src/gallium/drivers/llvmpipe/lp_rast_priv.h
src/gallium/drivers/llvmpipe/lp_scene.c
src/gallium/drivers/llvmpipe/lp_scene.h
src/gallium/drivers/llvmpipe/lp_setup.c
src/gallium/drivers/llvmpipe/lp_texture.c

index 5c837a043e0d134e6fcc92e918379e7ab378ed0b..be5a286e3daa94267f40bc68b3f21455c3f57720 100644 (file)
@@ -95,6 +95,10 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,
    task->bin = bin;
    task->x = x * TILE_SIZE;
    task->y = y * TILE_SIZE;
+   task->width = TILE_SIZE + x * TILE_SIZE > task->scene->width_aligned ?
+                    task->scene->width_aligned - x * TILE_SIZE : TILE_SIZE;
+   task->height = TILE_SIZE + y * TILE_SIZE > task->scene->height_aligned ?
+                    task->scene->height_aligned - y * TILE_SIZE : TILE_SIZE;
 
    /* reset pointers to color and depth tile(s) */
    memset(task->color_tiles, 0, sizeof(task->color_tiles));
@@ -144,8 +148,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
                            scene->cbufs[i].stride,
                            task->x,
                            task->y,
-                           TILE_SIZE,
-                           TILE_SIZE,
+                           task->width,
+                           task->height,
                            &uc);
          }
       }
@@ -172,8 +176,8 @@ lp_rast_clear_color(struct lp_rasterizer_task *task,
                            scene->cbufs[i].stride,
                            task->x,
                            task->y,
-                           TILE_SIZE,
-                           TILE_SIZE,
+                           task->width,
+                           task->height,
                            &uc);
          }
       }
@@ -198,8 +202,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
    uint64_t clear_mask64 = arg.clear_zstencil.mask;
    uint32_t clear_value = (uint32_t) clear_value64;
    uint32_t clear_mask = (uint32_t) clear_mask64;
-   const unsigned height = TILE_SIZE;
-   const unsigned width = TILE_SIZE;
+   const unsigned height = task->height;
+   const unsigned width = task->width;
    const unsigned block_size = scene->zsbuf.blocksize;
    const unsigned dst_stride = scene->zsbuf.stride;
    uint8_t *dst;
@@ -325,8 +329,8 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
    variant = state->variant;
 
    /* render the whole 64x64 tile in 4x4 chunks */
-   for (y = 0; y < TILE_SIZE; y += 4){
-      for (x = 0; x < TILE_SIZE; x += 4) {
+   for (y = 0; y < task->height; y += 4){
+      for (x = 0; x < task->width; x += 4) {
          uint8_t *color[PIPE_MAX_COLOR_BUFS];
          unsigned stride[PIPE_MAX_COLOR_BUFS];
          uint8_t *depth = NULL;
@@ -434,21 +438,27 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
 
    assert(lp_check_alignment(state->jit_context.u8_blend_color, 16));
 
-   /* run shader on 4x4 block */
-   BEGIN_JIT_CALL(state, task);
-   variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
-                                         x, y,
-                                         inputs->frontfacing,
-                                         GET_A0(inputs),
-                                         GET_DADX(inputs),
-                                         GET_DADY(inputs),
-                                         color,
-                                         depth,
-                                         mask,
-                                         &task->thread_data,
-                                         stride,
-                                         depth_stride);
-   END_JIT_CALL();
+   /*
+    * The rasterizer may produce fragments outside our
+    * allocated 4x4 blocks hence need to filter them out here.
+    */
+   if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
+      /* run shader on 4x4 block */
+      BEGIN_JIT_CALL(state, task);
+      variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
+                                            x, y,
+                                            inputs->frontfacing,
+                                            GET_A0(inputs),
+                                            GET_DADX(inputs),
+                                            GET_DADY(inputs),
+                                            color,
+                                            depth,
+                                            mask,
+                                            &task->thread_data,
+                                            stride,
+                                            depth_stride);
+      END_JIT_CALL();
+   }
 }
 
 
index 8dd3615e78adf88ff74b381f2356f575db9f2eb3..9fe89e5b6f1f7e16130ca94b694cdb366572a154 100644 (file)
@@ -50,6 +50,9 @@ struct cmd_bin;
 #define FIXED_ORDER 4
 #define FIXED_ONE (1<<FIXED_ORDER)
 
+/* Rasterizer output size going to jit fs, width/height */
+#define LP_RASTER_BLOCK_SIZE 4
+
 
 struct lp_rasterizer_task;
 
index e4b6e5b301fd9c259c5c35e93421b5577bbeb3b1..4876d7472fb04c4d3ce690917b81627439317461 100644 (file)
@@ -86,6 +86,7 @@ struct lp_rasterizer_task
 
    struct lp_scene *scene;
    unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
+   unsigned width, height; /**< width, height of current tile, in pixels */
 
    uint8_t *color_tiles[PIPE_MAX_COLOR_BUFS];
    uint8_t *depth_tile;
@@ -293,21 +294,27 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
       depth_stride = scene->zsbuf.stride;
    }
 
-   /* run shader on 4x4 block */
-   BEGIN_JIT_CALL(state, task);
-   variant->jit_function[RAST_WHOLE]( &state->jit_context,
-                                      x, y,
-                                      inputs->frontfacing,
-                                      GET_A0(inputs),
-                                      GET_DADX(inputs),
-                                      GET_DADY(inputs),
-                                      color,
-                                      depth,
-                                      0xffff,
-                                      &task->thread_data,
-                                      stride,
-                                      depth_stride);
-   END_JIT_CALL();
+   /*
+    * The rasterizer may produce fragments outside our
+    * allocated 4x4 blocks hence need to filter them out here.
+    */
+   if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
+      /* run shader on 4x4 block */
+      BEGIN_JIT_CALL(state, task);
+      variant->jit_function[RAST_WHOLE]( &state->jit_context,
+                                         x, y,
+                                         inputs->frontfacing,
+                                         GET_A0(inputs),
+                                         GET_DADX(inputs),
+                                         GET_DADY(inputs),
+                                         color,
+                                         depth,
+                                         0xffff,
+                                         &task->thread_data,
+                                         stride,
+                                         depth_stride);
+      END_JIT_CALL();
+   }
 }
 
 void lp_rast_triangle_1( struct lp_rasterizer_task *, 
index 771ad085a120138a03b639dbd483cdb01ba3c2d8..2dfc7ff9ce7e078337c853d1aa8eb2b9115f6f59 100644 (file)
@@ -505,6 +505,8 @@ void lp_scene_begin_binning( struct lp_scene *scene,
 
    scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE;
    scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE;
+   scene->width_aligned = align(fb->width, LP_RASTER_BLOCK_SIZE);
+   scene->height_aligned = align(fb->height, LP_RASTER_BLOCK_SIZE);
 
    assert(scene->tiles_x <= TILES_X);
    assert(scene->tiles_y <= TILES_Y);
index fa5bbcaf013a518b7c4a686524aa3bc9b3d8805d..bc6c448bc7f00223baa3ec2457d719f9636bbe9e 100644 (file)
@@ -144,6 +144,10 @@ struct lp_scene {
    /** list of resources referenced by the scene commands */
    struct resource_ref *resources;
 
+   /** aligned scene width, height */
+   unsigned width_aligned;
+   unsigned height_aligned;
+
    /** Total memory used by the scene (in bytes).  This sums all the
     * data blocks and counts all bins, state, resource references and
     * other random allocations within the scene.
index a141fa337ab176d7d0067766252d4dd85e9e5d3e..bafcf56b8033265ef51397863174eeec061d91e7 100644 (file)
@@ -694,8 +694,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                assert(last_level <= res->last_level);
 
                /*
-                * The complexity here is only necessary for depth textures which
-                * still are tiled.
+                * The complexity here should no longer be necessary.
                 */
                mip_ptr = llvmpipe_get_texture_image_all(lp_tex, first_level,
                                                         LP_TEX_USAGE_READ);
index 0ac3528f7accd7ca5add50498785290a05dcd30d..56eb4999a1cc293c44c5759373d7b3205bf6137b 100644 (file)
@@ -49,6 +49,7 @@
 #include "lp_texture.h"
 #include "lp_setup.h"
 #include "lp_state.h"
+#include "lp_rast.h"
 
 #include "state_tracker/sw_winsys.h"
 
@@ -84,15 +85,15 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
       {
          unsigned alignment, nblocksx, nblocksy, block_size;
 
-         /* For non-compressed formats we need to align the texture size
-          * to the tile size to facilitate render-to-texture.
-          * XXX this blows up 1d/1d array textures by unreasonable
-          * amount (factor 64), probably should do something about it.
+         /* For non-compressed formats we need 4x4 pixel alignment
+          * (for now). We also want cache line size in x direction,
+          * otherwise same cache line could end up in multiple threads.
+          * XXX this blows up 1d/1d array textures by a factor of 4.
           */
          if (util_format_is_compressed(pt->format))
             alignment = 1;
          else
-            alignment = TILE_SIZE;
+            alignment = LP_RASTER_BLOCK_SIZE;
 
          nblocksx = util_format_get_nblocksx(pt->format,
                                              align(width, alignment));
@@ -100,7 +101,10 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
                                              align(height, alignment));
          block_size = util_format_get_blocksize(pt->format);
 
-         lpr->row_stride[level] = align(nblocksx * block_size, 16);
+         if (util_format_is_compressed(pt->format))
+            lpr->row_stride[level] = nblocksx * block_size;
+         else
+            lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline);
 
          /* if row_stride * height > LP_MAX_TEXTURE_SIZE */
          if (lpr->row_stride[level] > LP_MAX_TEXTURE_SIZE / nblocksy) {
@@ -244,7 +248,12 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
       assert(templat->height0 == 1);
       assert(templat->depth0 == 1);
       assert(templat->last_level == 0);
-      lpr->data = align_malloc(bytes, 16);
+      /*
+       * Reserve some extra storage since if we'd render to a buffer we
+       * read/write always LP_RASTER_BLOCK_SIZE pixels, but the element
+       * offset doesn't need to be aligned to LP_RASTER_BLOCK_SIZE.
+       */
+      lpr->data = align_malloc(bytes + (LP_RASTER_BLOCK_SIZE - 1) * 4 * sizeof(float), 16);
       /*
        * buffers don't really have stride but it's probably safer
        * (for code doing same calculations for buffers and textures)
@@ -327,7 +336,6 @@ llvmpipe_resource_map(struct pipe_resource *resource,
       struct llvmpipe_screen *screen = llvmpipe_screen(resource->screen);
       struct sw_winsys *winsys = screen->winsys;
       unsigned dt_usage;
-      uint8_t *map2;
 
       if (tex_usage == LP_TEX_USAGE_READ) {
          dt_usage = PIPE_TRANSFER_READ;
@@ -345,14 +353,11 @@ llvmpipe_resource_map(struct pipe_resource *resource,
       /* install this linear image in texture data structure */
       lpr->linear_img.data = map;
 
-      /* make sure tiled data gets converted to linear data */
-      map2 = llvmpipe_get_texture_image(lpr, 0, 0, tex_usage);
-      return map2;
+      return map;
    }
    else if (llvmpipe_resource_is_texture(resource)) {
 
-      map = llvmpipe_get_texture_image(lpr, layer, level,
-                                       tex_usage);
+      map = llvmpipe_get_texture_image(lpr, layer, level, tex_usage);
       return map;
    }
    else {