i965/vec4: Replace vec4_instruction::regs_written with ::size_written field in bytes.

[mesa.git] / src / mesa / drivers / dri / i965 / brw_context.c
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c

index 13643930e38f3ee12645361b2cd8ddd649711560..3af45551e991ec567bef565a26a51fd2a905ae4b 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -51,6 +51,7 @@
  
  #include "brw_context.h"
  #include "brw_defines.h"
+#include "brw_blorp.h"
  #include "brw_compiler.h"
  #include "brw_draw.h"
  #include "brw_state.h"
@@ -167,6 +168,24 @@ intel_update_framebuffer(struct gl_context *ctx,
                                   fb->DefaultGeometry.NumSamples);
  }
  
+static bool
+intel_disable_rb_aux_buffer(struct brw_context *brw, const drm_intel_bo *bo)
+{
+   const struct gl_framebuffer *fb = brw->ctx.DrawBuffer;
+   bool found = false;
+   /* Scan all color draw buffers; more than one may be backed by 'bo'. */
+   for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
+      const struct intel_renderbuffer *irb =
+         intel_renderbuffer(fb->_ColorDrawBuffers[i]);
+      /* On a match, set the per-slot disable flag and remember the hit. */
+      if (irb && irb->mt->bo == bo) {
+         found = brw->draw_aux_buffer_disabled[i] = true;
+      }
+   }
+   /* True iff at least one draw buffer's miptree BO equals 'bo'. */
+   return found;
+}
+
  /* On Gen9 color buffers may be compressed by the hardware (lossless
   * compression). There are, however, format restrictions and care needs to be
   * taken that the sampler engine is capable for re-interpreting a buffer with
@@ -196,6 +215,10 @@ intel_texture_view_requires_resolve(struct brw_context *brw,
                _mesa_get_format_name(intel_tex->_Format),
                _mesa_get_format_name(intel_tex->mt->format));
  
+   if (intel_disable_rb_aux_buffer(brw, intel_tex->mt->bo))
+      perf_debug("Sampling renderbuffer with non-compressible format - "
+                 "turning off compression");
+
     return true;
  }
  
@@ -219,6 +242,9 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
     if (depth_irb)
        intel_renderbuffer_resolve_hiz(brw, depth_irb);
  
+   memset(brw->draw_aux_buffer_disabled, 0,
+          sizeof(brw->draw_aux_buffer_disabled));
+
     /* Resolve depth buffer and render cache of each enabled texture. */
     int maxEnabledUnit = ctx->Texture._MaxEnabledTexImageUnit;
     for (int i = 0; i <= maxEnabledUnit; i++) {
@@ -235,6 +261,11 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
                             0 : INTEL_MIPTREE_IGNORE_CCS_E;
        intel_miptree_resolve_color(brw, tex_obj->mt, flags);
        brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
+
+      if (tex_obj->base.StencilSampling ||
+          tex_obj->mt->format == MESA_FORMAT_S_UINT8) {
+         intel_update_r8stencil(brw, tex_obj->mt);
+      }
     }
  
     /* Resolve color for each active shader image. */
@@ -252,17 +283,25 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
                 /* Access to images is implemented using indirect messages
                  * against data port. Normal render target write understands
                  * lossless compression but unfortunately the typed/untyped
-                * read/write interface doesn't. Therefore the compressed
-                * surfaces need to be resolved prior to accessing them.
+                * read/write interface doesn't. Therefore even lossless
+                * compressed surfaces need to be resolved prior to accessing
+                * them. Hence skip setting INTEL_MIPTREE_IGNORE_CCS_E.
                  */
                 intel_miptree_resolve_color(brw, tex_obj->mt, 0);
+
+               if (intel_miptree_is_lossless_compressed(brw, tex_obj->mt) &&
+                   intel_disable_rb_aux_buffer(brw, tex_obj->mt->bo)) {
+                  perf_debug("Using renderbuffer as shader image - turning "
+                             "off lossless compression");
+               }
+
                 brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
              }
           }
        }
     }
  
-   /* Resolve color buffers for non-coherent framebufer fetch. */
+   /* Resolve color buffers for non-coherent framebuffer fetch. */
     if (!ctx->Extensions.MESA_shader_framebuffer_fetch &&
         ctx->FragmentProgram._Current &&
         ctx->FragmentProgram._Current->Base.OutputsRead) {
@@ -485,16 +524,7 @@ brw_initialize_context_constants(struct brw_context *brw)
     ctx->Const.MaxImageUnits = MAX_IMAGE_UNITS;
     ctx->Const.MaxRenderbufferSize = 8192;
     ctx->Const.MaxTextureLevels = MIN2(14 /* 8192 */, MAX_TEXTURE_LEVELS);
-
-   /* On Sandy Bridge and prior, the "Render Target View Extent" field of
-    * RENDER_SURFACE_STATE is only 9 bits so the largest 3-D texture we can do
-    * a layered render into has a depth of 512.  On Iron Lake and earlier, we
-    * don't support layered rendering and we use manual offsetting to render
-    * into the different layers so this doesn't matter.  On Sandy Bridge,
-    * however, we do support layered rendering so this is a problem.
-    */
-   ctx->Const.Max3DTextureLevels = brw->gen == 6 ? 10 /* 512 */ : 12; /* 2048 */
-
+   ctx->Const.Max3DTextureLevels = 12; /* 2048 */
     ctx->Const.MaxCubeTextureLevels = 14; /* 8192 */
     ctx->Const.MaxArrayTextureLayers = brw->gen >= 7 ? 2048 : 512;
     ctx->Const.MaxTextureMbytes = 1536;
@@ -766,15 +796,41 @@ brw_initialize_context_constants(struct brw_context *brw)
     ctx->Const.MaxFramebufferHeight = 16384;
     ctx->Const.MaxFramebufferLayers = ctx->Const.MaxArrayTextureLayers;
     ctx->Const.MaxFramebufferSamples = max_samples;
+
+   /* OES_primitive_bounding_box */
+   ctx->Const.NoPrimitiveBoundingBoxOutput = true;
  }
  
  static void
-brw_initialize_cs_context_constants(struct brw_context *brw, unsigned max_threads)
+brw_initialize_cs_context_constants(struct brw_context *brw)
  {
     struct gl_context *ctx = &brw->ctx;
+   const struct intel_screen *screen = brw->intelScreen;
+   const struct gen_device_info *devinfo = screen->devinfo;
+
+   /* FINISHME: Do this for all platforms that the kernel supports */
+   if (brw->is_cherryview &&
+       screen->subslice_total > 0 && screen->eu_total > 0) {
+      /* Logical CS threads = EUs per subslice * 7 threads per EU */
+      brw->max_cs_threads = screen->eu_total / screen->subslice_total * 7;
+
+      /* Fuse configurations may give more threads than expected, never less. */
+      if (brw->max_cs_threads < devinfo->max_cs_threads)
+         brw->max_cs_threads = devinfo->max_cs_threads;
+   } else {
+      brw->max_cs_threads = devinfo->max_cs_threads;
+   }
+
     /* Maximum number of scalar compute shader invocations that can be run in
      * parallel in the same subslice assuming SIMD32 dispatch.
+    *
+    * We don't advertise more than 64 threads, because we are limited to 64 by
+    * our usage of thread_width_max in the gpgpu walker command. This only
+    * currently impacts Haswell, which otherwise might be able to advertise 70
+    * threads. With SIMD32 and 64 threads, Haswell still provides twice the
+    * number of invocations required by ARB_compute_shader.
      */
+   const unsigned max_threads = MIN2(64, brw->max_cs_threads);
     const uint32_t max_invocations = 32 * max_threads;
     ctx->Const.MaxComputeWorkGroupSize[0] = max_invocations;
     ctx->Const.MaxComputeWorkGroupSize[1] = max_invocations;
@@ -864,7 +920,7 @@ brwCreateContext(gl_api api,
     __DRIscreen *sPriv = driContextPriv->driScreenPriv;
     struct gl_context *shareCtx = (struct gl_context *) sharedContextPrivate;
     struct intel_screen *screen = sPriv->driverPrivate;
-   const struct brw_device_info *devinfo = screen->devinfo;
+   const struct gen_device_info *devinfo = screen->devinfo;
     struct dd_function_table functions;
  
     /* Only allow the __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS flag if the kernel
@@ -978,7 +1034,7 @@ brwCreateContext(gl_api api,
     if (INTEL_DEBUG & DEBUG_PERF)
        brw->perf_debug = true;
  
-   brw_initialize_cs_context_constants(brw, devinfo->max_cs_threads);
+   brw_initialize_cs_context_constants(brw);
     brw_initialize_context_constants(brw);
  
     ctx->Const.ResetStrategy = notify_reset
@@ -1020,23 +1076,14 @@ brwCreateContext(gl_api api,
  
     brw_init_surface_formats(brw);
  
+   if (brw->gen >= 6)
+      brw_blorp_init(brw);
+
     brw->max_vs_threads = devinfo->max_vs_threads;
     brw->max_hs_threads = devinfo->max_hs_threads;
     brw->max_ds_threads = devinfo->max_ds_threads;
     brw->max_gs_threads = devinfo->max_gs_threads;
     brw->max_wm_threads = devinfo->max_wm_threads;
-   /* FINISHME: Do this for all platforms that the kernel supports */
-   if (brw->is_cherryview &&
-       screen->subslice_total > 0 && screen->eu_total > 0) {
-      /* Logical CS threads = EUs per subslice * 7 threads per EU */
-      brw->max_cs_threads = screen->eu_total / screen->subslice_total * 7;
-
-      /* Fuse configurations may give more threads than expected, never less. */
-      if (brw->max_cs_threads < devinfo->max_cs_threads)
-         brw->max_cs_threads = devinfo->max_cs_threads;
-   } else {
-      brw->max_cs_threads = devinfo->max_cs_threads;
-   }
     brw->urb.size = devinfo->urb.size;
     brw->urb.min_vs_entries = devinfo->urb.min_vs_entries;
     brw->urb.max_vs_entries = devinfo->urb.max_vs_entries;
@@ -1114,6 +1161,9 @@ intelDestroyContext(__DRIcontext * driContextPriv)
        brw_destroy_shader_time(brw);
     }
  
+   if (brw->gen >= 6)
+      blorp_finish(&brw->blorp);
+
     brw_destroy_state(brw);
     brw_draw_destroy(brw);