i965/vec4: Don't lose the force_writemask_all flag during CSE.

[mesa.git] / src / mesa / drivers / dri / i965 / brw_context.c
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c

index 171501f69fe96d4317afd3fe7e3bc1303a695940..ed6fdffd265c8a53d87332d0853e6c98686c8a77 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -40,6 +40,7 @@
  #include "main/points.h"
  #include "main/version.h"
  #include "main/vtxfmt.h"
+#include "main/texobj.h"
  
  #include "vbo/vbo_context.h"
  
@@ -67,6 +68,8 @@
  #include "tnl/t_pipeline.h"
  #include "util/ralloc.h"
  
+#include "glsl/nir/nir.h"
+
  /***************************************
   * Mesa's Driver Functions
   ***************************************/
@@ -80,6 +83,7 @@ brw_query_samples_for_format(struct gl_context *ctx, GLenum target,
     (void) target;
  
     switch (brw->gen) {
+   case 9:
     case 8:
        samples[0] = 8;
        samples[1] = 4;
@@ -96,6 +100,7 @@ brw_query_samples_for_format(struct gl_context *ctx, GLenum target,
        return 1;
  
     default:
+      assert(brw->gen < 6);
        samples[0] = 1;
        return 1;
     }
@@ -123,7 +128,7 @@ brw_get_renderer_string(unsigned deviceID)
  }
  
  static const GLubyte *
-intelGetString(struct gl_context * ctx, GLenum name)
+intel_get_string(struct gl_context * ctx, GLenum name)
  {
     const struct brw_context *const brw = brw_context(ctx);
  
@@ -153,15 +158,39 @@ intel_viewport(struct gl_context *ctx)
  }
  
  static void
-intelInvalidateState(struct gl_context * ctx, GLuint new_state)
+intel_update_state(struct gl_context * ctx, GLuint new_state) /* Forwards new_state to swrast (if present) and vbo, accumulates it in brw->NewGLState, then resolves the depth renderbuffer and each bound texture's miptree. */
  {
     struct brw_context *brw = brw_context(ctx);
+   struct intel_texture_object *tex_obj;
+   struct intel_renderbuffer *depth_irb;
  
     if (ctx->swrast_context)
        _swrast_InvalidateState(ctx, new_state);
     _vbo_InvalidateState(ctx, new_state);
  
     brw->NewGLState |= new_state;
+
+   _mesa_unlock_context_textures(ctx); /* paired with the re-lock at the end; NOTE(review): presumably the caller holds the texture lock on entry -- confirm */
+
+   /* Resolve the depth buffer's HiZ buffer. */
+   depth_irb = intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
+   if (depth_irb)
+      intel_renderbuffer_resolve_hiz(brw, depth_irb);
+
+   /* Resolve depth buffer and render cache of each enabled texture. */
+   int maxEnabledUnit = ctx->Texture._MaxEnabledTexImageUnit; /* used as an inclusive upper bound by the loop below */
+   for (int i = 0; i <= maxEnabledUnit; i++) {
+      if (!ctx->Texture.Unit[i]._Current) /* nothing bound on this unit */
+        continue;
+      tex_obj = intel_texture_object(ctx->Texture.Unit[i]._Current);
+      if (!tex_obj || !tex_obj->mt) /* no miptree to resolve */
+        continue;
+      intel_miptree_all_slices_resolve_depth(brw, tex_obj->mt);
+      intel_miptree_resolve_color(brw, tex_obj->mt);
+      brw_render_cache_set_check_flush(brw, tex_obj->mt->bo);
+   }
+
+   _mesa_lock_context_textures(ctx); /* restore the lock released above before returning */
  }
  
  #define flushFront(screen)      ((screen)->image.loader ? (screen)->image.loader->flushFrontBuffer : (screen)->dri2.loader->flushFrontBuffer)
@@ -205,12 +234,12 @@ intel_glFlush(struct gl_context *ctx)
  
     intel_batchbuffer_flush(brw);
     intel_flush_front(ctx);
-   if (brw_is_front_buffer_drawing(ctx->DrawBuffer))
-      brw->need_throttle = true;
+
+   brw->need_flush_throttle = true;
  }
  
-void
-intelFinish(struct gl_context * ctx)
+static void
+intel_finish(struct gl_context * ctx)
  {
     struct brw_context *brw = brw_context(ctx);
  
@@ -237,9 +266,9 @@ brw_init_driver_functions(struct brw_context *brw,
        functions->Viewport = intel_viewport;
  
     functions->Flush = intel_glFlush;
-   functions->Finish = intelFinish;
-   functions->GetString = intelGetString;
-   functions->UpdateState = intelInvalidateState;
+   functions->Finish = intel_finish;
+   functions->GetString = intel_get_string;
+   functions->UpdateState = intel_update_state;
  
     intelInitTextureFuncs(functions);
     intelInitTextureImageFuncs(functions);
@@ -300,7 +329,7 @@ brw_initialize_context_constants(struct brw_context *brw)
        MIN2(ctx->Const.MaxTextureCoordUnits,
             ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits);
     ctx->Const.Program[MESA_SHADER_VERTEX].MaxTextureImageUnits = max_samplers;
-   if (brw->gen >= 7)
+   if (brw->gen >= 6)
        ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits = max_samplers;
     else
        ctx->Const.Program[MESA_SHADER_GEOMETRY].MaxTextureImageUnits = 0;
@@ -381,6 +410,14 @@ brw_initialize_context_constants(struct brw_context *brw)
     ctx->Const.MaxDepthTextureSamples = max_samples;
     ctx->Const.MaxIntegerSamples = max_samples;
  
+   /* gen6_set_sample_maps() sets SampleMap{2,4,8}x variables which are used
+    * to map indices of rectangular grid to sample numbers within a pixel.
+    * These variables are used by GL_EXT_framebuffer_multisample_blit_scaled
+    * extension implementation. For more details see the comment above
+    * gen6_set_sample_maps() definition.
+    */
+   gen6_set_sample_maps(ctx);
+
     if (brw->gen >= 7)
        ctx->Const.MaxProgramTextureGatherComponents = 4;
     else if (brw->gen == 6)
@@ -388,9 +425,19 @@ brw_initialize_context_constants(struct brw_context *brw)
  
     ctx->Const.MinLineWidth = 1.0;
     ctx->Const.MinLineWidthAA = 1.0;
-   ctx->Const.MaxLineWidth = 5.0;
-   ctx->Const.MaxLineWidthAA = 5.0;
-   ctx->Const.LineWidthGranularity = 0.5;
+   if (brw->gen >= 9 || brw->is_cherryview) {
+      ctx->Const.MaxLineWidth = 40.0;
+      ctx->Const.MaxLineWidthAA = 40.0;
+      ctx->Const.LineWidthGranularity = 0.125;
+   } else if (brw->gen >= 6) {
+      ctx->Const.MaxLineWidth = 7.375;
+      ctx->Const.MaxLineWidthAA = 7.375;
+      ctx->Const.LineWidthGranularity = 0.125;
+   } else {
+      ctx->Const.MaxLineWidth = 7.0;
+      ctx->Const.MaxLineWidthAA = 7.0;
+      ctx->Const.LineWidthGranularity = 0.5;
+   }
  
     ctx->Const.MinPointSize = 1.0;
     ctx->Const.MinPointSizeAA = 1.0;
@@ -437,6 +484,12 @@ brw_initialize_context_constants(struct brw_context *brw)
     ctx->Const.Program[MESA_SHADER_FRAGMENT].HighInt = ctx->Const.Program[MESA_SHADER_FRAGMENT].LowInt;
     ctx->Const.Program[MESA_SHADER_FRAGMENT].MediumInt = ctx->Const.Program[MESA_SHADER_FRAGMENT].LowInt;
  
+   ctx->Const.Program[MESA_SHADER_VERTEX].LowInt.RangeMin = 31;
+   ctx->Const.Program[MESA_SHADER_VERTEX].LowInt.RangeMax = 30;
+   ctx->Const.Program[MESA_SHADER_VERTEX].LowInt.Precision = 0;
+   ctx->Const.Program[MESA_SHADER_VERTEX].HighInt = ctx->Const.Program[MESA_SHADER_VERTEX].LowInt;
+   ctx->Const.Program[MESA_SHADER_VERTEX].MediumInt = ctx->Const.Program[MESA_SHADER_VERTEX].LowInt;
+
     if (brw->gen >= 7) {
        ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxAtomicCounters = MAX_ATOMIC_COUNTERS;
        ctx->Const.Program[MESA_SHADER_VERTEX].MaxAtomicCounters = MAX_ATOMIC_COUNTERS;
@@ -458,7 +511,25 @@ brw_initialize_context_constants(struct brw_context *brw)
        ctx->Const.QuadsFollowProvokingVertexConvention = false;
  
     ctx->Const.NativeIntegers = true;
-   ctx->Const.UniformBooleanTrue = 1;
+   ctx->Const.VertexID_is_zero_based = true;
+
+   /* Regarding the CMP instruction, the Ivybridge PRM says:
+    *
+    *   "For each enabled channel 0b or 1b is assigned to the appropriate flag
+    *    bit and 0/all zeros or all ones (e.g, byte 0xFF, word 0xFFFF, DWord
+    *    0xFFFFFFFF) is assigned to dst."
+    *
+    * but PRMs for earlier generations say
+    *
+    *   "In dword format, one GRF may store up to 8 results. When the register
+    *    is used later as a vector of Booleans, as only LSB at each channel
+    *    contains meaning [sic] data, software should make sure all higher bits
+    *    are masked out (e.g. by 'and-ing' an [sic] 0x01 constant)."
+    *
+    * We select the representation of a true boolean uniform to be ~0, and fix
+    * the results of Gen <= 5 CMP instructions with -(result & 1).
+    */
+   ctx->Const.UniformBooleanTrue = ~0;
  
     /* From the gen4 PRM, volume 4 page 127:
      *
@@ -480,23 +551,45 @@ brw_initialize_context_constants(struct brw_context *brw)
        ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxInputComponents = 128;
     }
  
+   static const nir_shader_compiler_options gen4_nir_options = {
+      .native_integers = true,
+      .lower_ffma = true,
+   };
+
+   static const nir_shader_compiler_options gen6_nir_options = {
+      .native_integers = true,
+   };
+
     /* We want the GLSL compiler to emit code that uses condition codes */
     for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-      ctx->ShaderCompilerOptions[i].MaxIfDepth = brw->gen < 6 ? 16 : UINT_MAX;
-      ctx->ShaderCompilerOptions[i].EmitCondCodes = true;
-      ctx->ShaderCompilerOptions[i].EmitNoNoise = true;
-      ctx->ShaderCompilerOptions[i].EmitNoMainReturn = true;
-      ctx->ShaderCompilerOptions[i].EmitNoIndirectInput = true;
-      ctx->ShaderCompilerOptions[i].EmitNoIndirectOutput =
+      ctx->Const.ShaderCompilerOptions[i].MaxIfDepth = brw->gen < 6 ? 16 : UINT_MAX;
+      ctx->Const.ShaderCompilerOptions[i].EmitCondCodes = true;
+      ctx->Const.ShaderCompilerOptions[i].EmitNoNoise = true;
+      ctx->Const.ShaderCompilerOptions[i].EmitNoMainReturn = true;
+      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectInput = true;
+      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectOutput =
          (i == MESA_SHADER_FRAGMENT);
-      ctx->ShaderCompilerOptions[i].EmitNoIndirectTemp =
+      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectTemp =
          (i == MESA_SHADER_FRAGMENT);
-      ctx->ShaderCompilerOptions[i].EmitNoIndirectUniform = false;
-      ctx->ShaderCompilerOptions[i].LowerClipDistance = true;
+      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectUniform = false;
+      ctx->Const.ShaderCompilerOptions[i].LowerClipDistance = true;
+      if (brw->gen >= 6)
+         ctx->Const.ShaderCompilerOptions[i].NirOptions = &gen6_nir_options;
+      else
+         ctx->Const.ShaderCompilerOptions[i].NirOptions = &gen4_nir_options;
     }
  
-   ctx->ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = true;
-   ctx->ShaderCompilerOptions[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
+   ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = true;
+   ctx->Const.ShaderCompilerOptions[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
+
+   if (brw->scalar_vs) {
+      /* If we're using the scalar backend for vertex shaders, we need to
+       * configure these accordingly.
+       */
+      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
+      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
+      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = false;
+   }
  
     /* ARB_viewport_array */
     if (brw->gen >= 7 && ctx->API == API_OPENGL_CORE) {
@@ -561,9 +654,6 @@ brw_process_driconf_options(struct brw_context *brw)
        brw->disable_throttling = true;
     }
  
-   brw->disable_derivative_optimization =
-      driQueryOptionb(&brw->optionCache, "disable_derivative_optimization");
-
     brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile");
  
     ctx->Const.ForceGLSLExtensionsWarn =
@@ -576,6 +666,29 @@ brw_process_driconf_options(struct brw_context *brw)
        driQueryOptionb(options, "allow_glsl_extension_directive_midshader");
  }
  
+/* drop when libdrm 2.4.61 is released */
+#ifndef I915_PARAM_REVISION
+#define I915_PARAM_REVISION 32
+#endif
+
+static int
+brw_get_revision(int fd) /* Issues DRM_I915_GETPARAM for I915_PARAM_REVISION on fd; returns the reported value, or -1 if the call fails. */
+{
+   struct drm_i915_getparam gp;
+   int revision;
+   int ret;
+
+   memset(&gp, 0, sizeof(gp)); /* clear the whole request struct before filling it in */
+   gp.param = I915_PARAM_REVISION;
+   gp.value = &revision; /* on success the code relies on the ioctl having stored the result here */
+
+   ret = drmCommandWriteRead(fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
+   if (ret)
+      revision = -1; /* nonzero return: hand back -1 rather than the never-assigned local */
+
+   return revision;
+}
+
  GLboolean
  brwCreateContext(gl_api api,
                  const struct gl_config *mesaVis,
@@ -634,6 +747,7 @@ brwCreateContext(gl_api api,
     brw->has_negative_rhw_bug = devinfo->has_negative_rhw_bug;
     brw->needs_unlit_centroid_workaround =
        devinfo->needs_unlit_centroid_workaround;
+   brw->revision = brw_get_revision(sPriv->fd);
  
     brw->must_use_separate_stencil = screen->hw_must_use_separate_stencil;
     brw->has_swizzling = screen->hw_has_swizzling;
@@ -647,6 +761,9 @@ brwCreateContext(gl_api api,
     } else if (brw->gen >= 7) {
        gen7_init_vtable_surface_functions(brw);
        brw->vtbl.emit_depth_stencil_hiz = gen7_emit_depth_stencil_hiz;
+   } else if (brw->gen >= 6) {
+      gen6_init_vtable_surface_functions(brw);
+      brw->vtbl.emit_depth_stencil_hiz = gen6_emit_depth_stencil_hiz;
     } else {
        gen4_init_vtable_surface_functions(brw);
        brw->vtbl.emit_depth_stencil_hiz = brw_emit_depth_stencil_hiz;
@@ -693,6 +810,10 @@ brwCreateContext(gl_api api,
  
     brw_process_driconf_options(brw);
     brw_process_intel_debug_variable(brw);
+
+   if (brw->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
+      brw->scalar_vs = true;
+
     brw_initialize_context_constants(brw);
  
     ctx->Const.ResetStrategy = notify_reset
@@ -729,11 +850,15 @@ brwCreateContext(gl_api api,
     brw_init_surface_formats(brw);
  
     brw->max_vs_threads = devinfo->max_vs_threads;
+   brw->max_hs_threads = devinfo->max_hs_threads;
+   brw->max_ds_threads = devinfo->max_ds_threads;
     brw->max_gs_threads = devinfo->max_gs_threads;
     brw->max_wm_threads = devinfo->max_wm_threads;
     brw->urb.size = devinfo->urb.size;
     brw->urb.min_vs_entries = devinfo->urb.min_vs_entries;
     brw->urb.max_vs_entries = devinfo->urb.max_vs_entries;
+   brw->urb.max_hs_entries = devinfo->urb.max_hs_entries;
+   brw->urb.max_ds_entries = devinfo->urb.max_ds_entries;
     brw->urb.max_gs_entries = devinfo->urb.max_gs_entries;
  
     /* Estimate the size of the mappable aperture into the GTT.  There's an
@@ -752,11 +877,12 @@ brwCreateContext(gl_api api,
     brw->max_gtt_map_object_size = gtt_size / 4;
  
     if (brw->gen == 6)
-      brw->urb.gen6_gs_previously_active = false;
+      brw->urb.gs_present = false;
  
     brw->prim_restart.in_progress = false;
     brw->prim_restart.enable_cut_index = false;
     brw->gs.enabled = false;
+   brw->sf.viewport_transform_enable = true;
  
     ctx->VertexProgram._MaintainTnlProgram = true;
     ctx->FragmentProgram._MaintainTexEnvProgram = true;
@@ -807,6 +933,7 @@ intelDestroyContext(__DRIcontext * driContextPriv)
     }
  
     _mesa_meta_free(&brw->ctx);
+   brw_meta_fast_clear_free(brw);
  
     if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
        /* Force a report. */
@@ -820,6 +947,12 @@ intelDestroyContext(__DRIcontext * driContextPriv)
     brw_draw_destroy(brw);
  
     drm_intel_bo_unreference(brw->curbe.curbe_bo);
+   if (brw->vs.base.scratch_bo)
+      drm_intel_bo_unreference(brw->vs.base.scratch_bo);
+   if (brw->gs.base.scratch_bo)
+      drm_intel_bo_unreference(brw->gs.base.scratch_bo);
+   if (brw->wm.base.scratch_bo)
+      drm_intel_bo_unreference(brw->wm.base.scratch_bo);
  
     drm_intel_gem_context_destroy(brw->hw_ctx);
  
@@ -834,8 +967,10 @@ intelDestroyContext(__DRIcontext * driContextPriv)
  
     intel_batchbuffer_free(brw);
  
-   drm_intel_bo_unreference(brw->first_post_swapbuffers_batch);
-   brw->first_post_swapbuffers_batch = NULL;
+   drm_intel_bo_unreference(brw->throttle_batch[1]);
+   drm_intel_bo_unreference(brw->throttle_batch[0]);
+   brw->throttle_batch[1] = NULL;
+   brw->throttle_batch[0] = NULL;
  
     driDestroyOptionCache(&brw->optionCache);
  
@@ -929,13 +1064,17 @@ intelMakeCurrent(__DRIcontext * driContextPriv,
        struct gl_context *ctx = &brw->ctx;
        struct gl_framebuffer *fb, *readFb;
  
-      if (driDrawPriv == NULL && driReadPriv == NULL) {
+      if (driDrawPriv == NULL) {
           fb = _mesa_get_incomplete_framebuffer();
-         readFb = _mesa_get_incomplete_framebuffer();
        } else {
           fb = driDrawPriv->driverPrivate;
-         readFb = driReadPriv->driverPrivate;
           driContextPriv->dri2.draw_stamp = driDrawPriv->dri2.stamp - 1;
+      }
+
+      if (driReadPriv == NULL) {
+         readFb = _mesa_get_incomplete_framebuffer();
+      } else {
+         readFb = driReadPriv->driverPrivate;
           driContextPriv->dri2.read_stamp = driReadPriv->dri2.stamp - 1;
        }
  
@@ -1124,29 +1263,6 @@ intel_prepare_render(struct brw_context *brw)
      */
     if (brw_is_front_buffer_drawing(ctx->DrawBuffer))
        brw->front_buffer_dirty = true;
-
-   /* Wait for the swapbuffers before the one we just emitted, so we
-    * don't get too many swaps outstanding for apps that are GPU-heavy
-    * but not CPU-heavy.
-    *
-    * We're using intelDRI2Flush (called from the loader before
-    * swapbuffer) and glFlush (for front buffer rendering) as the
-    * indicator that a frame is done and then throttle when we get
-    * here as we prepare to render the next frame.  At this point for
-    * round trips for swap/copy and getting new buffers are done and
-    * we'll spend less time waiting on the GPU.
-    *
-    * Unfortunately, we don't have a handle to the batch containing
-    * the swap, and getting our hands on that doesn't seem worth it,
-    * so we just us the first batch we emitted after the last swap.
-    */
-   if (brw->need_throttle && brw->first_post_swapbuffers_batch) {
-      if (!brw->disable_throttling)
-         drm_intel_bo_wait_rendering(brw->first_post_swapbuffers_batch);
-      drm_intel_bo_unreference(brw->first_post_swapbuffers_batch);
-      brw->first_post_swapbuffers_batch = NULL;
-      brw->need_throttle = false;
-   }
  }
  
  /**