i965/gen9: Optimize slice and subslice load balancing behavior.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_context.c
index 9ed8bc64bb3028ff6a63fbfb7cc5521318c0d314..571618a9cab86bc8ec9df65276e6de79441e2822 100644 (file)
@@ -36,6 +36,7 @@
 #include "main/context.h"
 #include "main/fbobject.h"
 #include "main/extensions.h"
+#include "main/glthread.h"
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/points.h"
@@ -45,6 +46,7 @@
 #include "main/framebuffer.h"
 #include "main/stencil.h"
 #include "main/state.h"
+#include "main/spirv_extensions.h"
 
 #include "vbo/vbo.h"
 
 #include "tnl/t_pipeline.h"
 #include "util/ralloc.h"
 #include "util/debug.h"
+#include "util/disk_cache.h"
 #include "isl/isl.h"
 
+#include "common/gen_defines.h"
+
+#include "compiler/spirv/nir_spirv.h"
 /***************************************
  * Mesa's Driver Functions
  ***************************************/
@@ -143,6 +149,24 @@ intel_get_string(struct gl_context * ctx, GLenum name)
    }
 }
 
+static void
+brw_set_background_context(struct gl_context *ctx,
+                           struct util_queue_monitoring *queue_info)
+{
+   struct brw_context *brw = brw_context(ctx);
+   __DRIcontext *driContext = brw->driContext;
+   __DRIscreen *driScreen = driContext->driScreenPriv;
+   const __DRIbackgroundCallableExtension *backgroundCallable =
+      driScreen->dri2.backgroundCallable;
+
+   /* Note: Mesa will only call this function if we've called
+    * _mesa_enable_multithreading().  We only do that if the loader exposed
+    * the __DRI_BACKGROUND_CALLABLE extension.  So we know that
+    * backgroundCallable is not NULL.
+    */
+   backgroundCallable->setBackgroundContext(driContext->loaderPrivate);
+}
+
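The non-NULL assumption in the comment above rests on the loader-side background-callable interface. For reference, a rough, paraphrased sketch of that extension (field set abridged; the authoritative definition lives in include/GL/internal/dri_interface.h):

/* Paraphrased sketch of the loader extension consumed above; only the
 * callback actually invoked by brw_set_background_context() is shown.
 */
struct __DRIbackgroundCallableExtensionRec {
   __DRIextension base;

   /* Tells the loader that subsequent rendering on the calling thread
    * targets a background context, so flushes and front-buffer handling
    * can be routed correctly.  This is the callback invoked above.
    */
   void (*setBackgroundContext)(void *loaderPrivate);
};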
 static void
 intel_viewport(struct gl_context *ctx)
 {
@@ -235,6 +259,35 @@ intel_flush_front(struct gl_context *ctx)
    }
 }
 
+static void
+brw_display_shared_buffer(struct brw_context *brw)
+{
+   __DRIcontext *dri_context = brw->driContext;
+   __DRIdrawable *dri_drawable = dri_context->driDrawablePriv;
+   __DRIscreen *dri_screen = brw->screen->driScrnPriv;
+   int fence_fd = -1;
+
+   if (!brw->is_shared_buffer_bound)
+      return;
+
+   if (!brw->is_shared_buffer_dirty)
+      return;
+
+   if (brw->screen->has_exec_fence) {
+      /* This function is always called during a flush operation, so there is
+       * no need to flush again here. But we want to provide a fence_fd to the
+       * loader, and a redundant flush is the easiest way to acquire one.
+       */
+      if (intel_batchbuffer_flush_fence(brw, -1, &fence_fd))
+         return;
+   }
+
+   dri_screen->mutableRenderBuffer.loader
+      ->displaySharedBuffer(dri_drawable, fence_fd,
+                            dri_drawable->loaderPrivate);
+   brw->is_shared_buffer_dirty = false;
+}
+
 static void
 intel_glFlush(struct gl_context *ctx)
 {
@@ -242,7 +295,7 @@ intel_glFlush(struct gl_context *ctx)
 
    intel_batchbuffer_flush(brw);
    intel_flush_front(ctx);
-
+   brw_display_shared_buffer(brw);
    brw->need_flush_throttle = true;
 }
 
@@ -280,6 +333,7 @@ brw_init_driver_functions(struct brw_context *brw,
    functions->GetString = intel_get_string;
    functions->UpdateState = intel_update_state;
 
+   brw_init_draw_functions(functions);
    intelInitTextureFuncs(functions);
    intelInitTextureImageFuncs(functions);
    intelInitTextureCopyImageFuncs(functions);
@@ -302,6 +356,8 @@ brw_init_driver_functions(struct brw_context *brw,
    brw_init_compute_functions(functions);
    brw_init_conditional_render_functions(functions);
 
+   functions->GenerateMipmap = brw_generate_mipmap;
+
    functions->QueryInternalFormat = brw_query_internal_format;
 
    functions->NewTransformFeedback = brw_new_transform_feedback;
@@ -333,9 +389,38 @@ brw_init_driver_functions(struct brw_context *brw,
    /* GL_ARB_get_program_binary */
    brw_program_binary_init(brw->screen->deviceID);
    functions->GetProgramBinaryDriverSHA1 = brw_get_program_binary_driver_sha1;
-   functions->ProgramBinarySerializeDriverBlob = brw_program_serialize_nir;
+   functions->ProgramBinarySerializeDriverBlob = brw_serialize_program_binary;
    functions->ProgramBinaryDeserializeDriverBlob =
       brw_deserialize_program_binary;
+
+   if (brw->screen->disk_cache) {
+      functions->ShaderCacheSerializeDriverBlob = brw_program_serialize_nir;
+   }
+
+   functions->SetBackgroundContext = brw_set_background_context;
+}
+
+static void
+brw_initialize_spirv_supported_capabilities(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   struct gl_context *ctx = &brw->ctx;
+
+   /* The following SPIR-V capabilities are only supported on gen7+. In theory
+    * you should enable the extension only on gen7+, but just in case let's
+    * assert it.
+    */
+   assert(devinfo->gen >= 7);
+
+   ctx->Const.SpirVCapabilities.atomic_storage = devinfo->gen >= 7;
+   ctx->Const.SpirVCapabilities.draw_parameters = true;
+   ctx->Const.SpirVCapabilities.float64 = devinfo->gen >= 8;
+   ctx->Const.SpirVCapabilities.geometry_streams = devinfo->gen >= 7;
+   ctx->Const.SpirVCapabilities.image_write_without_format = true;
+   ctx->Const.SpirVCapabilities.int64 = devinfo->gen >= 8;
+   ctx->Const.SpirVCapabilities.tessellation = true;
+   ctx->Const.SpirVCapabilities.transform_feedback = devinfo->gen >= 7;
+   ctx->Const.SpirVCapabilities.variable_pointers = true;
 }
 
 static void
@@ -393,11 +478,11 @@ brw_initialize_context_constants(struct brw_context *brw)
    ctx->Const.MaxImageUnits = MAX_IMAGE_UNITS;
    if (devinfo->gen >= 7) {
       ctx->Const.MaxRenderbufferSize = 16384;
-      ctx->Const.MaxTextureLevels = MIN2(15 /* 16384 */, MAX_TEXTURE_LEVELS);
+      ctx->Const.MaxTextureSize = 16384;
       ctx->Const.MaxCubeTextureLevels = 15; /* 16384 */
    } else {
       ctx->Const.MaxRenderbufferSize = 8192;
-      ctx->Const.MaxTextureLevels = MIN2(14 /* 8192 */, MAX_TEXTURE_LEVELS);
+      ctx->Const.MaxTextureSize = 8192;
       ctx->Const.MaxCubeTextureLevels = 14; /* 8192 */
    }
    ctx->Const.Max3DTextureLevels = 12; /* 2048 */
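A quick note on the MaxTextureLevels → MaxTextureSize switch above: both encode the same hardware limit, since a texture with N mip levels has a largest base dimension of 2^(N-1) texels. A standalone sanity check of that mapping (illustrative only, plain C, not driver code):

#include <assert.h>

/* Largest base-level dimension implied by a mip level count. */
static unsigned
max_size_from_levels(unsigned levels)
{
   return 1u << (levels - 1);
}

int
main(void)
{
   assert(max_size_from_levels(15) == 16384); /* gen7+: MaxTextureSize above   */
   assert(max_size_from_levels(14) == 8192);  /* gen4-6: MaxTextureSize above  */
   assert(max_size_from_levels(12) == 2048);  /* Max3DTextureLevels, unchanged */
   return 0;
}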
@@ -536,6 +621,8 @@ brw_initialize_context_constants(struct brw_context *brw)
    if (devinfo->gen >= 5 || devinfo->is_g4x)
       ctx->Const.MaxClipPlanes = 8;
 
+   ctx->Const.GLSLFragCoordIsSysVal = true;
+   ctx->Const.GLSLFrontFacingIsSysVal = true;
    ctx->Const.GLSLTessLevelsAsInputs = true;
    ctx->Const.PrimitiveRestartForPatches = true;
 
@@ -590,7 +677,6 @@ brw_initialize_context_constants(struct brw_context *brw)
       ctx->Const.QuadsFollowProvokingVertexConvention = false;
 
    ctx->Const.NativeIntegers = true;
-   ctx->Const.VertexID_is_zero_based = true;
 
    /* Regarding the CMP instruction, the Ivybridge PRM says:
     *
@@ -662,7 +748,7 @@ brw_initialize_context_constants(struct brw_context *brw)
    /* ARB_viewport_array, OES_viewport_array */
    if (devinfo->gen >= 6) {
       ctx->Const.MaxViewports = GEN6_NUM_VIEWPORTS;
-      ctx->Const.ViewportSubpixelBits = 0;
+      ctx->Const.ViewportSubpixelBits = 8;
 
       /* Cast to float before negating because MaxViewportWidth is unsigned.
        */
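The cast-before-negate rule in the comment above deserves a concrete illustration: unary minus on an unsigned operand wraps modulo 2^32 instead of going negative, so the conversion to float has to happen first. A minimal standalone example (not driver code):

#include <stdio.h>

int
main(void)
{
   unsigned max_viewport_width = 16384;

   /* Wrong: -x is evaluated in unsigned arithmetic and wraps to
    * 4294950912 before the conversion to float.
    */
   float wrong = -max_viewport_width;

   /* Right: convert first, then negate, giving -16384.0f. */
   float right = -(float) max_viewport_width;

   printf("wrong = %f, right = %f\n", wrong, right);
   return 0;
}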
@@ -757,7 +843,8 @@ brw_process_driconf_options(struct brw_context *brw)
 
    driOptionCache *options = &brw->optionCache;
    driParseConfigFiles(options, &brw->screen->optionCache,
-                       brw->driContext->driScreenPriv->myNum, "i965");
+                       brw->driContext->driScreenPriv->myNum,
+                       "i965", NULL);
 
    int bo_reuse_mode = driQueryOptioni(options, "bo_reuse");
    switch (bo_reuse_mode) {
@@ -877,6 +964,7 @@ brwCreateContext(gl_api api,
       *dri_ctx_error = __DRI_CTX_ERROR_NO_MEMORY;
       return false;
    }
+   brw->perf_ctx = gen_perf_new_context(brw);
 
    driContextPriv->driverPrivate = brw;
    brw->driContext = driContextPriv;
@@ -896,21 +984,19 @@ brwCreateContext(gl_api api,
    brw->gs.base.stage = MESA_SHADER_GEOMETRY;
    brw->wm.base.stage = MESA_SHADER_FRAGMENT;
    brw->cs.base.stage = MESA_SHADER_COMPUTE;
-   if (devinfo->gen >= 8) {
-      brw->vtbl.emit_depth_stencil_hiz = gen8_emit_depth_stencil_hiz;
-   } else if (devinfo->gen >= 7) {
-      brw->vtbl.emit_depth_stencil_hiz = gen7_emit_depth_stencil_hiz;
-   } else if (devinfo->gen >= 6) {
-      brw->vtbl.emit_depth_stencil_hiz = gen6_emit_depth_stencil_hiz;
-   } else {
-      brw->vtbl.emit_depth_stencil_hiz = brw_emit_depth_stencil_hiz;
-   }
 
    brw_init_driver_functions(brw, &functions);
 
    if (notify_reset)
       functions.GetGraphicsResetStatus = brw_get_graphics_reset_status;
 
+   brw_process_driconf_options(brw);
+
+   if (api == API_OPENGL_CORE &&
+       driQueryOptionb(&screen->optionCache, "force_compat_profile")) {
+      api = API_OPENGL_COMPAT;
+   }
+
    struct gl_context *ctx = &brw->ctx;
 
    if (!_mesa_initialize_context(ctx, api, mesaVis, shareCtx, &functions)) {
@@ -945,8 +1031,6 @@ brwCreateContext(gl_api api,
 
    _mesa_meta_init(ctx);
 
-   brw_process_driconf_options(brw);
-
    if (INTEL_DEBUG & DEBUG_PERF)
       brw->perf_debug = true;
 
@@ -963,30 +1047,29 @@ brwCreateContext(gl_api api,
 
    intel_batchbuffer_init(brw);
 
-   if (devinfo->gen >= 6) {
-      /* Create a new hardware context.  Using a hardware context means that
-       * our GPU state will be saved/restored on context switch, allowing us
-       * to assume that the GPU is in the same state we left it in.
-       *
-       * This is required for transform feedback buffer offsets, query objects,
-       * and also allows us to reduce how much state we have to emit.
-       */
-      brw->hw_ctx = brw_create_hw_context(brw->bufmgr);
-
-      if (!brw->hw_ctx) {
-         fprintf(stderr, "Failed to create hardware context.\n");
-         intelDestroyContext(driContextPriv);
-         return false;
-      }
+   /* Create a new hardware context.  Using a hardware context means that
+    * our GPU state will be saved/restored on context switch, allowing us
+    * to assume that the GPU is in the same state we left it in.
+    *
+    * This is required for transform feedback buffer offsets, query objects,
+    * and also allows us to reduce how much state we have to emit.
+    */
+   brw->hw_ctx = brw_create_hw_context(brw->bufmgr);
+   if (!brw->hw_ctx && devinfo->gen >= 6) {
+      fprintf(stderr, "Failed to create hardware context.\n");
+      intelDestroyContext(driContextPriv);
+      return false;
+   }
 
-      int hw_priority = BRW_CONTEXT_MEDIUM_PRIORITY;
+   if (brw->hw_ctx) {
+      int hw_priority = GEN_CONTEXT_MEDIUM_PRIORITY;
       if (ctx_config->attribute_mask & __DRIVER_CONTEXT_ATTRIB_PRIORITY) {
          switch (ctx_config->priority) {
          case __DRI_CTX_PRIORITY_LOW:
-            hw_priority = BRW_CONTEXT_LOW_PRIORITY;
+            hw_priority = GEN_CONTEXT_LOW_PRIORITY;
             break;
          case __DRI_CTX_PRIORITY_HIGH:
-            hw_priority = BRW_CONTEXT_HIGH_PRIORITY;
+            hw_priority = GEN_CONTEXT_HIGH_PRIORITY;
             break;
          }
       }
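For context on the GEN_CONTEXT_*_PRIORITY values used above: they are expected to map onto the i915 kernel scheduler's per-context priority (low/default/high user priority), which the buffer manager ultimately programs through a context-parameter ioctl. A hedged sketch of that kernel round-trip, using only the published i915 uapi (this is not the driver's actual helper):

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>   /* kernel uapi: context-param ioctl and tokens */

/* Set the scheduling priority of an existing hardware context.
 * 'priority' would be e.g. I915_CONTEXT_MAX_USER_PRIORITY for
 * __DRI_CTX_PRIORITY_HIGH.  Returns 0 on success or -errno; the kernel
 * typically rejects elevated priorities without CAP_SYS_NICE.
 */
static int
set_hw_context_priority(int drm_fd, uint32_t ctx_id, int priority)
{
   struct drm_i915_gem_context_param p = {
      .ctx_id = ctx_id,
      .param = I915_CONTEXT_PARAM_PRIORITY,
      .value = priority,
   };

   if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p) != 0)
      return -errno;
   return 0;
}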
@@ -1006,12 +1089,7 @@ brwCreateContext(gl_api api,
       return false;
    }
 
-   if (devinfo->gen == 10) {
-      fprintf(stderr,
-              "WARNING: i965 does not fully support Gen10 yet.\n"
-              "Instability or lower performance might occur.\n");
-
-   }
+   brw_upload_init(&brw->upload, brw->bufmgr, 65536);
 
    brw_init_state(brw);
 
@@ -1056,6 +1134,18 @@ brwCreateContext(gl_api api,
    _mesa_override_extensions(ctx);
    _mesa_compute_version(ctx);
 
+   /* GL_ARB_gl_spirv */
+   if (ctx->Extensions.ARB_gl_spirv) {
+      brw_initialize_spirv_supported_capabilities(brw);
+
+      if (ctx->Extensions.ARB_spirv_extensions) {
+         /* GL_ARB_spirv_extensions */
+         ctx->Const.SpirVExtensions = MALLOC_STRUCT(spirv_supported_extensions);
+         _mesa_fill_supported_spirv_extensions(ctx->Const.SpirVExtensions,
+                                               &ctx->Const.SpirVCapabilities);
+      }
+   }
+
    _mesa_initialize_dispatch_tables(ctx);
    _mesa_initialize_vbo_vtxfmt(ctx);
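Stepping back from the driver for a moment, the GL_ARB_gl_spirv / GL_ARB_spirv_extensions wiring above is exercised by applications roughly like this (application-side sketch, not driver code; libepoxy is used here only to keep the snippet self-contained, and error handling is trimmed):

#include <epoxy/gl.h>

/* Create a shader stage from a SPIR-V module instead of GLSL source.
 * 'spirv'/'spirv_size' are assumed to hold a valid module whose entry
 * point is named "main"; no specialization constants are overridden.
 */
static GLuint
load_spirv_stage(GLenum stage, const void *spirv, GLsizei spirv_size)
{
   GLuint shader = glCreateShader(stage);

   glShaderBinary(1, &shader, GL_SHADER_BINARY_FORMAT_SPIR_V_ARB,
                  spirv, spirv_size);
   glSpecializeShaderARB(shader, "main", 0, NULL, NULL);

   GLint ok = GL_FALSE;
   glGetShaderiv(shader, GL_COMPILE_STATUS, &ok);
   return ok ? shader : 0;
}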
 
@@ -1065,7 +1155,13 @@ brwCreateContext(gl_api api,
    vbo_use_buffer_objects(ctx);
    vbo_always_unmap_buffers(ctx);
 
-   brw_disk_cache_init(brw);
+   brw->ctx.Cache = brw->screen->disk_cache;
+
+   if (driContextPriv->driScreenPriv->dri2.backgroundCallable &&
+       driQueryOptionb(&screen->optionCache, "mesa_glthread")) {
+      /* Loader supports multithreading, and so do we. */
+      _mesa_glthread_init(ctx);
+   }
 
    return true;
 }
@@ -1076,7 +1172,18 @@ intelDestroyContext(__DRIcontext * driContextPriv)
    struct brw_context *brw =
       (struct brw_context *) driContextPriv->driverPrivate;
    struct gl_context *ctx = &brw->ctx;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   GET_CURRENT_CONTEXT(curctx);
+
+   if (curctx == NULL) {
+      /* No current context, but we need one to release
+       * renderbuffer surface when we release framebuffer.
+       * So temporarily bind the context.
+       */
+      _mesa_make_current(ctx, NULL, NULL);
+   }
+
+   _mesa_glthread_destroy(&brw->ctx);
 
    _mesa_meta_free(&brw->ctx);
 
@@ -1088,8 +1195,7 @@ intelDestroyContext(__DRIcontext * driContextPriv)
       brw_destroy_shader_time(brw);
    }
 
-   if (devinfo->gen >= 6)
-      blorp_finish(&brw->blorp);
+   blorp_finish(&brw->blorp);
 
    brw_destroy_state(brw);
    brw_draw_destroy(brw);
@@ -1130,7 +1236,7 @@ intelDestroyContext(__DRIcontext * driContextPriv)
    driDestroyOptionCache(&brw->optionCache);
 
    /* free the Mesa context */
-   _mesa_free_context_data(&brw->ctx);
+   _mesa_free_context_data(&brw->ctx, true);
 
    ralloc_free(brw);
    driContextPriv->driverPrivate = NULL;
@@ -1139,6 +1245,9 @@ intelDestroyContext(__DRIcontext * driContextPriv)
 GLboolean
 intelUnbindContext(__DRIcontext * driContextPriv)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_glthread_finish(ctx);
+
    /* Unset current context and dispatch table */
    _mesa_make_current(NULL, NULL, NULL);
 
@@ -1242,6 +1351,8 @@ intelMakeCurrent(__DRIcontext * driContextPriv,
 
       _mesa_make_current(ctx, fb, readFb);
    } else {
+      GET_CURRENT_CONTEXT(ctx);
+      _mesa_glthread_finish(ctx);
       _mesa_make_current(NULL, NULL, NULL);
    }
 
@@ -1282,6 +1393,21 @@ intel_resolve_for_dri2_flush(struct brw_context *brw,
          intel_miptree_prepare_external(brw, rb->mt);
       } else {
          intel_renderbuffer_downsample(brw, rb);
+
+         /* Call prepare_external on the single-sample miptree to do any
+          * needed resolves prior to handing it off to the window system.
+          * This is needed in the case that rb->singlesample_mt is Y-tiled
+          * with CCS_E enabled but without I915_FORMAT_MOD_Y_TILED_CCS_E.  In
+          * this case, the MSAA resolve above will write compressed data into
+          * rb->singlesample_mt.
+          *
+          * TODO: Some day, if we decide to care about the tiny performance
+          * hit we're taking by doing the MSAA resolve and then a CCS resolve,
+          * we could detect this case and just allocate the single-sampled
+          * miptree without aux.  However, that would be a lot of plumbing and
+          * this is a rather exotic case so it's not really worth it.
+          */
+         intel_miptree_prepare_external(brw, rb->singlesample_mt);
       }
    }
 }
@@ -1417,6 +1543,11 @@ intel_prepare_render(struct brw_context *brw)
     */
    if (_mesa_is_front_buffer_drawing(ctx->DrawBuffer))
       brw->front_buffer_dirty = true;
+
+   if (brw->is_shared_buffer_bound) {
+      /* Subsequent rendering will probably dirty the shared buffer. */
+      brw->is_shared_buffer_dirty = true;
+   }
 }
 
 /**
@@ -1650,12 +1781,25 @@ intel_update_image_buffer(struct brw_context *intel,
    else
       last_mt = rb->singlesample_mt;
 
-   if (last_mt && last_mt->bo == buffer->bo)
+   if (last_mt && last_mt->bo == buffer->bo) {
+      if (buffer_type == __DRI_IMAGE_BUFFER_SHARED) {
+         intel_miptree_make_shareable(intel, last_mt);
+      }
       return;
+   }
+
+   /* Only allow internal compression if samples == 0.  For multisampled
+    * window system buffers, the only thing the single-sampled buffer is used
+    * for is as a resolve target.  If we do any compression beyond what is
+    * supported by the window system, we will just have to resolve so it's
+    * probably better to just not bother.
+    */
+   const bool allow_internal_aux = (num_samples == 0);
 
    struct intel_mipmap_tree *mt =
       intel_miptree_create_for_dri_image(intel, buffer, GL_TEXTURE_2D,
-                                         intel_rb_format(rb), true);
+                                         intel_rb_format(rb),
+                                         allow_internal_aux);
    if (!mt)
       return;
 
@@ -1671,6 +1815,35 @@ intel_update_image_buffer(struct brw_context *intel,
        rb->Base.Base.NumSamples > 1) {
       intel_renderbuffer_upsample(intel, rb);
    }
+
+   if (buffer_type == __DRI_IMAGE_BUFFER_SHARED) {
+      /* The compositor and the application may access this image
+       * concurrently. The display hardware may even scanout the image while
+       * the GPU is rendering to it.  Aux surfaces cause difficulty with
+       * concurrent access, so permanently disable aux for this miptree.
+       *
+       * Perhaps we could improve overall application performance by
+       * re-enabling the aux surface when EGL_RENDER_BUFFER transitions to
+       * EGL_BACK_BUFFER, then disabling it again when EGL_RENDER_BUFFER
+       * returns to EGL_SINGLE_BUFFER. I expect the wins and losses with this
+       * approach to be highly dependent on the application's GL usage.
+       *
+       * I [chadv] expect clever disabling/reenabling to be counterproductive
+       * in the use cases I care about: applications that render nearly
+       * realtime handwriting to the surface while possibly undergoing
+       * simultaneous scanout as a display plane. The app requires low
+       * render latency. Even though the app spends most of its time in
+       * shared-buffer mode, it also frequently transitions between
+       * shared-buffer (EGL_SINGLE_BUFFER) and double-buffer (EGL_BACK_BUFFER)
+       * mode.  Visual stutter during the transitions should be avoided.
+       *
+       * In this case, I [chadv] believe reducing the GPU workload at
+       * shared-buffer/double-buffer transitions would offer a smoother app
+       * experience than any savings due to aux compression. But I've
+       * collected no data to prove my theory.
+       */
+      intel_miptree_make_shareable(intel, mt);
+   }
 }
 
 static void
@@ -1731,4 +1904,19 @@ intel_update_image_buffers(struct brw_context *brw, __DRIdrawable *drawable)
                                 images.back,
                                 __DRI_IMAGE_BUFFER_BACK);
    }
+
+   if (images.image_mask & __DRI_IMAGE_BUFFER_SHARED) {
+      assert(images.image_mask == __DRI_IMAGE_BUFFER_SHARED);
+      drawable->w = images.back->width;
+      drawable->h = images.back->height;
+      intel_update_image_buffer(brw,
+                                drawable,
+                                back_rb,
+                                images.back,
+                                __DRI_IMAGE_BUFFER_SHARED);
+      brw->is_shared_buffer_bound = true;
+   } else {
+      brw->is_shared_buffer_bound = false;
+      brw->is_shared_buffer_dirty = false;
+   }
 }
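The __DRI_IMAGE_BUFFER_SHARED handling added above is the driver half of mutable-render-buffer (shared buffer) surfaces; on the client side, the mode is typically entered through EGL_KHR_mutable_render_buffer, roughly as sketched below (illustrative; assumes the surface was created with EGL_MUTABLE_RENDER_BUFFER_BIT_KHR in EGL_SURFACE_TYPE):

#include <EGL/egl.h>
#include <EGL/eglext.h>

/* Switch a window surface from ordinary double-buffering into
 * single-buffered (shared) mode.  Per the extension, the change takes
 * effect at the next eglSwapBuffers(); afterwards a glFlush() is enough
 * to push rendering to the display, which is what
 * brw_display_shared_buffer() above services.
 */
static void
enter_shared_buffer_mode(EGLDisplay dpy, EGLSurface surf)
{
   eglSurfaceAttrib(dpy, surf, EGL_RENDER_BUFFER, EGL_SINGLE_BUFFER);
}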