util: use C99 declaration in the for-loop hash_table_foreach() macro

[mesa.git] / src / mesa / drivers / dri / i965 / genX_state_upload.c
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c

index 6178bfa3f88ae730c689eef33dd6eee47802a919..740cb0c4d2edc41842aa37866795ff7f78ddab7c 100644 (file)
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -59,7 +59,7 @@
  UNUSED static void *
  emit_dwords(struct brw_context *brw, unsigned n)
  {
-   intel_batchbuffer_begin(brw, n, RENDER_RING);
+   intel_batchbuffer_begin(brw, n);
     uint32_t *map = brw->batch.map_next;
     brw->batch.map_next += n;
     intel_batchbuffer_advance(brw);
@@ -217,7 +217,7 @@ genX(upload_polygon_stipple)(struct brw_context *brw)
         * to a FBO (i.e. any named frame buffer object), we *don't*
         * need to invert - we already match the layout.
         */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+      if (ctx->DrawBuffer->FlipY) {
           for (unsigned i = 0; i < 32; i++)
              poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
        } else {
@@ -257,7 +257,7 @@ genX(upload_polygon_stipple_offset)(struct brw_context *brw)
         * to a user-created FBO then our native pixel coordinate system
         * works just fine, and there's no window system to worry about.
         */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+      if (ctx->DrawBuffer->FlipY) {
           poly.PolygonStippleYOffset =
              (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
        }
@@ -480,6 +480,65 @@ upload_format_size(uint32_t upload_format)
     }
  }
  
+static UNUSED uint16_t /* bo->gtt_offset >> 32 if pinned, otherwise 0 */
+pinned_bo_high_bits(struct brw_bo *bo)
+{ /* The shifted offset is narrowed to 16 bits by the return type. */
+   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
+}
+
+/* The VF cache designers apparently cut corners, and made the cache key's
+ * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
+ * of the address.  If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.  (These collisions can happen within a single batch.)
+ *
+ * In the soft-pin world, we'd like to assign addresses up front, and never
+ * move buffers.  So, we need to do a VF cache invalidate if the buffer for
+ * a particular VB slot has different [47:32] address bits than the last one.
+ *
+ * In the relocation world, we have no idea what the addresses will be, so
+ * we can't apply this workaround.  Instead, we tell the kernel to move it
+ * to the low 4GB regardless.
+ */
+static void
+vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
+{
+#if GEN_GEN >= 8 /* the body is compiled out when GEN_GEN < 8 */
+   bool need_invalidate = false; /* set when any slot's high bits changed */
+   unsigned i;
+
+   for (i = 0; i < brw->vb.nr_buffers; i++) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) { /* slot changed */
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   /* Don't bother with draw parameter buffers - those are generated by
+    * the driver so we can select a consistent memory zone.
+    */
+
+   if (need_invalidate) { /* a single flush covers every changed slot */
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+   }
+#endif
+}
+
+static void /* index-buffer case: VF cache invalidate on a high-bits change */
+vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
+{
+#if GEN_GEN >= 8 /* the body is compiled out when GEN_GEN < 8 */
+   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
+
+   if (high_bits != brw->ib.last_bo_high_bits) { /* differs from recorded */
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+      brw->ib.last_bo_high_bits = high_bits; /* record for the next compare */
+   }
+#endif
+}
+
  static void
  genX(emit_vertices)(struct brw_context *brw)
  {
@@ -594,6 +653,8 @@ genX(emit_vertices)(struct brw_context *brw)
     const unsigned nr_buffers = brw->vb.nr_buffers +
        uses_draw_params + uses_derived_draw_params;
  
+   vf_invalidate_for_vb_48bit_transitions(brw);
+
     if (nr_buffers) {
        assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
  
@@ -872,6 +933,7 @@ static const struct brw_tracked_state genX(vertices) = {
        .mesa = _NEW_POLYGON,
        .brw = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
+             BRW_NEW_VERTEX_PROGRAM |
               BRW_NEW_VERTICES |
               BRW_NEW_VS_PROG_DATA,
     },
@@ -886,6 +948,8 @@ genX(emit_index_buffer)(struct brw_context *brw)
     if (index_buffer == NULL)
        return;
  
+   vf_invalidate_for_ib_48bit_transition(brw);
+
     brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
  #if GEN_GEN < 8 && !GEN_IS_HASWELL
        ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
@@ -1336,7 +1400,8 @@ genX(upload_clip_state)(struct brw_context *brw)
        clip.ScreenSpaceViewportYMax = 1;
  
        clip.ViewportXYClipTestEnable = true;
-      clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
+                                       ctx->Transform.DepthClampFar);
  
        /* _NEW_TRANSFORM */
        if (GEN_GEN == 5 || GEN_IS_G4X) {
@@ -1405,7 +1470,7 @@ genX(upload_clip_state)(struct brw_context *brw)
  #endif
  
  #if GEN_GEN == 7
-      clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
+      clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;
  
        if (ctx->Polygon.CullFlag) {
           switch (ctx->Polygon.CullFaceMode) {
@@ -1430,7 +1495,8 @@ genX(upload_clip_state)(struct brw_context *brw)
        clip.UserClipDistanceCullTestEnableBitmask =
           brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
  
-      clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
+      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
+                                       ctx->Transform.DepthClampFar);
  #endif
  
        /* _NEW_LIGHT */
@@ -1520,7 +1586,7 @@ genX(upload_sf)(struct brw_context *brw)
  
  #if GEN_GEN <= 7
     /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   bool flip_y = ctx->DrawBuffer->FlipY;
     UNUSED const bool multisampled_fbo =
        _mesa_geometric_samples(ctx->DrawBuffer) > 1;
  #endif
@@ -1572,7 +1638,7 @@ genX(upload_sf)(struct brw_context *brw)
  
  #if GEN_GEN <= 7
        /* _NEW_POLYGON */
-      sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
+      sf.FrontWinding = brw->polygon_front_bit != flip_y;
  #if GEN_GEN >= 6
        sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
        sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
@@ -1710,7 +1776,7 @@ genX(upload_sf)(struct brw_context *brw)
         * Window coordinates in an FBO are inverted, which means point
         * sprite origin must be inverted, too.
         */
-      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
           sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
        } else {
           sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
@@ -1816,47 +1882,100 @@ genX(upload_wm)(struct brw_context *brw)
           /* Pointer to the WM constant buffer.  Covered by the set of
            * state flags from gen6_upload_wm_push_constants.
            */
-         wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
-         wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
        }
     }
  #endif
  
  #if GEN_GEN >= 6
     brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
-      wm.LineAntialiasingRegionWidth = _10pixels;
-      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
-
-      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
-      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
  #else
     ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
     brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
-      if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) {
-         /* These two fields should be the same pre-gen6, which is why we
-          * only have one hardware field to program for both dispatch
-          * widths.
-          */
-         assert(wm_prog_data->base.dispatch_grf_start_reg ==
-                wm_prog_data->dispatch_grf_start_reg_2);
+#endif
+
+#if GEN_GEN <= 6
+      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+      wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+#endif
+
+#if GEN_GEN == 4
+      /* On gen4, we only have one shader kernel */
+      if (brw_wm_state_has_ksp(wm, 0)) {
+         assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
+         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
        }
+#elif GEN_GEN == 5
+      /* On gen5, we have multiple shader kernels but only one GRF start
+       * register for all kernels
+       */
+      wm.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+      wm.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+      wm.KernelStartPointer2 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
  
-      if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16)
-         wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0;
+      wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+      wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
+      wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
  
-      if (stage_state->sampler_count)
-         wm.SamplerStatePointer =
-            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
-#if GEN_GEN == 5
-      if (wm_prog_data->prog_offset_2)
-         wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         wm_prog_data->base.dispatch_grf_start_reg;
+
+      /* Dispatch GRF Start should be the same for all shaders on gen5 */
+      if (brw_wm_state_has_ksp(wm, 1)) {
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
+      }
+      if (brw_wm_state_has_ksp(wm, 2)) {
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
+      }
+#elif GEN_GEN == 6
+      /* On gen6, we have multiple shader kernels and we no longer specify a
+       * register count for each one.
+       */
+      wm.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+      wm.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+      wm.KernelStartPointer2 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+      wm.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
+      wm.DispatchGRFStartRegisterForConstantSetupData2 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
  #endif
  
-      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+#if GEN_GEN <= 5
        wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
        /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
        wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
+      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+      wm.SetupURBEntryReadOffset = 0;
        wm.EarlyDepthTestEnable = true;
+#endif
+
+#if GEN_GEN >= 6
+      wm.LineAntialiasingRegionWidth = _10pixels;
+      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+
+      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
+#else
+      if (stage_state->sampler_count)
+         wm.SamplerStatePointer =
+            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
+
        wm.LineAntialiasingRegionWidth = _05pixels;
        wm.LineEndCapAntialiasingRegionWidth = _10pixels;
  
@@ -1891,21 +2010,6 @@ genX(upload_wm)(struct brw_context *brw)
        wm.BindingTableEntryCount =
           wm_prog_data->base.binding_table.size_bytes / 4;
        wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
-      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
-      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
-      wm.DispatchGRFStartRegisterForConstantSetupData0 =
-         wm_prog_data->base.dispatch_grf_start_reg;
-      if (GEN_GEN == 6 ||
-          wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) {
-         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
-      }
-
-#if GEN_GEN >= 5
-      if (GEN_GEN == 6 || wm_prog_data->prog_offset_2) {
-         wm.KernelStartPointer2 =
-            KSP(brw, stage_state->prog_offset + wm_prog_data->prog_offset_2);
-      }
-#endif
  
  #if GEN_GEN == 6
        wm.DualSourceBlendEnable =
@@ -1930,9 +2034,6 @@ genX(upload_wm)(struct brw_context *brw)
           wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
        else
           wm.PositionXYOffsetSelect = POSOFFSET_NONE;
-
-      wm.DispatchGRFStartRegisterForConstantSetupData2 =
-         wm_prog_data->dispatch_grf_start_reg_2;
  #endif
  
        if (wm_prog_data->base.total_scratch) {
@@ -2067,7 +2168,13 @@ static const struct brw_tracked_state genX(wm_state) = {
     pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
     pkt.SamplerCount       =                                               \
        DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
+   /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to     \
+    * disable prefetching of binding tables in A0 and B0 steppings.       \
+    * TODO: Revisit this WA on C0 stepping.                               \
+    */                                                                    \
     pkt.BindingTableEntryCount =                                           \
+      GEN_GEN == 11 ?                                                     \
+      0 :                                                                 \
        stage_prog_data->binding_table.size_bytes / 4;                      \
     pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                            \
@@ -2117,8 +2224,8 @@ genX(upload_vs_state)(struct brw_context *brw)
     brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
        if (stage_state->push_const_size != 0) {
           cvs.Buffer0Valid = true;
-         cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
-         cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
        }
     }
  #endif
@@ -2234,8 +2341,14 @@ genX(upload_cc_viewport)(struct brw_context *brw)
     for (unsigned i = 0; i < viewport_count; i++) {
        /* _NEW_VIEWPORT | _NEW_TRANSFORM */
        const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
-      if (ctx->Transform.DepthClamp) {
+      if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) {
+         ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
+         ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
+      } else if (ctx->Transform.DepthClampNear) {
           ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
+         ccv.MaximumDepth = 0.0;
+      } else if (ctx->Transform.DepthClampFar) {
+         ccv.MinimumDepth = 0.0;
           ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
        } else {
           ccv.MinimumDepth = 0.0;
@@ -2275,7 +2388,7 @@ const struct brw_tracked_state genX(cc_vp) = {
  
  static void
  set_scissor_bits(const struct gl_context *ctx, int i,
-                 bool render_to_fbo, unsigned fb_width, unsigned fb_height,
+                 bool flip_y, unsigned fb_width, unsigned fb_height,
                   struct GENX(SCISSOR_RECT) *sc)
  {
     int bbox[4];
@@ -2297,7 +2410,7 @@ set_scissor_bits(const struct gl_context *ctx, int i,
        sc->ScissorRectangleXMax = 0;
        sc->ScissorRectangleYMin = 1;
        sc->ScissorRectangleYMax = 0;
-   } else if (render_to_fbo) {
+   } else if (!flip_y) {
        /* texmemory: Y=0=bottom */
        sc->ScissorRectangleXMin = bbox[0];
        sc->ScissorRectangleXMax = bbox[1] - 1;
@@ -2317,7 +2430,7 @@ static void
  genX(upload_scissor_state)(struct brw_context *brw)
  {
     struct gl_context *ctx = &brw->ctx;
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
     struct GENX(SCISSOR_RECT) scissor;
     uint32_t scissor_state_offset;
     const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
@@ -2341,7 +2454,7 @@ genX(upload_scissor_state)(struct brw_context *brw)
      * inclusive but max is exclusive.
      */
     for (unsigned i = 0; i < viewport_count; i++) {
-      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
+      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
        GENX(SCISSOR_RECT_pack)(
           NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
     }
@@ -2410,6 +2523,17 @@ brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
      */
     const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
  
+   /* Workaround: prevent gpu hangs on SandyBridge
+    * by disabling guardband clipping for odd dimensions.
+    */
+   if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
+      *xmin = -1.0f;
+      *xmax =  1.0f;
+      *ymin = -1.0f;
+      *ymax =  1.0f;
+      return;
+   }
+
     if (m00 != 0 && m11 != 0) {
        /* First, we compute the screen-space render area */
        const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
@@ -2456,7 +2580,7 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
     const unsigned viewport_count = brw->clip.viewport_count;
  
     /* _NEW_BUFFERS */
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
     const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
     const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
  
@@ -2480,12 +2604,12 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
  #endif
  
     /* _NEW_BUFFERS */
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0;
-   } else {
+   if (flip_y) {
        y_scale = -1.0;
        y_bias = (float)fb_height;
+   } else {
+      y_scale = 1.0;
+      y_bias = 0;
     }
  
     for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
@@ -2513,7 +2637,7 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
        clv.YMaxClipGuardband = gb_ymax;
  
  #if GEN_GEN < 6
-      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
+      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
                         &sfv.ScissorRectangle);
  #elif GEN_GEN >= 8
        /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
@@ -2530,16 +2654,16 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
        const float viewport_Ymax =
           MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
  
-      if (render_to_fbo) {
+      if (flip_y) {
           sfv.XMinViewPort = viewport_Xmin;
           sfv.XMaxViewPort = viewport_Xmax - 1;
-         sfv.YMinViewPort = viewport_Ymin;
-         sfv.YMaxViewPort = viewport_Ymax - 1;
+         sfv.YMinViewPort = fb_height - viewport_Ymax;
+         sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
        } else {
           sfv.XMinViewPort = viewport_Xmin;
           sfv.XMaxViewPort = viewport_Xmax - 1;
-         sfv.YMinViewPort = fb_height - viewport_Ymax;
-         sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
+         sfv.YMinViewPort = viewport_Ymin;
+         sfv.YMaxViewPort = viewport_Ymax - 1;
        }
  #endif
  
@@ -2609,8 +2733,8 @@ genX(upload_gs_state)(struct brw_context *brw)
     brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
        if (active && stage_state->push_const_size != 0) {
           cgs.Buffer0Valid = true;
-         cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
-         cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
        }
     }
  #endif
@@ -2691,7 +2815,7 @@ genX(upload_gs_state)(struct brw_context *brw)
  #if GEN_GEN < 7
           gs.SOStatisticsEnable = true;
           if (gs_prog->info.has_transform_feedback_varyings)
-            gs.SVBIPayloadEnable = true;
+            gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);
  
           /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
            * was previously done for gen6.
@@ -3470,14 +3594,14 @@ genX(upload_sbe)(struct brw_context *brw)
        sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
  
        /* _NEW_BUFFERS */
-      bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+      bool flip_y = ctx->DrawBuffer->FlipY;
  
        /* _NEW_POINT
         *
         * Window coordinates in an FBO are inverted, which means point
         * sprite origin must be inverted.
         */
-      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
           sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
        else
           sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
@@ -3672,19 +3796,20 @@ genX(upload_3dstate_so_buffers)(struct brw_context *brw)
     for (int i = 0; i < 4; i++) {
        struct intel_buffer_object *bufferobj =
           intel_buffer_object(xfb_obj->Buffers[i]);
+      uint32_t start = xfb_obj->Offset[i];
+      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
+      uint32_t const size = end - start;
  
-      if (!bufferobj) {
+      if (!bufferobj || !size) {
           brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
              sob.SOBufferIndex = i;
           }
           continue;
        }
  
-      uint32_t start = xfb_obj->Offset[i];
        assert(start % 4 == 0);
-      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
        struct brw_bo *bo =
-         intel_bufferobj_buffer(brw, bufferobj, start, end - start, true);
+         intel_bufferobj_buffer(brw, bufferobj, start, size, true);
        assert(end <= bo->size);
  
        brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
@@ -3856,7 +3981,13 @@ genX(upload_ps)(struct brw_context *brw)
           DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
  
        /* BRW_NEW_FS_PROG_DATA */
-      ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
+      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to disable
+       * prefetching of binding tables in A0 and B0 steppings.
+       * TODO: Revisit this workaround on C0 stepping.
+       */
+      ps.BindingTableEntryCount = GEN_GEN == 11 ?
+                                  0 :
+                                  prog_data->base.binding_table.size_bytes / 4;
  
        if (prog_data->base.use_alt_mode)
           ps.FloatingPointMode = Alternate;
@@ -3932,14 +4063,37 @@ genX(upload_ps)(struct brw_context *brw)
  
        ps._8PixelDispatchEnable = prog_data->dispatch_8;
        ps._16PixelDispatchEnable = prog_data->dispatch_16;
+      ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+       *
+       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+       *
+       * Since 16x MSAA is first introduced on SKL, we don't need to apply
+       * the workaround on any older hardware.
+       *
+       * BRW_NEW_NUM_SAMPLES
+       */
+      if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+          brw->num_samples == 16) {
+         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+         ps._32PixelDispatchEnable = false;
+      }
+
        ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         prog_data->base.dispatch_grf_start_reg;
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+      ps.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
        ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         prog_data->dispatch_grf_start_reg_2;
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
  
-      ps.KernelStartPointer0 = stage_state->prog_offset;
+      ps.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+      ps.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 1);
        ps.KernelStartPointer2 = stage_state->prog_offset +
-         prog_data->prog_offset_2;
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 2);
  
        if (prog_data->base.total_scratch) {
           ps.ScratchSpaceBasePointer =
@@ -3957,7 +4111,8 @@ static const struct brw_tracked_state genX(ps_state) = {
                              : 0),
        .brw   = BRW_NEW_BATCH |
                 BRW_NEW_BLORP |
-               BRW_NEW_FS_PROG_DATA,
+               BRW_NEW_FS_PROG_DATA |
+               (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
     },
     .emit = genX(upload_ps),
  };
@@ -4375,7 +4530,7 @@ genX(upload_raster)(struct brw_context *brw)
     const struct gl_context *ctx = &brw->ctx;
  
     /* _NEW_BUFFERS */
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
  
     /* _NEW_POLYGON */
     const struct gl_polygon_attrib *polygon = &ctx->Polygon;
@@ -4384,7 +4539,7 @@ genX(upload_raster)(struct brw_context *brw)
     const struct gl_point_attrib *point = &ctx->Point;
  
     brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
-      if (brw->polygon_front_bit == render_to_fbo)
+      if (brw->polygon_front_bit != flip_y)
           raster.FrontWinding = CounterClockwise;
  
        if (polygon->CullFlag) {
@@ -4459,14 +4614,19 @@ genX(upload_raster)(struct brw_context *brw)
        raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
  
        /* _NEW_TRANSFORM */
-      if (!ctx->Transform.DepthClamp) {
+#if GEN_GEN < 9
+      if (!(ctx->Transform.DepthClampNear &&
+            ctx->Transform.DepthClampFar))
+         raster.ViewportZClipTestEnable = true;
+#endif
+
  #if GEN_GEN >= 9
-         raster.ViewportZFarClipTestEnable = true;
+      if (!ctx->Transform.DepthClampNear)
           raster.ViewportZNearClipTestEnable = true;
-#else
-         raster.ViewportZClipTestEnable = true;
+
+      if (!ctx->Transform.DepthClampFar)
+         raster.ViewportZFarClipTestEnable = true;
  #endif
-      }
  
        /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
  #if GEN_GEN >= 9