i965: Convert SF_STATE to genxml.
[mesa.git] / src / mesa / drivers / dri / i965 / genX_state_upload.c
index 1041126d37f30e085db0a2625b70677c19ed003c..a5a9d51bde4644892aa1b5e2c6539576b6129ef7 100644 (file)
@@ -154,6 +154,29 @@ vertex_bo(struct brw_bo *bo, uint32_t offset)
    };
 }
 
+#if GEN_GEN == 4
+static inline struct brw_address
+KSP(struct brw_context *brw, uint32_t offset)
+{
+   return instruction_bo(brw->cache.bo, offset);
+}
+
+static inline struct brw_address
+KSP_ro(struct brw_context *brw, uint32_t offset)
+{
+   return instruction_ro_bo(brw->cache.bo, offset);
+}
+#else
+static inline uint32_t
+KSP(struct brw_context *brw, uint32_t offset)
+{
+   return offset;
+}
+
+#define KSP_ro KSP
+
+#endif
+
 #include "genxml/genX_pack.h"
 
 #define _brw_cmd_length(cmd) cmd ## _length
@@ -344,7 +367,9 @@ genX(emit_vertex_buffer_state)(struct brw_context *brw,
 #endif
 #endif
 
-#if GEN_GEN == 9
+#if GEN_GEN == 10
+      .VertexBufferMOCS = CNL_MOCS_WB,
+#elif GEN_GEN == 9
       .VertexBufferMOCS = SKL_MOCS_WB,
 #elif GEN_GEN == 8
       .VertexBufferMOCS = BDW_MOCS_WB,
@@ -1316,7 +1341,7 @@ genX(upload_clip_state)(struct brw_context *brw)
          clip.ClipMode = CLIPMODE_NORMAL;
       }
 
-      clip.ClipEnable = brw->primitive != _3DPRIM_RECTLIST;
+      clip.ClipEnable = true;
 
       /* _NEW_POLYGON,
        * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
@@ -1355,7 +1380,6 @@ static const struct brw_tracked_state genX(clip_state) = {
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN >= 6
 static void
 genX(upload_sf)(struct brw_context *brw)
 {
@@ -1365,11 +1389,48 @@ genX(upload_sf)(struct brw_context *brw)
 #if GEN_GEN <= 7
    /* _NEW_BUFFERS */
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
+   UNUSED const bool multisampled_fbo =
+      _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 #endif
 
+#if GEN_GEN < 6
+   const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
+
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+
+   brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
+      sf.KernelStartPointer = KSP_ro(brw, brw->sf.prog_offset);
+      sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+      sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
+      sf.DispatchGRFStartRegisterForURBData = 3;
+      sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
+      sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
+      sf.NumberofURBEntries = brw->urb.nr_sf_entries;
+      sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
+
+      /* STATE_PREFETCH command description describes this state as being
+       * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
+       * domain.
+       */
+      sf.SetupViewportStateOffset =
+         instruction_ro_bo(brw->batch.bo, brw->sf.vp_offset);
+
+      sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+
+      /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
+      /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
+
+      sf.MaximumNumberofThreads =
+         MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
+
+      sf.SpritePointEnable = ctx->Point.PointSprite;
+
+      sf.DestinationOriginHorizontalBias = 0.5;
+      sf.DestinationOriginVerticalBias = 0.5;
+#else
    brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
       sf.StatisticsEnable = true;
+#endif
       sf.ViewportTransformEnable = true;
 
 #if GEN_GEN == 7
@@ -1380,6 +1441,7 @@ genX(upload_sf)(struct brw_context *brw)
 #if GEN_GEN <= 7
       /* _NEW_POLYGON */
       sf.FrontWinding = ctx->Polygon._FrontBit == render_to_fbo;
+#if GEN_GEN >= 6
       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
       sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
@@ -1412,6 +1474,14 @@ genX(upload_sf)(struct brw_context *brw)
             unreachable("not reached");
       }
 
+      if (multisampled_fbo && ctx->Multisample.Enabled)
+         sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+
+      sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
+      sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
+      sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
+#endif
+
       sf.ScissorRectangleEnable = true;
 
       if (ctx->Polygon.CullFlag) {
@@ -1436,12 +1506,6 @@ genX(upload_sf)(struct brw_context *brw)
       sf.LineStippleEnable = ctx->Line.StippleFlag;
 #endif
 
-      if (multisampled_fbo && ctx->Multisample.Enabled)
-         sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
-
-      sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
-      sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
-      sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
 #endif
 
       /* _NEW_LINE */
@@ -1477,7 +1541,9 @@ genX(upload_sf)(struct brw_context *brw)
          sf.SmoothPointEnable = true;
 #endif
 
+#if GEN_IS_G4X || GEN_GEN >= 5
       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
+#endif
 
       /* _NEW_LIGHT */
       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
@@ -1527,14 +1593,21 @@ static const struct brw_tracked_state genX(sf_state) = {
    .dirty = {
       .mesa  = _NEW_LIGHT |
                _NEW_LINE |
-               _NEW_MULTISAMPLE |
                _NEW_POINT |
                _NEW_PROGRAM |
+               (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
                (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0),
       .brw   = BRW_NEW_BLORP |
-               BRW_NEW_CONTEXT |
                BRW_NEW_VUE_MAP_GEOM_OUT |
-               (GEN_GEN <= 7 ? BRW_NEW_GS_PROG_DATA |
+               (GEN_GEN <= 5 ? BRW_NEW_BATCH |
+                               BRW_NEW_PROGRAM_CACHE |
+                               BRW_NEW_SF_PROG_DATA |
+                               BRW_NEW_SF_VP |
+                               BRW_NEW_URB_FENCE
+                             : 0) |
+               (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
+               (GEN_GEN >= 6 && GEN_GEN <= 7 ?
+                               BRW_NEW_GS_PROG_DATA |
                                BRW_NEW_PRIMITIVE |
                                BRW_NEW_TES_PROG_DATA
                              : 0) |
@@ -1544,7 +1617,6 @@ static const struct brw_tracked_state genX(sf_state) = {
    },
    .emit = genX(upload_sf),
 };
-#endif
 
 /* ---------------------------------------------------------------------- */
 
@@ -1729,20 +1801,6 @@ static const struct brw_tracked_state genX(wm_state) = {
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN == 4
-static inline struct brw_address
-KSP(struct brw_context *brw, uint32_t offset)
-{
-   return instruction_bo(brw->cache.bo, offset);
-}
-#else
-static inline uint32_t
-KSP(struct brw_context *brw, uint32_t offset)
-{
-   return offset;
-}
-#endif
-
 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
    pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
    pkt.SamplerCount       =                                               \
@@ -1952,7 +2010,99 @@ const struct brw_tracked_state genX(cc_vp) = {
 
 /* ---------------------------------------------------------------------- */
 
+static inline void
+set_scissor_bits(const struct gl_context *ctx, int i,
+                 bool render_to_fbo, unsigned fb_width, unsigned fb_height,
+                 struct GENX(SCISSOR_RECT) *sc)
+{
+   int bbox[4];
+
+   bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
+   bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
+   bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
+   bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
+   _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
+
+   if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
+      /* If the scissor was out of bounds and got clamped to 0 width/height
+       * at the bounds, the subtraction of 1 from maximums could produce a
+       * negative number and thus not clip anything.  Instead, just provide
+       * a min > max scissor inside the bounds, which produces the expected
+       * no rendering.
+       */
+      sc->ScissorRectangleXMin = 1;
+      sc->ScissorRectangleXMax = 0;
+      sc->ScissorRectangleYMin = 1;
+      sc->ScissorRectangleYMax = 0;
+   } else if (render_to_fbo) {
+      /* texmemory: Y=0=bottom */
+      sc->ScissorRectangleXMin = bbox[0];
+      sc->ScissorRectangleXMax = bbox[1] - 1;
+      sc->ScissorRectangleYMin = bbox[2];
+      sc->ScissorRectangleYMax = bbox[3] - 1;
+   } else {
+      /* memory: Y=0=top */
+      sc->ScissorRectangleXMin = bbox[0];
+      sc->ScissorRectangleXMax = bbox[1] - 1;
+      sc->ScissorRectangleYMin = fb_height - bbox[3];
+      sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
+   }
+}
+
 #if GEN_GEN >= 6
+static void
+genX(upload_scissor_state)(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->ctx;
+   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   struct GENX(SCISSOR_RECT) scissor;
+   uint32_t scissor_state_offset;
+   const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
+   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
+   uint32_t *scissor_map;
+
+   /* BRW_NEW_VIEWPORT_COUNT */
+   const unsigned viewport_count = brw->clip.viewport_count;
+
+   scissor_map = brw_state_batch(
+      brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
+      32, &scissor_state_offset);
+
+   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
+
+   /* The scissor only needs to handle the intersection of drawable and
+    * scissor rect.  Clipping to the boundaries of static shared buffers
+    * for front/back/depth is covered by looping over cliprects in brw_draw.c.
+    *
+    * Note that the hardware's coordinates are inclusive, while Mesa's min is
+    * inclusive but max is exclusive.
+    */
+   for (unsigned i = 0; i < viewport_count; i++) {
+      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
+      GENX(SCISSOR_RECT_pack)(
+         NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
+   }
+
+   brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
+      ptr.ScissorRectPointer = scissor_state_offset;
+   }
+}
+
+static const struct brw_tracked_state genX(scissor_state) = {
+   .dirty = {
+      .mesa = _NEW_BUFFERS |
+              _NEW_SCISSOR |
+              _NEW_VIEWPORT,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_BLORP |
+             BRW_NEW_VIEWPORT_COUNT,
+   },
+   .emit = genX(upload_scissor_state),
+};
+#endif
+
+/* ---------------------------------------------------------------------- */
+
 static void
 brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
                              float m00, float m11, float m30, float m31,
@@ -2051,16 +2201,19 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
 #define clv sfv
    struct GENX(SF_CLIP_VIEWPORT) sfv;
    uint32_t sf_clip_vp_offset;
-   uint32_t *sf_clip_map = brw_state_batch(brw, 16 * 4 * viewport_count,
-                                           64, &sf_clip_vp_offset);
+   uint32_t *sf_clip_map =
+      brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
+                      64, &sf_clip_vp_offset);
 #else
    struct GENX(SF_VIEWPORT) sfv;
    struct GENX(CLIP_VIEWPORT) clv;
    uint32_t sf_vp_offset, clip_vp_offset;
-   uint32_t *sf_map = brw_state_batch(brw, 8 * 4 * viewport_count,
-                                      32, &sf_vp_offset);
-   uint32_t *clip_map = brw_state_batch(brw, 4 * 4 * viewport_count,
-                                        32, &clip_vp_offset);
+   uint32_t *sf_map =
+      brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
+                      32, &sf_vp_offset);
+   uint32_t *clip_map =
+      brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
+                      32, &clip_vp_offset);
 #endif
 
    /* _NEW_BUFFERS */
@@ -2096,7 +2249,10 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
       clv.YMinClipGuardband = gb_ymin;
       clv.YMaxClipGuardband = gb_ymax;
 
-#if GEN_GEN >= 8
+#if GEN_GEN < 6
+      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
+                       &sfv.ScissorRectangle);
+#elif GEN_GEN >= 8
       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
        * The hardware will take the intersection of the drawing rectangle,
        * scissor rectangle, and the viewport extents. We don't need to be
@@ -2122,12 +2278,12 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
 
 #if GEN_GEN >= 7
       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
-      sf_clip_map += 16;
+      sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
 #else
       GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
       GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
-      sf_map += 8;
-      clip_map += 4;
+      sf_map += GENX(SF_VIEWPORT_length);
+      clip_map += GENX(CLIP_VIEWPORT_length);
 #endif
    }
 
@@ -2143,6 +2299,8 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
       vp.PointertoSF_VIEWPORT = sf_vp_offset;
    }
 #else
+   brw->sf.vp_offset = sf_vp_offset;
+   brw->clip.vp_offset = clip_vp_offset;
    brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
 #endif
 }
@@ -2150,14 +2308,14 @@ genX(upload_sf_clip_viewport)(struct brw_context *brw)
 static const struct brw_tracked_state genX(sf_clip_viewport) = {
    .dirty = {
       .mesa = _NEW_BUFFERS |
-              _NEW_VIEWPORT,
+              _NEW_VIEWPORT |
+              (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
              BRW_NEW_VIEWPORT_COUNT,
    },
    .emit = genX(upload_sf_clip_viewport),
 };
-#endif
 
 /* ---------------------------------------------------------------------- */
 
@@ -2335,6 +2493,20 @@ static const struct brw_tracked_state genX(gs_state) = {
 
 /* ---------------------------------------------------------------------- */
 
+UNUSED static GLenum
+fix_dual_blend_alpha_to_one(GLenum function)
+{
+   switch (function) {
+   case GL_SRC1_ALPHA:
+      return GL_ONE;
+
+   case GL_ONE_MINUS_SRC1_ALPHA:
+      return GL_ZERO;
+   }
+
+   return function;
+}
+
 #define blend_factor(x) brw_translate_blend_factor(x)
 #define blend_eqn(x) brw_translate_blend_equation(x)
 
@@ -2462,6 +2634,19 @@ genX(upload_blend_state)(struct brw_context *brw)
                dstA = brw_fix_xRGB_alpha(dstA);
             }
 
+            /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
+             * "If Dual Source Blending is enabled, this bit must be disabled."
+             *
+             * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
+             * and leave it enabled anyway.
+             */
+            if (ctx->Color.Blend[i]._UsesDualSrc && blend.AlphaToOneEnable) {
+               srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
+               srcA = fix_dual_blend_alpha_to_one(srcA);
+               dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
+               dstA = fix_dual_blend_alpha_to_one(dstA);
+            }
+
             entry.ColorBufferBlendEnable = true;
             entry.DestinationBlendFactor = blend_factor(dstRGB);
             entry.SourceBlendFactor = blend_factor(srcRGB);
@@ -2500,16 +2685,6 @@ genX(upload_blend_state)(struct brw_context *brw)
          entry.WriteDisableBlue  = !ctx->Color.ColorMask[i][2];
          entry.WriteDisableAlpha = !ctx->Color.ColorMask[i][3];
 
-         /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
-          * "If Dual Source Blending is enabled, this bit must be disabled."
-          */
-         WARN_ONCE(ctx->Color.Blend[i]._UsesDualSrc &&
-                   _mesa_is_multisample_enabled(ctx) &&
-                   ctx->Multisample.SampleAlphaToOne,
-                   "HW workaround: disabling alpha to one with dual src "
-                   "blending\n");
-         if (ctx->Color.Blend[i]._UsesDualSrc)
-            blend.AlphaToOneEnable = false;
 #if GEN_GEN >= 8
          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
 #else
@@ -2552,92 +2727,6 @@ static const struct brw_tracked_state genX(blend_state) = {
 
 /* ---------------------------------------------------------------------- */
 
-#if GEN_GEN >= 6
-static void
-genX(upload_scissor_state)(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->ctx;
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   struct GENX(SCISSOR_RECT) scissor;
-   uint32_t scissor_state_offset;
-   const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
-   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
-   uint32_t *scissor_map;
-
-   /* BRW_NEW_VIEWPORT_COUNT */
-   const unsigned viewport_count = brw->clip.viewport_count;
-
-   scissor_map = brw_state_batch(
-      brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
-      32, &scissor_state_offset);
-
-   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
-
-   /* The scissor only needs to handle the intersection of drawable and
-    * scissor rect.  Clipping to the boundaries of static shared buffers
-    * for front/back/depth is covered by looping over cliprects in brw_draw.c.
-    *
-    * Note that the hardware's coordinates are inclusive, while Mesa's min is
-    * inclusive but max is exclusive.
-    */
-   for (unsigned i = 0; i < viewport_count; i++) {
-      int bbox[4];
-
-      bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
-      bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
-      bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
-      bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
-      _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
-
-      if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
-         /* If the scissor was out of bounds and got clamped to 0 width/height
-          * at the bounds, the subtraction of 1 from maximums could produce a
-          * negative number and thus not clip anything.  Instead, just provide
-          * a min > max scissor inside the bounds, which produces the expected
-          * no rendering.
-          */
-         scissor.ScissorRectangleXMin = 1;
-         scissor.ScissorRectangleXMax = 0;
-         scissor.ScissorRectangleYMin = 1;
-         scissor.ScissorRectangleYMax = 0;
-      } else if (render_to_fbo) {
-         /* texmemory: Y=0=bottom */
-         scissor.ScissorRectangleXMin = bbox[0];
-         scissor.ScissorRectangleXMax = bbox[1] - 1;
-         scissor.ScissorRectangleYMin = bbox[2];
-         scissor.ScissorRectangleYMax = bbox[3] - 1;
-      } else {
-         /* memory: Y=0=top */
-         scissor.ScissorRectangleXMin = bbox[0];
-         scissor.ScissorRectangleXMax = bbox[1] - 1;
-         scissor.ScissorRectangleYMin = fb_height - bbox[3];
-         scissor.ScissorRectangleYMax = fb_height - bbox[2] - 1;
-      }
-
-      GENX(SCISSOR_RECT_pack)(
-         NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
-   }
-
-   brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
-      ptr.ScissorRectPointer = scissor_state_offset;
-   }
-}
-
-static const struct brw_tracked_state genX(scissor_state) = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_SCISSOR |
-              _NEW_VIEWPORT,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP |
-             BRW_NEW_VIEWPORT_COUNT,
-   },
-   .emit = genX(upload_scissor_state),
-};
-#endif
-
-/* ---------------------------------------------------------------------- */
-
 #if GEN_GEN >= 7
 UNUSED static const uint32_t push_constant_opcodes[] = {
    [MESA_SHADER_VERTEX]                      = 21,
@@ -2660,14 +2749,12 @@ upload_constant_state(struct brw_context *brw,
       pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
       if (active) {
 #if GEN_GEN >= 8 || GEN_IS_HASWELL
-         pkt.ConstantBody.ConstantBuffer2ReadLength =
-            stage_state->push_const_size;
-         pkt.ConstantBody.PointerToConstantBuffer2 =
+         pkt.ConstantBody.ReadLength[2] = stage_state->push_const_size;
+         pkt.ConstantBody.Buffer[2] =
             render_ro_bo(brw->curbe.curbe_bo, stage_state->push_const_offset);
 #else
-         pkt.ConstantBody.ConstantBuffer0ReadLength =
-            stage_state->push_const_size;
-         pkt.ConstantBody.PointerToConstantBuffer0.offset =
+         pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
+         pkt.ConstantBody.Buffer[0].offset =
             stage_state->push_const_offset | mocs;
 #endif
       }
@@ -3064,42 +3151,16 @@ genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
     * command feels strange -- each dword pair contains a SO_DECL per stream.
     */
    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
-      int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
-      struct GENX(SO_DECL) decl = {0};
-      int varying = linked_xfb_info->Outputs[i].OutputRegister;
-      const unsigned components = linked_xfb_info->Outputs[i].NumComponents;
-      unsigned component_mask = (1 << components) - 1;
-      unsigned stream_id = linked_xfb_info->Outputs[i].StreamId;
-      unsigned decl_buffer_slot = buffer;
+      const struct gl_transform_feedback_output *output =
+         &linked_xfb_info->Outputs[i];
+      const int buffer = output->OutputBuffer;
+      const int varying = output->OutputRegister;
+      const unsigned stream_id = output->StreamId;
       assert(stream_id < MAX_VERTEX_STREAMS);
 
-      /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
-       * gl_Layer is stored in VARYING_SLOT_PSIZ.y
-       * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
-       */
-      if (varying == VARYING_SLOT_PSIZ) {
-         assert(components == 1);
-         component_mask <<= 3;
-      } else if (varying == VARYING_SLOT_LAYER) {
-         assert(components == 1);
-         component_mask <<= 1;
-      } else if (varying == VARYING_SLOT_VIEWPORT) {
-         assert(components == 1);
-         component_mask <<= 2;
-      } else {
-         component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset;
-      }
-
       buffer_mask[stream_id] |= 1 << buffer;
 
-      decl.OutputBufferSlot = decl_buffer_slot;
-      if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) {
-         decl.RegisterIndex = vue_map->varying_to_slot[VARYING_SLOT_PSIZ];
-      } else {
-         assert(vue_map->varying_to_slot[varying] >= 0);
-         decl.RegisterIndex = vue_map->varying_to_slot[varying];
-      }
-      decl.ComponentMask = component_mask;
+      assert(vue_map->varying_to_slot[varying] >= 0);
 
       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
        * array.  Instead, it simply increments DstOffset for the following
@@ -3111,31 +3172,25 @@ genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
        * program as many size = 4 holes as we can, then a final hole to
        * accommodate the final 1, 2, or 3 remaining.
        */
-      int skip_components =
-         linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer];
-
-      next_offset[buffer] += skip_components;
+      int skip_components = output->DstOffset - next_offset[buffer];
 
-      while (skip_components >= 4) {
-         struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
-         d->HoleFlag = 1;
-         d->OutputBufferSlot = decl_buffer_slot;
-         d->ComponentMask = 0xf;
+      while (skip_components > 0) {
+         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+            .HoleFlag = 1,
+            .OutputBufferSlot = output->OutputBuffer,
+            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
+         };
          skip_components -= 4;
       }
 
-      if (skip_components > 0) {
-         struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++];
-         d->HoleFlag = 1;
-         d->OutputBufferSlot = decl_buffer_slot;
-         d->ComponentMask = (1 << skip_components) - 1;
-      }
-
-      assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]);
+      next_offset[buffer] = output->DstOffset + output->NumComponents;
 
-      next_offset[buffer] += components;
-
-      so_decl[stream_id][decls[stream_id]++] = decl;
+      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+         .OutputBufferSlot = output->OutputBuffer,
+         .RegisterIndex = vue_map->varying_to_slot[varying],
+         .ComponentMask =
+            ((1 << output->NumComponents) - 1) << output->ComponentOffset,
+      };
 
       if (decls[stream_id] > max_decls)
          max_decls = decls[stream_id];
@@ -3719,7 +3774,7 @@ genX(upload_cs_state)(struct brw_context *brw)
 
       const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
       vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
-      vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;;
+      vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
       vfe.ResetGatewayTimer =
          Resettingrelativetimerandlatchingtheglobaltimestamp;
 #if GEN_GEN < 9
@@ -4035,11 +4090,15 @@ genX(upload_ps_blend)(struct brw_context *brw)
       /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
       pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
 
+      bool alpha_to_one = false;
+
       if (!buffer0_is_integer) {
          /* _NEW_MULTISAMPLE */
-         pb.AlphaToCoverageEnable =
-            _mesa_is_multisample_enabled(ctx) &&
-            ctx->Multisample.SampleAlphaToCoverage;
+
+         if (_mesa_is_multisample_enabled(ctx)) {
+            pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
+            alpha_to_one = ctx->Multisample.SampleAlphaToOne;
+         }
 
          pb.AlphaTestEnable = color->AlphaEnabled;
       }
@@ -4084,6 +4143,16 @@ genX(upload_ps_blend)(struct brw_context *brw)
             dstA = brw_fix_xRGB_alpha(dstA);
          }
 
+         /* Alpha to One doesn't work with Dual Color Blending.  Override
+          * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
+          */
+         if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
+            srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
+            srcA = fix_dual_blend_alpha_to_one(srcA);
+            dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
+            dstA = fix_dual_blend_alpha_to_one(dstA);
+         }
+
          pb.ColorBufferBlendEnable = true;
          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
@@ -4164,8 +4233,8 @@ genX(init_atoms)(struct brw_context *brw)
 
       /* These set up state for brw_psp_urb_cbs */
       &brw_wm_unit,
-      &brw_sf_vp,
-      &brw_sf_unit,
+      &genX(sf_clip_viewport),
+      &genX(sf_state),
       &genX(vs_state), /* always required, enabled or not */
       &brw_clip_unit,
       &brw_gs_unit,