radeonsi: print shader cache stats with AMD_DEBUG=cache_stats
[mesa.git] / src / gallium / drivers / radeonsi / si_state_viewport.c
index d0287d5ad754e86d5052964c782f8b2f2993f426..682f00d44a87d8a92770a93f44f9a358c469cd8a 100644 (file)
  */
 
 #include "si_build_pm4.h"
+#include "util/u_upload_mgr.h"
 #include "util/u_viewport.h"
-#include "tgsi/tgsi_scan.h"
 
 #define SI_MAX_SCISSOR 16384
 
+/**
+ * Refresh the SMALL_PRIM_PRECISION field of ctx->current_vs_state.
+ *
+ * The precision is the framebuffer sample count divided by the subpixel
+ * resolution named by viewport 0's quantization mode (4096, 1024 or 256).
+ * Returns without touching any state when the screen does not use NGG
+ * culling.
+ */
+void si_update_ngg_small_prim_precision(struct si_context *ctx)
+{
+       if (!ctx->screen->use_ngg_culling)
+               return;
+
+       /* Set VS_STATE.SMALL_PRIM_PRECISION for NGG culling. */
+       unsigned num_samples = ctx->framebuffer.nr_samples;
+       unsigned quant_mode = ctx->viewports.as_scissor[0].quant_mode;
+       float precision;
+
+       /* Any mode other than 12.12 and 14.10 is treated as 16.8 (1/256th). */
+       if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH)
+               precision = num_samples / 4096.0;
+       else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH)
+               precision = num_samples / 1024.0;
+       else
+               precision = num_samples / 256.0;
+
+       /* Replace the field: mask with C_*, then OR in the new value.
+        * NOTE(review): fui() presumably returns the raw 32-bit pattern of
+        * the float, in which case ">> 23" drops the mantissa and keeps only
+        * the sign and exponent bits - confirm against the shader side.
+        */
+       ctx->current_vs_state &= C_VS_STATE_SMALL_PRIM_PRECISION;
+       ctx->current_vs_state |= S_VS_STATE_SMALL_PRIM_PRECISION(fui(precision) >> 23);
+}
+
+/**
+ * Compute the viewport transform used by small primitive culling.
+ *
+ * The result is built from viewport 0's X/Y scale and translate. The Y
+ * components are negated when sctx->viewports.y_inverted is set, and both
+ * axes are multiplied by the framebuffer sample count.
+ *
+ * \param sctx  context whose viewport 0 and framebuffer state are read
+ * \param out   receives the computed info; written once at the end
+ */
+void si_get_small_prim_cull_info(struct si_context *sctx,
+                                struct si_small_prim_cull_info *out)
+{
+       /* This is needed by the small primitive culling, because it's done
+        * in screen space.
+        */
+       struct si_small_prim_cull_info info;
+       unsigned num_samples = sctx->framebuffer.nr_samples;
+       assert(num_samples >= 1);
+
+       info.scale[0] = sctx->viewports.states[0].scale[0];
+       info.scale[1] = sctx->viewports.states[0].scale[1];
+       info.translate[0] = sctx->viewports.states[0].translate[0];
+       info.translate[1] = sctx->viewports.states[0].translate[1];
+
+       /* The viewport shouldn't flip the X axis for the small prim culling to work. */
+       assert(-info.scale[0] + info.translate[0] <= info.scale[0] + info.translate[0]);
+
+       /* If the Y axis is inverted (OpenGL default framebuffer), reverse it.
+        * This is because the viewport transformation inverts the clip space
+        * bounding box, so min becomes max, which breaks small primitive
+        * culling.
+        */
+       if (sctx->viewports.y_inverted) {
+               info.scale[1] = -info.scale[1];
+               info.translate[1] = -info.translate[1];
+       }
+
+       /* Scale the framebuffer up, so that samples become pixels and small
+        * primitive culling is the same for all sample counts.
+        * This only works with the standard DX sample positions, because
+        * the samples are evenly spaced on both X and Y axes.
+        */
+       for (unsigned i = 0; i < 2; i++) {
+               info.scale[i] *= num_samples;
+               info.translate[i] *= num_samples;
+       }
+       /* Publish the result only after it is fully computed. */
+       *out = info;
+}
+
 static void si_set_scissor_states(struct pipe_context *pctx,
                                  unsigned start_slot,
                                  unsigned num_scissors,
@@ -37,13 +98,11 @@ static void si_set_scissor_states(struct pipe_context *pctx,
        int i;
 
        for (i = 0; i < num_scissors; i++)
-               ctx->scissors.states[start_slot + i] = state[i];
+               ctx->scissors[start_slot + i] = state[i];
 
-       if (!ctx->queued.named.rasterizer ||
-           !ctx->queued.named.rasterizer->scissor_enable)
+       if (!ctx->queued.named.rasterizer->scissor_enable)
                return;
 
-       ctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
        si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 }
 
@@ -107,10 +166,11 @@ static void si_scissor_make_union(struct si_signed_scissor *out,
        out->miny = MIN2(out->miny, in->miny);
        out->maxx = MAX2(out->maxx, in->maxx);
        out->maxy = MAX2(out->maxy, in->maxy);
+       out->quant_mode = MIN2(out->quant_mode, in->quant_mode);
 }
 
 static void si_emit_one_scissor(struct si_context *ctx,
-                               struct radeon_winsys_cs *cs,
+                               struct radeon_cmdbuf *cs,
                                struct si_signed_scissor *vp_scissor,
                                struct pipe_scissor_state *scissor)
 {
@@ -126,6 +186,18 @@ static void si_emit_one_scissor(struct si_context *ctx,
        if (scissor)
                si_clip_scissor(&final, scissor);
 
+       /* Workaround for a hw bug on GFX6 that occurs when PA_SU_HARDWARE_-
+        * SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
+        */
+       if (ctx->chip_class == GFX6 && (final.maxx == 0 || final.maxy == 0)) {
+               radeon_emit(cs, S_028250_TL_X(1) |
+                               S_028250_TL_Y(1) |
+                               S_028250_WINDOW_OFFSET_DISABLE(1));
+               radeon_emit(cs, S_028254_BR_X(1) |
+                               S_028254_BR_Y(1));
+               return;
+       }
+
        radeon_emit(cs, S_028250_TL_X(final.minx) |
                        S_028250_TL_Y(final.miny) |
                        S_028250_WINDOW_OFFSET_DISABLE(1));
@@ -133,14 +205,12 @@ static void si_emit_one_scissor(struct si_context *ctx,
                        S_028254_BR_Y(final.maxy));
 }
 
-/* the range is [-MAX, MAX] */
-#define SI_MAX_VIEWPORT_RANGE 32768
+#define MAX_PA_SU_HARDWARE_SCREEN_OFFSET 8176
 
 static void si_emit_guardband(struct si_context *ctx)
 {
-       const struct si_signed_scissor *vp_as_scissor;
-       struct si_signed_scissor max_vp_scissor;
-       struct radeon_winsys_cs *cs = ctx->gfx_cs;
+       const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
+       struct si_signed_scissor vp_as_scissor;
        struct pipe_viewport_state vp;
        float left, top, right, bottom, max_range, guardband_x, guardband_y;
        float discard_x, discard_y;
@@ -148,26 +218,65 @@ static void si_emit_guardband(struct si_context *ctx)
        if (ctx->vs_writes_viewport_index) {
                /* Shaders can draw to any viewport. Make a union of all
                 * viewports. */
-               max_vp_scissor = ctx->viewports.as_scissor[0];
+               vp_as_scissor = ctx->viewports.as_scissor[0];
                for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) {
-                       si_scissor_make_union(&max_vp_scissor,
+                       si_scissor_make_union(&vp_as_scissor,
                                              &ctx->viewports.as_scissor[i]);
                }
-               vp_as_scissor = &max_vp_scissor;
        } else {
-               vp_as_scissor = &ctx->viewports.as_scissor[0];
+               vp_as_scissor = ctx->viewports.as_scissor[0];
        }
 
+       /* Blits don't set the viewport state. The vertex shader determines
+        * the viewport size by scaling the coordinates, so we don't know
+        * how large the viewport is. Assume the worst case.
+        */
+       if (ctx->vs_disables_clipping_viewport)
+               vp_as_scissor.quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+
+       /* Determine the optimal hardware screen offset to center the viewport
+        * within the viewport range in order to maximize the guardband size.
+        */
+       int hw_screen_offset_x = (vp_as_scissor.maxx + vp_as_scissor.minx) / 2;
+       int hw_screen_offset_y = (vp_as_scissor.maxy + vp_as_scissor.miny) / 2;
+
+       /* GFX6-GFX7 need to align the offset to an ubertile consisting of all SEs. */
+       const unsigned hw_screen_offset_alignment =
+               ctx->chip_class >= GFX8 ? 16 : MAX2(ctx->screen->se_tile_repeat, 16);
+
+       /* Indexed by quantization modes */
+       static int max_viewport_size[] = {65535, 16383, 4095};
+
+       /* Ensure that the whole viewport stays representable in
+        * absolute coordinates.
+        * See comment in si_set_viewport_states.
+        */
+       assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] &&
+              vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]);
+
+       hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+       hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+
+       /* Align the screen offset by dropping the low bits. */
+       hw_screen_offset_x &= ~(hw_screen_offset_alignment - 1);
+       hw_screen_offset_y &= ~(hw_screen_offset_alignment - 1);
+
+       /* Apply the offset to center the viewport and maximize the guardband. */
+       vp_as_scissor.minx -= hw_screen_offset_x;
+       vp_as_scissor.maxx -= hw_screen_offset_x;
+       vp_as_scissor.miny -= hw_screen_offset_y;
+       vp_as_scissor.maxy -= hw_screen_offset_y;
+
        /* Reconstruct the viewport transformation from the scissor. */
-       vp.translate[0] = (vp_as_scissor->minx + vp_as_scissor->maxx) / 2.0;
-       vp.translate[1] = (vp_as_scissor->miny + vp_as_scissor->maxy) / 2.0;
-       vp.scale[0] = vp_as_scissor->maxx - vp.translate[0];
-       vp.scale[1] = vp_as_scissor->maxy - vp.translate[1];
+       vp.translate[0] = (vp_as_scissor.minx + vp_as_scissor.maxx) / 2.0;
+       vp.translate[1] = (vp_as_scissor.miny + vp_as_scissor.maxy) / 2.0;
+       vp.scale[0] = vp_as_scissor.maxx - vp.translate[0];
+       vp.scale[1] = vp_as_scissor.maxy - vp.translate[1];
 
        /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */
-       if (vp_as_scissor->minx == vp_as_scissor->maxx)
+       if (vp_as_scissor.minx == vp_as_scissor.maxx)
                vp.scale[0] = 0.5;
-       if (vp_as_scissor->miny == vp_as_scissor->maxy)
+       if (vp_as_scissor.miny == vp_as_scissor.maxy)
                vp.scale[1] = 0.5;
 
        /* Find the biggest guard band that is inside the supported viewport
@@ -177,9 +286,10 @@ static void si_emit_guardband(struct si_context *ctx)
         * This is done by applying the inverse viewport transformation
         * on the viewport limits to get those limits in clip space.
         *
-        * Use a limit one pixel smaller to allow for some precision error.
+        * The viewport range is [-max_viewport_size/2, max_viewport_size/2].
         */
-       max_range = SI_MAX_VIEWPORT_RANGE - 1;
+       assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size));
+       max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2;
        left   = (-max_range - vp.translate[0]) / vp.scale[0];
        right  = ( max_range - vp.translate[0]) / vp.scale[0];
        top    = (-max_range - vp.translate[1]) / vp.scale[1];
@@ -196,7 +306,6 @@ static void si_emit_guardband(struct si_context *ctx)
        if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) {
                /* When rendering wide points or lines, we need to be more
                 * conservative about when to discard them entirely. */
-               const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
                float pixels;
 
                if (ctx->current_rast_prim == PIPE_PRIM_POINTS)
@@ -214,48 +323,54 @@ static void si_emit_guardband(struct si_context *ctx)
                discard_y = MIN2(discard_y, guardband_y);
        }
 
-       /* If any of the GB registers is updated, all of them must be updated. */
-       radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
-
-       radeon_emit(cs, fui(guardband_y)); /* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */
-       radeon_emit(cs, fui(discard_y));   /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */
-       radeon_emit(cs, fui(guardband_x)); /* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */
-       radeon_emit(cs, fui(discard_x));   /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */
+       /* If any of the GB registers is updated, all of them must be updated.
+        * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
+        * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
+        */
+       unsigned initial_cdw = ctx->gfx_cs->current.cdw;
+       radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
+                                   SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
+                                   fui(guardband_y), fui(discard_y),
+                                   fui(guardband_x), fui(discard_x));
+       radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
+                                  SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
+                                  S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
+                                  S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
+       radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
+                                  SI_TRACKED_PA_SU_VTX_CNTL,
+                                  S_028BE4_PIX_CENTER(rs->half_pixel_center) |
+                                  S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
+                                                      vp_as_scissor.quant_mode));
+       if (initial_cdw != ctx->gfx_cs->current.cdw)
+               ctx->context_roll = true;
+
+       si_update_ngg_small_prim_precision(ctx);
 }
 
 static void si_emit_scissors(struct si_context *ctx)
 {
+       /* Emit the PA_SC_VPORT_SCISSOR registers: only slot 0 when
+        * ctx->vs_writes_viewport_index is unset, otherwise all
+        * SI_MAX_VIEWPORTS slots. The user scissor rectangle is passed to
+        * si_emit_one_scissor only when the queued rasterizer state has
+        * scissor_enable set; otherwise NULL is passed.
+        */
-       struct radeon_winsys_cs *cs = ctx->gfx_cs;
-       struct pipe_scissor_state *states = ctx->scissors.states;
-       unsigned mask = ctx->scissors.dirty_mask;
+       struct radeon_cmdbuf *cs = ctx->gfx_cs;
+       struct pipe_scissor_state *states = ctx->scissors;
        bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
 
        /* The simple case: Only 1 viewport is active. */
        if (!ctx->vs_writes_viewport_index) {
                struct si_signed_scissor *vp = &ctx->viewports.as_scissor[0];
 
-               if (!(mask & 1))
-                       return;
-
                radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
                si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
-               ctx->scissors.dirty_mask &= ~1; /* clear one bit */
                return;
        }
 
-       while (mask) {
-               int start, count, i;
-
-               u_bit_scan_consecutive_range(&mask, &start, &count);
-
-               radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL +
-                                              start * 4 * 2, count * 2);
-               for (i = start; i < start+count; i++) {
-                       si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i],
-                                           scissor_enabled ? &states[i] : NULL);
-               }
+       /* All registers in the array need to be updated if any of them is changed.
+        * This is a hardware requirement.
+        */
+       radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL,
+                                  SI_MAX_VIEWPORTS * 2);
+       for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
+               si_emit_one_scissor(ctx, cs, &ctx->viewports.as_scissor[i],
+                                   scissor_enabled ? &states[i] : NULL);
        }
-       ctx->scissors.dirty_mask = 0;
 }
 
 static void si_set_viewport_states(struct pipe_context *pctx,
@@ -264,21 +379,76 @@ static void si_set_viewport_states(struct pipe_context *pctx,
                                   const struct pipe_viewport_state *state)
 {
        struct si_context *ctx = (struct si_context *)pctx;
-       unsigned mask;
        int i;
 
        for (i = 0; i < num_viewports; i++) {
                unsigned index = start_slot + i;
+               struct si_signed_scissor *scissor = &ctx->viewports.as_scissor[index];
 
                ctx->viewports.states[index] = state[i];
-               si_get_scissor_from_viewport(ctx, &state[i],
-                                            &ctx->viewports.as_scissor[index]);
+
+               si_get_scissor_from_viewport(ctx, &state[i], scissor);
+
+               unsigned w = scissor->maxx - scissor->minx;
+               unsigned h = scissor->maxy - scissor->miny;
+               unsigned max_extent = MAX2(w, h);
+
+               int max_corner = MAX2(scissor->maxx, scissor->maxy);
+
+               unsigned center_x = (scissor->maxx + scissor->minx) / 2;
+               unsigned center_y = (scissor->maxy + scissor->miny) / 2;
+               unsigned max_center = MAX2(center_x, center_y);
+
+               /* PA_SU_HARDWARE_SCREEN_OFFSET can't center viewports whose
+                * center is farther than MAX_PA_SU_HARDWARE_SCREEN_OFFSET
+                * from the origin (for example, a 1x1 viewport in the lower
+                * right corner of 16Kx16K). Such viewports need a greater
+                * guardband, so they have to use a worse quantization mode.
+                */
+               unsigned distance_off_center =
+                       MAX2(0, (int)max_center - MAX_PA_SU_HARDWARE_SCREEN_OFFSET);
+               max_extent += distance_off_center;
+
+               /* Determine the best quantization mode (subpixel precision),
+                * but also leave enough space for the guardband.
+                *
+                * Note that primitive binning requires QUANT_MODE == 16_8 on Vega10
+                * and Raven1 for line and rectangle primitive types to work correctly.
+                * Always use 16_8 whenever primitive binning can occur.
+                */
+               if ((ctx->family == CHIP_VEGA10 || ctx->family == CHIP_RAVEN) &&
+                   ctx->screen->dpbb_allowed)
+                       max_extent = 16384; /* Use QUANT_MODE == 16_8. */
+
+               /* Another constraint is that all coordinates in the viewport
+                * are representable in fixed point with respect to the
+                * surface origin.
+                *
+                * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given
+                * an offset that would make the upper corner of the viewport
+                * greater than the maximum representable number post
+                * quantization, ie 2^quant_bits.
+                *
+                * This does not matter for 14.10 and 16.8 formats since the
+                * offset is already limited at 8k, but it means we can't use
+                * 12.12 if we are drawing to some pixels outside the lower
+                * 4k x 4k of the render target.
+                */
+
+               if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */
+                       scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH;
+               else if (max_extent <= 4096) /* 16K scanline area for guardband */
+                       scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH;
+               else /* 64K scanline area for guardband */
+                       scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
+       }
+
+       if (start_slot == 0) {
+               ctx->viewports.y_inverted =
+                       -state->scale[1] + state->translate[1] >
+                       state->scale[1] + state->translate[1];
        }
 
-       mask = ((1 << num_viewports) - 1) << start_slot;
-       ctx->viewports.dirty_mask |= mask;
-       ctx->viewports.depth_range_dirty_mask |= mask;
-       ctx->scissors.dirty_mask |= mask;
        si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
        si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
        si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
@@ -287,7 +457,7 @@ static void si_set_viewport_states(struct pipe_context *pctx,
 static void si_emit_one_viewport(struct si_context *ctx,
                                 struct pipe_viewport_state *state)
 {
-       struct radeon_winsys_cs *cs = ctx->gfx_cs;
+       struct radeon_cmdbuf *cs = ctx->gfx_cs;
 
        radeon_emit(cs, fui(state->scale[0]));
        radeon_emit(cs, fui(state->translate[0]));
@@ -299,32 +469,52 @@ static void si_emit_one_viewport(struct si_context *ctx,
 
 static void si_emit_viewports(struct si_context *ctx)
 {
+       /* Emit the PA_CL_VPORT_* transform registers (6 dwords per viewport):
+        * only viewport 0 when ctx->vs_writes_viewport_index is unset,
+        * otherwise all SI_MAX_VIEWPORTS. When the screen uses NGG culling,
+        * the small-primitive culling info is re-uploaded first if it changed.
+        */
-       struct radeon_winsys_cs *cs = ctx->gfx_cs;
+       struct radeon_cmdbuf *cs = ctx->gfx_cs;
        struct pipe_viewport_state *states = ctx->viewports.states;
-       unsigned mask = ctx->viewports.dirty_mask;
+
+       if (ctx->screen->use_ngg_culling) {
+               /* Set the viewport info for small primitive culling. */
+               struct si_small_prim_cull_info info;
+               si_get_small_prim_cull_info(ctx, &info);
+
+               /* Upload a new copy only when the contents differ from the
+                * last uploaded snapshot.
+                */
+               if (memcmp(&info, &ctx->last_small_prim_cull_info, sizeof(info))) {
+                       unsigned offset = 0;
+
+                       /* Align to 256, because the address is shifted by 8 bits. */
+                       u_upload_data(ctx->b.const_uploader, 0, sizeof(info), 256,
+                                     &info, &offset,
+                                     (struct pipe_resource**)&ctx->small_prim_cull_info_buf);
+
+                       /* NOTE(review): the buffer is dereferenced without a
+                        * check that the upload succeeded - confirm whether
+                        * u_upload_data can leave it NULL on failure.
+                        */
+                       ctx->small_prim_cull_info_address =
+                               ctx->small_prim_cull_info_buf->gpu_address + offset;
+                       ctx->last_small_prim_cull_info = info;
+                       ctx->small_prim_cull_info_dirty = true;
+               }
+
+               if (ctx->small_prim_cull_info_dirty) {
+                       /* This will end up in SGPR6 as (value << 8), shifted by the hw. */
+                       radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->small_prim_cull_info_buf,
+                                                 RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
+                       radeon_set_sh_reg(ctx->gfx_cs, R_00B220_SPI_SHADER_PGM_LO_GS,
+                                         ctx->small_prim_cull_info_address >> 8);
+                       ctx->small_prim_cull_info_dirty = false;
+               }
+       }
 
        /* The simple case: Only 1 viewport is active. */
        if (!ctx->vs_writes_viewport_index) {
-               if (!(mask & 1))
-                       return;
-
                radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
                si_emit_one_viewport(ctx, &states[0]);
-               ctx->viewports.dirty_mask &= ~1; /* clear one bit */
                return;
        }
 
-       while (mask) {
-               int start, count, i;
-
-               u_bit_scan_consecutive_range(&mask, &start, &count);
-
-               radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE +
-                                              start * 4 * 6, count * 6);
-               for (i = start; i < start+count; i++)
-                       si_emit_one_viewport(ctx, &states[i]);
-       }
-       ctx->viewports.dirty_mask = 0;
+       /* All registers in the array need to be updated if any of them is changed.
+        * This is a hardware requirement.
+        */
+       radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE +
+                                  0, SI_MAX_VIEWPORTS * 6);
+       for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++)
+               si_emit_one_viewport(ctx, &states[i]);
 }
 
 static inline void
@@ -341,43 +531,34 @@ si_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
 
 static void si_emit_depth_ranges(struct si_context *ctx)
 {
+       /* Emit the PA_SC_VPORT_ZMIN/ZMAX register pairs: only pair 0 when
+        * ctx->vs_writes_viewport_index is unset, otherwise all
+        * SI_MAX_VIEWPORTS pairs. Each pair comes from si_viewport_zmin_zmax,
+        * which is given the rasterizer's clip_halfz flag and
+        * ctx->vs_disables_clipping_viewport.
+        */
-       struct radeon_winsys_cs *cs = ctx->gfx_cs;
+       struct radeon_cmdbuf *cs = ctx->gfx_cs;
        struct pipe_viewport_state *states = ctx->viewports.states;
-       unsigned mask = ctx->viewports.depth_range_dirty_mask;
        bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
        bool window_space = ctx->vs_disables_clipping_viewport;
        float zmin, zmax;
 
        /* The simple case: Only 1 viewport is active. */
        if (!ctx->vs_writes_viewport_index) {
-               if (!(mask & 1))
-                       return;
-
                si_viewport_zmin_zmax(&states[0], clip_halfz, window_space,
                                      &zmin, &zmax);
 
                radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0, 2);
                radeon_emit(cs, fui(zmin));
                radeon_emit(cs, fui(zmax));
-               ctx->viewports.depth_range_dirty_mask &= ~1; /* clear one bit */
                return;
        }
 
-       while (mask) {
-               int start, count, i;
-
-               u_bit_scan_consecutive_range(&mask, &start, &count);
-
-               radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0 +
-                                          start * 4 * 2, count * 2);
-               for (i = start; i < start+count; i++) {
-                       si_viewport_zmin_zmax(&states[i], clip_halfz, window_space,
-                                             &zmin, &zmax);
-                       radeon_emit(cs, fui(zmin));
-                       radeon_emit(cs, fui(zmax));
-               }
+       /* All registers in the array need to be updated if any of them is changed.
+        * This is a hardware requirement.
+        */
+       radeon_set_context_reg_seq(cs, R_0282D0_PA_SC_VPORT_ZMIN_0,
+                                  SI_MAX_VIEWPORTS * 2);
+       for (unsigned i = 0; i < SI_MAX_VIEWPORTS; i++) {
+               si_viewport_zmin_zmax(&states[i], clip_halfz, window_space,
+                                     &zmin, &zmax);
+               radeon_emit(cs, fui(zmin));
+               radeon_emit(cs, fui(zmax));
        }
-       ctx->viewports.depth_range_dirty_mask = 0;
 }
 
 static void si_emit_viewport_states(struct si_context *ctx)
@@ -398,7 +579,7 @@ static void si_emit_viewport_states(struct si_context *ctx)
  */
 void si_update_vs_viewport_state(struct si_context *ctx)
 {
-       struct tgsi_shader_info *info = si_get_vs_info(ctx);
+       struct si_shader_info *info = si_get_vs_info(ctx);
        bool vs_window_space;
 
        if (!info)
@@ -410,8 +591,6 @@ void si_update_vs_viewport_state(struct si_context *ctx)
 
        if (ctx->vs_disables_clipping_viewport != vs_window_space) {
                ctx->vs_disables_clipping_viewport = vs_window_space;
-               ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-               ctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
                si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
                si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
        }
@@ -424,15 +603,91 @@ void si_update_vs_viewport_state(struct si_context *ctx)
        ctx->vs_writes_viewport_index = info->writes_viewport_index;
        si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
 
-       if (!ctx->vs_writes_viewport_index)
+       /* Emit scissors and viewports that were enabled by having
+        * the ViewportIndex output.
+        */
+       if (info->writes_viewport_index) {
+           si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+           si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+       }
+}
+
+/**
+ * Emit PA_SC_CLIPRECT_RULE and, when at least one window rectangle is set,
+ * the corner registers of the first sctx->num_window_rectangles cliprects.
+ */
+static void si_emit_window_rectangles(struct si_context *sctx)
+{
+       /* There are four clipping rectangles. Their corner coordinates are inclusive.
+        * Every pixel is assigned a number from 0 to 15 by setting bits 0-3 depending
+        * on whether the pixel is inside cliprects 0-3, respectively. For example,
+        * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+        * the number 3 (binary 0011).
+        *
+        * If CLIPRECT_RULE & (1 << number), the pixel is rasterized.
+        */
+       struct radeon_cmdbuf *cs = sctx->gfx_cs;
+       static const unsigned outside[4] = {
+               /* outside rectangle 0 */
+               V_02820C_OUT |
+               V_02820C_IN_1 |
+               V_02820C_IN_2 |
+               V_02820C_IN_21 |
+               V_02820C_IN_3 |
+               V_02820C_IN_31 |
+               V_02820C_IN_32 |
+               V_02820C_IN_321,
+               /* outside rectangles 0, 1 */
+               V_02820C_OUT |
+               V_02820C_IN_2 |
+               V_02820C_IN_3 |
+               V_02820C_IN_32,
+               /* outside rectangles 0, 1, 2 */
+               V_02820C_OUT |
+               V_02820C_IN_3,
+               /* outside rectangles 0, 1, 2, 3 */
+               V_02820C_OUT,
+       };
+       const unsigned disabled = 0xffff; /* all inside and outside cases */
+       unsigned num_rectangles = sctx->num_window_rectangles;
+       struct pipe_scissor_state *rects = sctx->window_rectangles;
+       unsigned rule;
+
+       assert(num_rectangles <= 4);
+
+       /* Include mode rasterizes the complement of the "outside" set.
+        * NOTE(review): "~" also sets the bits above bit 15; presumably the
+        * register only consumes the low 16 bits - confirm.
+        */
+       if (num_rectangles == 0)
+               rule = disabled;
+       else if (sctx->window_rectangles_include)
+               rule = ~outside[num_rectangles - 1];
+       else
+               rule = outside[num_rectangles - 1];
+
+       radeon_opt_set_context_reg(sctx, R_02820C_PA_SC_CLIPRECT_RULE,
+                                  SI_TRACKED_PA_SC_CLIPRECT_RULE, rule);
+       if (num_rectangles == 0)
                return;
 
-       if (ctx->scissors.dirty_mask)
-           si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+       radeon_set_context_reg_seq(cs, R_028210_PA_SC_CLIPRECT_0_TL,
+                                  num_rectangles * 2);
+       for (unsigned i = 0; i < num_rectangles; i++) {
+               radeon_emit(cs, S_028210_TL_X(rects[i].minx) |
+                               S_028210_TL_Y(rects[i].miny));
+               radeon_emit(cs, S_028214_BR_X(rects[i].maxx) |
+                               S_028214_BR_Y(rects[i].maxy));
+       }
+}
 
-       if (ctx->viewports.dirty_mask ||
-           ctx->viewports.depth_range_dirty_mask)
-           si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+/**
+ * Store the window rectangle state in the context and mark the
+ * window_rectangles atom dirty.
+ *
+ * \param ctx             pipe context, cast to struct si_context
+ * \param include         stored as sctx->window_rectangles_include
+ * \param num_rectangles  number of entries in \p rects
+ * \param rects           rectangles to copy; not read when num_rectangles is 0
+ *
+ * NOTE(review): num_rectangles is not bounds-checked here before the copy;
+ * si_emit_window_rectangles asserts it is <= 4 - confirm the caller
+ * guarantees this.
+ */
+static void si_set_window_rectangles(struct pipe_context *ctx,
+                                    bool include,
+                                    unsigned num_rectangles,
+                                    const struct pipe_scissor_state *rects)
+{
+       struct si_context *sctx = (struct si_context *)ctx;
+
+       sctx->num_window_rectangles = num_rectangles;
+       sctx->window_rectangles_include = include;
+       if (num_rectangles) {
+               memcpy(sctx->window_rectangles, rects,
+                      sizeof(*rects) * num_rectangles);
+       }
+
+       si_mark_atom_dirty(sctx, &sctx->atoms.s.window_rectangles);
 }
 
 void si_init_viewport_functions(struct si_context *ctx)
@@ -440,7 +695,12 @@ void si_init_viewport_functions(struct si_context *ctx)
        ctx->atoms.s.guardband.emit = si_emit_guardband;
        ctx->atoms.s.scissors.emit = si_emit_scissors;
        ctx->atoms.s.viewports.emit = si_emit_viewport_states;
+       ctx->atoms.s.window_rectangles.emit = si_emit_window_rectangles;
 
        ctx->b.set_scissor_states = si_set_scissor_states;
        ctx->b.set_viewport_states = si_set_viewport_states;
+       ctx->b.set_window_rectangles = si_set_window_rectangles;
+
+       for (unsigned i = 0; i < 16; i++)
+               ctx->viewports.as_scissor[i].quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH;
 }