prog_data->curb_read_length = uniform_push_length + ubo_push_length;
 
+   uint64_t used = 0;
+
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       for (unsigned int i = 0; i < inst->sources; i++) {
                constant_nr = 0;
             }
 
+            /* Each push constant GRF holds 8 dwords, so constant_nr / 8 is
+             * the push register index.  Track which registers this shader
+             * actually reads so we only zero the ones that matter.
+             */
+            assert(constant_nr / 8 < 64);
+            used |= BITFIELD64_BIT(constant_nr / 8);
+
            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);
       }
    }
 
+   /* zero_push_reg is the set of push registers the driver wants zeroed
+    * whenever the corresponding bit of the pushed mask (located by
+    * push_reg_mask_param) is clear; only bother with registers this shader
+    * actually reads.
+    */
+   uint64_t want_zero = used & stage_prog_data->zero_push_reg;
+   if (want_zero) {
+      assert(!compiler->compact_params);
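+      /* Emit the zeroing code at the very start of the shader so the push
+       * registers are cleaned up before anything else can read them.
+       */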
+      fs_builder ubld = bld.exec_all().group(8, 0).at(
+         cfg->first_block(), cfg->first_block()->start());
+
+      /* push_reg_mask_param is in 32-bit units */
+      unsigned mask_param = stage_prog_data->push_reg_mask_param;
+      struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8,
+                                                            mask_param % 8);
+
+      fs_reg b32;
+      for (unsigned i = 0; i < 64; i++) {
+         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
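+            /* Handle the mask one 16-bit word at a time.  The two SHLs
+             * below fan that word out across the 16 W channels of "shifted"
+             * so that channel c ends up with mask bit (i + c) in its sign
+             * bit: the first SHL uses per-channel shift counts from the
+             * vector immediate to cover the upper 8 channels and the second
+             * SHL shifts that result by 8 more bits into the lower 8.
+             */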
+            fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
+            ubld.SHL(horiz_offset(shifted, 8),
+                     byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
+                     brw_imm_v(0x01234567));
+            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
+
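+            /* An arithmetic shift right by 15 replicates each sign bit
+             * across the whole channel, so each dword of b32 becomes 0 or
+             * ~0 depending on the corresponding mask bit.
+             */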
+            fs_builder ubld16 = ubld.group(16, 0);
+            b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
+            ubld16.ASR(b32, shifted, brw_imm_w(15));
+         }
+
+         if (want_zero & BITFIELD64_BIT(i)) {
+            assert(i < prog_data->curb_read_length);
+            struct brw_reg push_reg =
+               retype(brw_vec8_grf(payload.num_regs + i, 0),
+                      BRW_REGISTER_TYPE_D);
+
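+            /* ANDing the push register with its channel of b32 either
+             * preserves the pushed data (~0) or zeroes the register (0).
+             */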
+            ubld.AND(push_reg, push_reg, component(b32, i % 16));
+         }
+      }
+
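+      /* We added instructions at the top of the shader, so any analysis
+       * based on the instruction list is now stale.
+       */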
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   }
+
    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
    this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
 }
 
    if (push_ubo_ranges && robust_buffer_access) {
       /* We can't on-the-fly adjust our push ranges because doing so would
        * mess up the layout in the shader.  When robustBufferAccess is
-       * enabled, we have to manually bounds check our pushed UBO accesses.
+       * enabled, we push a mask into the shader indicating which pushed
+       * registers are valid and we zero out the invalid ones at the top of
+       * the shader.
        */
-      const uint32_t ubo_size_start =
-         offsetof(struct anv_push_constants, push_ubo_sizes);
-      const uint32_t ubo_size_end = ubo_size_start + (4 * sizeof(uint32_t));
-      push_start = MIN2(push_start, ubo_size_start);
-      push_end = MAX2(push_end, ubo_size_end);
+      const uint32_t push_reg_mask_start =
+         offsetof(struct anv_push_constants, push_reg_mask);
+      const uint32_t push_reg_mask_end = push_reg_mask_start + sizeof(uint64_t);
+      push_start = MIN2(push_start, push_reg_mask_start);
+      push_end = MAX2(push_end, push_reg_mask_end);
    }
 
    if (nir->info.stage == MESA_SHADER_COMPUTE) {
       .length = DIV_ROUND_UP(push_end - push_start, 32),
    };
 
-   /* Mapping from brw_ubo_range to anv_push_range */
-   int push_range_idx_map[4] = { -1, -1, -1, -1 };
+   if (has_push_intrinsic) {
+      nir_foreach_function(function, nir) {
+         if (!function->impl)
+            continue;
+
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               if (instr->type != nir_instr_type_intrinsic)
+                  continue;
+
+               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+               switch (intrin->intrinsic) {
+               case nir_intrinsic_load_push_constant:
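+                  /* Lower to a plain uniform load with the offset rebased
+                   * to the start of the pushed range.
+                   */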
+                  intrin->intrinsic = nir_intrinsic_load_uniform;
+                  nir_intrinsic_set_base(intrin,
+                                         nir_intrinsic_base(intrin) -
+                                         push_start);
+                  break;
+
+               default:
+                  break;
+               }
+            }
+         }
+      }
+   }
 
    if (push_ubo_ranges) {
       brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
       if (push_constant_range.length > 0)
          map->push_ranges[n++] = push_constant_range;
 
+      if (robust_buffer_access) {
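+         /* Tell the back-end where to find the 64-bit push_reg_mask within
+          * the pushed data, in 32-bit units relative to push_start.
+          */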
+         const uint32_t push_reg_mask_offset =
+            offsetof(struct anv_push_constants, push_reg_mask);
+         assert(push_reg_mask_offset >= push_start);
+         prog_data->push_reg_mask_param =
+            (push_reg_mask_offset - push_start) / 4;
+      }
+
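+      /* The push constant range comes first in the push layout, so pushed
+       * UBO ranges start at this register index.
+       */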
+      unsigned range_start_reg = push_constant_range.length;
+
       for (int i = 0; i < 4; i++) {
          struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
          if (ubo_range->length == 0)
          const struct anv_pipeline_binding *binding =
             &map->surface_to_descriptor[ubo_range->block];
 
-         push_range_idx_map[i] = n;
          map->push_ranges[n++] = (struct anv_push_range) {
             .set = binding->set,
             .index = binding->index,
             .start = ubo_range->start,
             .length = ubo_range->length,
          };
+
+         /* We only bother to shader-zero pushed client UBOs */
+         if (binding->set < MAX_SETS && robust_buffer_access) {
+            prog_data->zero_push_reg |= BITFIELD64_RANGE(range_start_reg,
+                                                         ubo_range->length);
+         }
+
+         range_start_reg += ubo_range->length;
       }
    } else {
       /* For Ivy Bridge, the push constants packets have a different
       map->push_ranges[0] = push_constant_range;
    }
 
-   if (has_push_intrinsic || (push_ubo_ranges && robust_buffer_access)) {
-      nir_foreach_function(function, nir) {
-         if (!function->impl)
-            continue;
-
-         nir_builder b;
-         nir_builder_init(&b, function->impl);
-
-         nir_foreach_block(block, function->impl) {
-            nir_foreach_instr_safe(instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                  continue;
-
-               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-               switch (intrin->intrinsic) {
-               case nir_intrinsic_load_ubo: {
-                  if (!robust_buffer_access)
-                     break;
-
-                  if (!nir_src_is_const(intrin->src[0]) ||
-                      !nir_src_is_const(intrin->src[1]))
-                     break;
-
-                  uint32_t index = nir_src_as_uint(intrin->src[0]);
-                  uint64_t offset = nir_src_as_uint(intrin->src[1]);
-                  uint32_t size = intrin->num_components *
-                                  (intrin->dest.ssa.bit_size / 8);
-
-                  int ubo_range_idx = -1;
-                  for (unsigned i = 0; i < 4; i++) {
-                     const struct brw_ubo_range *range =
-                        &prog_data->ubo_ranges[i];
-                     if (range->block == index &&
-                         offset + size > range->start * 32 &&
-                         offset < (range->start + range->length) * 32) {
-                        ubo_range_idx = i;
-                        break;
-                     }
-                  }
-
-                  if (ubo_range_idx < 0)
-                     break;
-
-                  b.cursor = nir_after_instr(&intrin->instr);
-
-                  assert(push_range_idx_map[ubo_range_idx] >= 0);
-                  const uint32_t ubo_size_offset =
-                     offsetof(struct anv_push_constants, push_ubo_sizes) +
-                     push_range_idx_map[ubo_range_idx] * sizeof(uint32_t);
-
-                  nir_intrinsic_instr *load_size =
-                     nir_intrinsic_instr_create(b.shader,
-                                                nir_intrinsic_load_uniform);
-                  load_size->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
-                  nir_intrinsic_set_base(load_size,
-                                         ubo_size_offset - push_start);
-                  nir_intrinsic_set_range(load_size, 4);
-                  nir_intrinsic_set_type(load_size, nir_type_uint32);
-                  load_size->num_components = 1;
-                  nir_ssa_dest_init(&load_size->instr, &load_size->dest,
-                                    1, 32, NULL);
-                  nir_builder_instr_insert(&b, &load_size->instr);
-
-                  /* Do the size checks per-component.  Thanks to scalar block
-                   * layout, we could end up with a single vector straddling a
-                   * 32B boundary.
-                   *
-                   * We intentionally push a size starting from the UBO
-                   * binding in the descriptor set rather than starting from
-                   * the started of the pushed range.  This prevents us from
-                   * accidentally flagging things as out-of-bounds due to
-                   * roll-over if a vector access crosses the push range
-                   * boundary.
-                   *
-                   * We align up to 32B so that we can get better CSE.
-                   *
-                   * We check
-                   *
-                   *    offset + size - 1 < push_ubo_sizes[i]
-                   *
-                   * rather than
-                   *
-                   *    offset + size <= push_ubo_sizes[i]
-                   *
-                   * because it properly returns OOB for the case where
-                   * offset + size == 0.
-                   */
-                  nir_const_value last_byte_const[NIR_MAX_VEC_COMPONENTS];
-                  for (unsigned c = 0; c < intrin->dest.ssa.num_components; c++) {
-                     assert(intrin->dest.ssa.bit_size % 8 == 0);
-                     const unsigned comp_size_B = intrin->dest.ssa.bit_size / 8;
-                     const uint32_t comp_last_byte =
-                        align_u32(offset + (c + 1) * comp_size_B,
-                                  ANV_UBO_BOUNDS_CHECK_ALIGNMENT) - 1;
-                     last_byte_const[c] =
-                        nir_const_value_for_uint(comp_last_byte, 32);
-                  }
-                  nir_ssa_def *last_byte =
-                     nir_build_imm(&b, intrin->dest.ssa.num_components, 32,
-                                   last_byte_const);
-                  nir_ssa_def *in_bounds =
-                     nir_ult(&b, last_byte, &load_size->dest.ssa);
-
-                  nir_ssa_def *zero =
-                     nir_imm_zero(&b, intrin->dest.ssa.num_components,
-                                      intrin->dest.ssa.bit_size);
-                  nir_ssa_def *value =
-                     nir_bcsel(&b, in_bounds, &intrin->dest.ssa, zero);
-                  nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa,
-                                                 nir_src_for_ssa(value),
-                                                 value->parent_instr);
-                  break;
-               }
-
-               case nir_intrinsic_load_push_constant:
-                  intrin->intrinsic = nir_intrinsic_load_uniform;
-                  nir_intrinsic_set_base(intrin,
-                                         nir_intrinsic_base(intrin) -
-                                         push_start);
-                  break;
-
-               default:
-                  break;
-               }
-            }
-         }
-      }
-   }
-
    /* Now that we're done computing the push constant portion of the
     * bind map, hash it.  This lets us quickly determine if the actual
     * mapping has changed and not just a no-op pipeline change.