anv: Rework push constant handling
[mesa.git] src/intel/vulkan/anv_nir_compute_push_layout.c
/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_nir.h"
#include "compiler/brw_nir.h"

void
anv_nir_compute_push_layout(const struct anv_physical_device *pdevice,
                            nir_shader *nir,
                            struct brw_stage_prog_data *prog_data,
                            struct anv_pipeline_bind_map *map,
                            void *mem_ctx)
{
   memset(map->push_ranges, 0, sizeof(map->push_ranges));

   unsigned push_start = UINT_MAX, push_end = 0;
   nir_foreach_function(function, nir) {
      if (!function->impl)
         continue;

      nir_foreach_block(block, function->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_push_constant)
               continue;

            unsigned base = nir_intrinsic_base(intrin);
            unsigned range = nir_intrinsic_range(intrin);
            push_start = MIN2(push_start, base);
            push_end = MAX2(push_end, base + range);
         }
      }
   }

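   /* At this point, [push_start, push_end) bounds every push constant byte
    * the shader actually loads.  For example, loads at (base 64, range 16)
    * and (base 96, range 8) produce push_start = 64 and push_end = 104.
    */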
   const bool has_push_intrinsic = push_start <= push_end;

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      /* For compute shaders, we always have to have the subgroup ID.  The
       * back-end compiler will "helpfully" add it for us in the last push
       * constant slot.  There is an intentional off-by-one here: because
       * the back-end appends gl_SubgroupId itself, we claim one dword less
       * than the full amount of push constants so it has a slot to land in.
       */
      assert(push_end <= offsetof(struct anv_push_constants, cs.subgroup_id));
      push_end = offsetof(struct anv_push_constants, cs.subgroup_id);
   }

   /* Align push_start down to a 32B boundary and make it no larger than
    * push_end (no push constants is indicated by push_start = UINT_MAX).
    */
   push_start = MIN2(push_start, push_end);
   push_start &= ~31u;
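
   /* For example, a single load at (base 100, range 8) gives push_start =
    * MIN2(100, 108) = 100, which then aligns down to 96; the shader will
    * push data starting at offset 96.
    */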

   if (has_push_intrinsic) {
      nir_foreach_function(function, nir) {
         if (!function->impl)
            continue;

         nir_foreach_block(block, function->impl) {
            nir_foreach_instr(instr, block) {
               if (instr->type != nir_instr_type_intrinsic)
                  continue;

               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
               if (intrin->intrinsic != nir_intrinsic_load_push_constant)
                  continue;

               intrin->intrinsic = nir_intrinsic_load_uniform;
               nir_intrinsic_set_base(intrin,
                                      nir_intrinsic_base(intrin) -
                                      push_start);
            }
         }
      }
   }
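
   /* Every push constant load is now an ordinary uniform load whose base is
    * relative to push_start: e.g. with push_start = 96, a load that was at
    * base 100 becomes a load_uniform with base 4.
    */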

   /* For vec4 our push data size needs to be aligned to a vec4 and for
    * scalar, it needs to be aligned to a DWORD.
    */
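   /* num_uniforms is counted in bytes here while nr_params is in dwords,
    * so e.g. 20 bytes of push data in a scalar stage yields num_uniforms =
    * 20 and nr_params = 5.
    */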
   const unsigned align =
      pdevice->compiler->scalar_stage[nir->info.stage] ? 4 : 16;
   nir->num_uniforms = ALIGN(push_end - push_start, align);
   prog_data->nr_params = nir->num_uniforms / 4;
   prog_data->param = ralloc_array(mem_ctx, uint32_t, prog_data->nr_params);

   struct anv_push_range push_constant_range = {
      .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS,
      .start = push_start / 32,
      .length = DIV_ROUND_UP(push_end - push_start, 32),
   };

   if ((pdevice->info.gen >= 8 || pdevice->info.is_haswell) &&
       nir->info.stage != MESA_SHADER_COMPUTE) {
      brw_nir_analyze_ubo_ranges(pdevice->compiler, nir, NULL,
                                 prog_data->ubo_ranges);

      /* We can push at most 64 registers worth of data.  The back-end
       * compiler would do this fixup for us but we'd like to calculate
       * the push constant layout ourselves.
       */
      unsigned total_push_regs = push_constant_range.length;
      for (unsigned i = 0; i < 4; i++) {
         if (total_push_regs + prog_data->ubo_ranges[i].length > 64)
            prog_data->ubo_ranges[i].length = 64 - total_push_regs;
         total_push_regs += prog_data->ubo_ranges[i].length;
      }
      assert(total_push_regs <= 64);
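
      /* For example, with push_constant_range.length = 2 and UBO range
       * lengths of {40, 30, 10, 0}, the second range is clamped to 22 and
       * the third to 0, for a total of exactly 64 registers.
       */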

      /* The Skylake PRM contains the following restriction:
       *
       *    "The driver must ensure The following case does not occur
       *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
       *     buffer 3 read length equal to zero committed followed by a
       *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
       *     zero committed."
       *
       * To avoid this, we program the buffers in the highest slots.
       * This way, slot 0 is only used if slot 3 is also used.
       */
      int n = 3;

      for (int i = 3; i >= 0; i--) {
         const struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
         if (ubo_range->length == 0)
            continue;

         const struct anv_pipeline_binding *binding =
            &map->surface_to_descriptor[ubo_range->block];

         map->push_ranges[n--] = (struct anv_push_range) {
            .set = binding->set,
            .index = binding->index,
            .dynamic_offset_index = binding->dynamic_offset_index,
            .start = ubo_range->start,
            .length = ubo_range->length,
         };
      }

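      /* The push constant range, if any, takes the next slot down.  With
       * two pushed UBO ranges, for example, the final layout is: slots 3
       * and 2 hold the UBO ranges, slot 1 holds the push constants, and
       * slot 0 is unused.
       */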
      if (push_constant_range.length > 0)
         map->push_ranges[n--] = push_constant_range;
   } else {
      /* For Ivy Bridge, the push constants packets have a different
       * rule that would require us to iterate in the other direction
       * and possibly mess around with dynamic state base address.
       * Don't bother; just emit regular push constants at n = 0.
       *
       * In the compute case, we don't have multiple push ranges so it's
       * better to just provide one in push_ranges[0].
       */
      map->push_ranges[0] = push_constant_range;
   }
}

void
anv_nir_validate_push_layout(struct brw_stage_prog_data *prog_data,
                             struct anv_pipeline_bind_map *map)
{
#ifndef NDEBUG
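   /* Both sizes below are counted in 32B registers.  nr_params is in
    * dwords, so divide by 8 (rounding up) to get registers.
    */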
   unsigned prog_data_push_size = DIV_ROUND_UP(prog_data->nr_params, 8);
   for (unsigned i = 0; i < 4; i++)
      prog_data_push_size += prog_data->ubo_ranges[i].length;

   unsigned bind_map_push_size = 0;
   for (unsigned i = 0; i < 4; i++)
      bind_map_push_size += map->push_ranges[i].length;

   /* We could go through everything again but it should be enough to assert
    * that they push the same number of registers.  This should alert us if
    * the back-end compiler decides to re-arrange stuff or shrink a range.
    */
   assert(prog_data_push_size == bind_map_push_size);
#endif
}