/*
 * Copyright © 2020 Google LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* Lowers nir_intrinsic_load_ubo() to nir_intrinsic_load_ubo_vec4() taking an
 * offset in vec4 units. This is a fairly common mode of UBO addressing for
 * hardware to have, and it gives NIR a chance to optimize the addressing math
 * and CSE the loads.
 *
 * We assume that the UBO loads do not cross a vec4 boundary. This is true
 * for:
 *
 * - std140 (GLSL 1.40, GLSL ES)
 * - Vulkan "Extended Layout" (the baseline for UBOs)
 *
 * but not:
 *
 * - GLSL 4.30's new packed mode (enabled by PIPE_CAP_LOAD_CONSTBUF) where
 *   vec3 arrays are packed tightly.
 *
 * - Vulkan's scalarBlockLayout optional feature:
 *
 *   "A member is defined to improperly straddle if either of the following are
 *    true:
 *
 *    • It is a vector with total size less than or equal to 16 bytes, and has
 *      Offset decorations placing its first byte at F and its last byte at L
 *      where floor(F / 16) != floor(L / 16).
 *    • It is a vector with total size greater than 16 bytes and has its Offset
 *      decorations placing its first byte at a non-integer multiple of 16.
 *
 *    [...]
 *
 *    Unless the scalarBlockLayout feature is enabled on the device:
 *
 *    • Vectors must not improperly straddle, as defined above."
 */
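/* As an illustrative sketch (exact SSA names and printed form vary by driver
 * and NIR version): a 32-bit vec2 load_ubo of block ssa_0 at byte offset
 * ssa_1, with align_mul=16 and align_offset=8, becomes roughly
 *
 *    load_ubo_vec4 (ssa_0, ushr(ssa_1, 4)) (component=2), num_components=2
 *
 * i.e. the .zw channels of the vec4 slot containing the value.  Loads whose
 * position within the vec4 is not known statically instead load the whole
 * vec4 and select the needed channels with bcsels / vector extracts below.
 */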

#include "nir.h"
#include "nir_builder.h"

static bool
nir_lower_ubo_vec4_filter(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo;
}

static nir_ssa_def *
nir_lower_ubo_vec4_lower(nir_builder *b, nir_instr *instr, void *data)
{
   b->cursor = nir_before_instr(instr);

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_ssa_def *byte_offset = nir_ssa_for_src(b, intr->src[1], 1);

   nir_intrinsic_instr *load =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_vec4);
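   /* src[0] is the UBO block index, reused as-is.  src[1] of load_ubo_vec4 is
    * the offset in vec4 (16-byte) units, so shift the byte offset right by 4.
    */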
   nir_src_copy(&load->src[0], &intr->src[0], &load->instr);
   load->src[1] = nir_src_for_ssa(nir_ushr_imm(b, byte_offset, 4));

   unsigned align_mul = nir_intrinsic_align_mul(intr);
   unsigned align_offset = nir_intrinsic_align_offset(intr);

   int chan_size_bytes = intr->dest.ssa.bit_size / 8;
   int chans_per_vec4 = 16 / chan_size_bytes;
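   /* For example, 32-bit channels give chan_size_bytes = 4 and
    * chans_per_vec4 = 4, while 16-bit channels give 2 and 8.
    */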

   /* We don't care if someone figured out that things are aligned beyond
    * vec4.
    */
   align_mul = MIN2(align_mul, 16);
   align_offset &= 15;
   assert(align_offset % chan_size_bytes == 0);

   /* We assume that loads don't cross vec4 boundaries, just that we need
    * to extract from within the vec4 when we don't have a good alignment.
    */
   if (intr->num_components == chans_per_vec4) {
      align_mul = 16;
      align_offset = 0;
   }

   unsigned num_components = intr->num_components;
   bool aligned_mul = align_mul % 16 == 0;
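   /* If the position of the load within its vec4 slot isn't known at compile
    * time, load the entire vec4 and select the needed channels below.
    */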
   if (!aligned_mul)
      num_components = chans_per_vec4;

   nir_ssa_dest_init(&load->instr, &load->dest,
                     num_components, intr->dest.ssa.bit_size,
                     intr->dest.ssa.name);
   load->num_components = num_components;
   nir_builder_instr_insert(b, &load->instr);

   nir_ssa_def *result = &load->dest.ssa;

   int align_chan_offset = align_offset / chan_size_bytes;
   if (aligned_mul) {
      /* For an aligned load, just ask the backend to load from the known
       * offset's component.
       */
      nir_intrinsic_set_component(load, align_chan_offset);
   } else {
      if (align_mul == 8) {
         /* Special case: Loading small vectors from offset % 8 == 0 can be
          * done with just one bcsel.
          */
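         /* For example, a 32-bit vec2 known to start at an 8-byte-aligned
          * offset covers either .xy or .zw of its vec4 slot; bit 3 of the
          * byte offset selects between the two halves.
          */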
         nir_component_mask_t low_channels =
            BITSET_MASK(intr->num_components) << (align_chan_offset);
         nir_component_mask_t high_channels =
            low_channels << (8 / chan_size_bytes);
         result = nir_bcsel(b,
                            nir_i2b(b, nir_iand_imm(b, byte_offset, 8)),
                            nir_channels(b, result, high_channels),
                            nir_channels(b, result, low_channels));
      } else {
         /* General fallback case: Per-result-channel bcsel-based extraction
          * from the load.
          */
         assert(align_mul == 4);
         assert(align_chan_offset == 0);

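         /* The starting channel is (byte_offset / chan_size) % chans_per_vec4,
          * e.g. (byte_offset / 4) & 3 for 32-bit channels.
          */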
         nir_ssa_def *component =
            nir_iand_imm(b,
                         nir_udiv_imm(b, byte_offset, chan_size_bytes),
                         chans_per_vec4 - 1);

         nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
         for (unsigned i = 0; i < intr->num_components; i++) {
            channels[i] = nir_vector_extract(b, result,
                                             nir_iadd_imm(b, component, i));
         }

         result = nir_vec(b, channels, intr->num_components);
      }
   }

   return result;
}

bool
nir_lower_ubo_vec4(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        nir_lower_ubo_vec4_filter,
                                        nir_lower_ubo_vec4_lower,
                                        NULL);
}
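
/* Usage sketch (not part of this file): a driver that wants vec4-unit UBO
 * addressing would typically run this pass from its NIR lowering loop, once
 * UBO loads carry explicit byte offsets, e.g.:
 *
 *    NIR_PASS(progress, shader, nir_lower_ubo_vec4);
 *
 * and then consume nir_intrinsic_load_ubo_vec4 in its backend.
 */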