X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fcompiler%2Fnir%2Fnir_lower_io.c;h=7cfbe28dce45f8def9c50fa18c464466ab8137ec;hb=480329cf8b319983ee96f2b57bd2648be19e5570;hp=279e6e77b6a67dd107985432382bce8479900383;hpb=133273aa22d09cbb2d53b7bf6a8e1da6302c4f15;p=mesa.git diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 279e6e77b6a..7cfbe28dce4 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -35,6 +35,8 @@ #include "nir_builder.h" #include "nir_deref.h" +#include "util/u_math.h" + struct lower_io_state { void *dead_ctx; nir_builder builder; @@ -93,20 +95,39 @@ global_atomic_for_deref(nir_intrinsic_op deref_op) } } +static nir_intrinsic_op +shared_atomic_for_deref(nir_intrinsic_op deref_op) +{ + switch (deref_op) { +#define OP(O) case nir_intrinsic_deref_##O: return nir_intrinsic_shared_##O; + OP(atomic_exchange) + OP(atomic_comp_swap) + OP(atomic_add) + OP(atomic_imin) + OP(atomic_umin) + OP(atomic_imax) + OP(atomic_umax) + OP(atomic_and) + OP(atomic_or) + OP(atomic_xor) + OP(atomic_fadd) + OP(atomic_fmin) + OP(atomic_fmax) + OP(atomic_fcomp_swap) +#undef OP + default: + unreachable("Invalid shared atomic"); + } +} + void -nir_assign_var_locations(struct exec_list *var_list, unsigned *size, +nir_assign_var_locations(nir_shader *shader, nir_variable_mode mode, + unsigned *size, int (*type_size)(const struct glsl_type *, bool)) { unsigned location = 0; - nir_foreach_variable(var, var_list) { - /* - * UBOs have their own address spaces, so don't count them towards the - * number of global uniforms - */ - if (var->data.mode == nir_var_mem_ubo || var->data.mode == nir_var_mem_ssbo) - continue; - + nir_foreach_variable_with_modes(var, shader, mode) { var->data.driver_location = location; bool bindless_type_size = var->data.mode == nir_var_shader_in || var->data.mode == nir_var_shader_out || @@ -138,6 +159,19 @@ nir_is_per_vertex_io(const nir_variable *var, gl_shader_stage stage) return 
false; } +static unsigned get_number_of_slots(struct lower_io_state *state, + const nir_variable *var) +{ + const struct glsl_type *type = var->type; + + if (nir_is_per_vertex_io(var, state->builder.shader->info.stage)) { + assert(glsl_type_is_array(type)); + type = glsl_get_array_element(type); + } + + return state->type_size(type, var->data.bindless); +} + static nir_ssa_def * get_io_offset(nir_builder *b, nir_deref_instr *deref, nir_ssa_def **vertex_index, @@ -179,7 +213,7 @@ get_io_offset(nir_builder *b, nir_deref_instr *deref, unsigned size = type_size((*p)->type, bts); nir_ssa_def *mul = - nir_imul_imm(b, nir_ssa_for_src(b, (*p)->arr.index, 1), size); + nir_amul_imm(b, nir_ssa_for_src(b, (*p)->arr.index, 1), size); offset = nir_iadd(b, offset, mul); } else if ((*p)->deref_type == nir_deref_type_struct) { @@ -201,12 +235,14 @@ get_io_offset(nir_builder *b, nir_deref_instr *deref, return offset; } -static nir_intrinsic_instr * -lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, - nir_ssa_def *vertex_index, nir_variable *var, nir_ssa_def *offset, - unsigned component, const struct glsl_type *type) +static nir_ssa_def * +emit_load(struct lower_io_state *state, + nir_ssa_def *vertex_index, nir_variable *var, nir_ssa_def *offset, + unsigned component, unsigned num_components, unsigned bit_size, + nir_alu_type type) { - const nir_shader *nir = state->builder.shader; + nir_builder *b = &state->builder; + const nir_shader *nir = b->shader; nir_variable_mode mode = var->data.mode; nir_ssa_def *barycentric = NULL; @@ -216,20 +252,25 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, if (nir->info.stage == MESA_SHADER_FRAGMENT && nir->options->use_interpolated_input_intrinsics && var->data.interpolation != INTERP_MODE_FLAT) { - assert(vertex_index == NULL); - - nir_intrinsic_op bary_op; - if (var->data.sample || - (state->options & nir_lower_io_force_sample_interpolation)) - bary_op = nir_intrinsic_load_barycentric_sample; - else 
if (var->data.centroid) - bary_op = nir_intrinsic_load_barycentric_centroid; - else - bary_op = nir_intrinsic_load_barycentric_pixel; - - barycentric = nir_load_barycentric(&state->builder, bary_op, - var->data.interpolation); - op = nir_intrinsic_load_interpolated_input; + if (var->data.interpolation == INTERP_MODE_EXPLICIT) { + assert(vertex_index != NULL); + op = nir_intrinsic_load_input_vertex; + } else { + assert(vertex_index == NULL); + + nir_intrinsic_op bary_op; + if (var->data.sample || + (state->options & nir_lower_io_force_sample_interpolation)) + bary_op = nir_intrinsic_load_barycentric_sample; + else if (var->data.centroid) + bary_op = nir_intrinsic_load_barycentric_centroid; + else + bary_op = nir_intrinsic_load_barycentric_pixel; + + barycentric = nir_load_barycentric(&state->builder, bary_op, + var->data.interpolation); + op = nir_intrinsic_load_interpolated_input; + } } else { op = vertex_index ? nir_intrinsic_load_per_vertex_input : nir_intrinsic_load_input; @@ -242,16 +283,13 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, case nir_var_uniform: op = nir_intrinsic_load_uniform; break; - case nir_var_mem_shared: - op = nir_intrinsic_load_shared; - break; default: unreachable("Unknown variable mode"); } nir_intrinsic_instr *load = nir_intrinsic_instr_create(state->builder.shader, op); - load->num_components = intrin->num_components; + load->num_components = num_components; nir_intrinsic_set_base(load, var->data.driver_location); if (mode == nir_var_shader_in || mode == nir_var_shader_out) @@ -262,8 +300,17 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, state->type_size(var->type, var->data.bindless)); if (load->intrinsic == nir_intrinsic_load_input || + load->intrinsic == nir_intrinsic_load_input_vertex || load->intrinsic == nir_intrinsic_load_uniform) - nir_intrinsic_set_type(load, nir_get_nir_type_for_glsl_type(type)); + nir_intrinsic_set_type(load, type); + + if (load->intrinsic != 
nir_intrinsic_load_uniform) { + nir_io_semantics semantics = {0}; + semantics.location = var->data.location; + semantics.num_slots = get_number_of_slots(state, var); + semantics.fb_fetch_output = var->data.fb_fetch_output; + nir_intrinsic_set_io_semantics(load, semantics); + } if (vertex_index) { load->src[0] = nir_src_for_ssa(vertex_index); @@ -275,30 +322,82 @@ lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, load->src[0] = nir_src_for_ssa(offset); } - return load; + nir_ssa_dest_init(&load->instr, &load->dest, + num_components, bit_size, NULL); + nir_builder_instr_insert(b, &load->instr); + + return &load->dest.ssa; } -static nir_intrinsic_instr * -lower_store(nir_intrinsic_instr *intrin, struct lower_io_state *state, - nir_ssa_def *vertex_index, nir_variable *var, nir_ssa_def *offset, - unsigned component, const struct glsl_type *type) +static nir_ssa_def * +lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state, + nir_ssa_def *vertex_index, nir_variable *var, nir_ssa_def *offset, + unsigned component, const struct glsl_type *type) { - nir_variable_mode mode = var->data.mode; + assert(intrin->dest.is_ssa); + if (intrin->dest.ssa.bit_size == 64 && + (state->options & nir_lower_io_lower_64bit_to_32)) { + nir_builder *b = &state->builder; + + const unsigned slot_size = state->type_size(glsl_dvec_type(2), false); + + nir_ssa_def *comp64[4]; + assert(component == 0 || component == 2); + unsigned dest_comp = 0; + while (dest_comp < intrin->dest.ssa.num_components) { + const unsigned num_comps = + MIN2(intrin->dest.ssa.num_components - dest_comp, + (4 - component) / 2); + + nir_ssa_def *data32 = + emit_load(state, vertex_index, var, offset, component, + num_comps * 2, 32, nir_type_uint32); + for (unsigned i = 0; i < num_comps; i++) { + comp64[dest_comp + i] = + nir_pack_64_2x32(b, nir_channels(b, data32, 3 << (i * 2))); + } - nir_intrinsic_op op; - if (mode == nir_var_mem_shared) { - op = nir_intrinsic_store_shared; + /* Only the 
first store has a component offset */ + component = 0; + dest_comp += num_comps; + offset = nir_iadd_imm(b, offset, slot_size); + } + + return nir_vec(b, comp64, intrin->dest.ssa.num_components); + } else if (intrin->dest.ssa.bit_size == 1) { + /* Booleans are 32-bit */ + assert(glsl_type_is_boolean(type)); + return nir_b2b1(&state->builder, + emit_load(state, vertex_index, var, offset, component, + intrin->dest.ssa.num_components, 32, + nir_type_bool32)); } else { - assert(mode == nir_var_shader_out); - op = vertex_index ? nir_intrinsic_store_per_vertex_output : - nir_intrinsic_store_output; + return emit_load(state, vertex_index, var, offset, component, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, + nir_get_nir_type_for_glsl_type(type)); } +} + +static void +emit_store(struct lower_io_state *state, nir_ssa_def *data, + nir_ssa_def *vertex_index, nir_variable *var, nir_ssa_def *offset, + unsigned component, unsigned num_components, + nir_component_mask_t write_mask, nir_alu_type type) +{ + nir_builder *b = &state->builder; + nir_variable_mode mode = var->data.mode; + + assert(mode == nir_var_shader_out); + nir_intrinsic_op op; + op = vertex_index ? 
nir_intrinsic_store_per_vertex_output : + nir_intrinsic_store_output; nir_intrinsic_instr *store = nir_intrinsic_instr_create(state->builder.shader, op); - store->num_components = intrin->num_components; + store->num_components = num_components; - nir_src_copy(&store->src[0], &intrin->src[1], store); + store->src[0] = nir_src_for_ssa(data); nir_intrinsic_set_base(store, var->data.driver_location); @@ -306,71 +405,121 @@ lower_store(nir_intrinsic_instr *intrin, struct lower_io_state *state, nir_intrinsic_set_component(store, component); if (store->intrinsic == nir_intrinsic_store_output) - nir_intrinsic_set_type(store, nir_get_nir_type_for_glsl_type(type)); + nir_intrinsic_set_type(store, type); - nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(intrin)); + nir_intrinsic_set_write_mask(store, write_mask); if (vertex_index) store->src[1] = nir_src_for_ssa(vertex_index); store->src[vertex_index ? 2 : 1] = nir_src_for_ssa(offset); - return store; + unsigned gs_streams = 0; + if (state->builder.shader->info.stage == MESA_SHADER_GEOMETRY) { + if (var->data.stream & NIR_STREAM_PACKED) { + gs_streams = var->data.stream & ~NIR_STREAM_PACKED; + } else { + assert(var->data.stream < 4); + gs_streams = 0; + for (unsigned i = 0; i < num_components; ++i) + gs_streams |= var->data.stream << (2 * i); + } + } + + nir_io_semantics semantics = {0}; + semantics.location = var->data.location; + semantics.num_slots = get_number_of_slots(state, var); + semantics.dual_source_blend_index = var->data.index; + semantics.gs_streams = gs_streams; + nir_intrinsic_set_io_semantics(store, semantics); + + nir_builder_instr_insert(b, &store->instr); } -static nir_intrinsic_instr * -lower_atomic(nir_intrinsic_instr *intrin, struct lower_io_state *state, - nir_variable *var, nir_ssa_def *offset) +static void +lower_store(nir_intrinsic_instr *intrin, struct lower_io_state *state, + nir_ssa_def *vertex_index, nir_variable *var, nir_ssa_def *offset, + unsigned component, const struct 
glsl_type *type) { - assert(var->data.mode == nir_var_mem_shared); + assert(intrin->src[1].is_ssa); + if (intrin->src[1].ssa->bit_size == 64 && + (state->options & nir_lower_io_lower_64bit_to_32)) { + nir_builder *b = &state->builder; - nir_intrinsic_op op; - switch (intrin->intrinsic) { -#define OP(O) case nir_intrinsic_deref_##O: op = nir_intrinsic_shared_##O; break; - OP(atomic_exchange) - OP(atomic_comp_swap) - OP(atomic_add) - OP(atomic_imin) - OP(atomic_umin) - OP(atomic_imax) - OP(atomic_umax) - OP(atomic_and) - OP(atomic_or) - OP(atomic_xor) - OP(atomic_fadd) - OP(atomic_fmin) - OP(atomic_fmax) - OP(atomic_fcomp_swap) -#undef OP - default: - unreachable("Invalid atomic"); - } + const unsigned slot_size = state->type_size(glsl_dvec_type(2), false); - nir_intrinsic_instr *atomic = - nir_intrinsic_instr_create(state->builder.shader, op); + assert(component == 0 || component == 2); + unsigned src_comp = 0; + nir_component_mask_t write_mask = nir_intrinsic_write_mask(intrin); + while (src_comp < intrin->num_components) { + const unsigned num_comps = + MIN2(intrin->num_components - src_comp, + (4 - component) / 2); + + if (write_mask & BITFIELD_MASK(num_comps)) { + nir_ssa_def *data = + nir_channels(b, intrin->src[1].ssa, + BITFIELD_RANGE(src_comp, num_comps)); + nir_ssa_def *data32 = nir_bitcast_vector(b, data, 32); + + nir_component_mask_t write_mask32 = 0; + for (unsigned i = 0; i < num_comps; i++) { + if (write_mask & BITFIELD_MASK(num_comps) & (1 << i)) + write_mask32 |= 3 << (i * 2); + } - nir_intrinsic_set_base(atomic, var->data.driver_location); + emit_store(state, data32, vertex_index, var, offset, + component, data32->num_components, write_mask32, + nir_type_uint32); + } - atomic->src[0] = nir_src_for_ssa(offset); - assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == - nir_intrinsic_infos[op].num_srcs); - for (unsigned i = 1; i < nir_intrinsic_infos[op].num_srcs; i++) { - nir_src_copy(&atomic->src[i], &intrin->src[i], atomic); + /* Only the first 
store has a component offset */ + component = 0; + src_comp += num_comps; + write_mask >>= num_comps; + offset = nir_iadd_imm(b, offset, slot_size); + } + } else if (intrin->src[1].ssa->bit_size == 1) { + /* Booleans are 32-bit; a store has no dest, so test the stored value */ + assert(glsl_type_is_boolean(type)); + nir_ssa_def *b32_val = nir_b2b32(&state->builder, intrin->src[1].ssa); + emit_store(state, b32_val, vertex_index, var, offset, + component, intrin->num_components, + nir_intrinsic_write_mask(intrin), + nir_type_bool32); + } else { + emit_store(state, intrin->src[1].ssa, vertex_index, var, offset, + component, intrin->num_components, + nir_intrinsic_write_mask(intrin), + nir_get_nir_type_for_glsl_type(type)); } - - return atomic; } -static nir_intrinsic_instr * +static nir_ssa_def * lower_interpolate_at(nir_intrinsic_instr *intrin, struct lower_io_state *state, nir_variable *var, nir_ssa_def *offset, unsigned component, const struct glsl_type *type) { + nir_builder *b = &state->builder; assert(var->data.mode == nir_var_shader_in); - /* Ignore interpolateAt() for flat variables - flat is flat. */ - if (var->data.interpolation == INTERP_MODE_FLAT) - return lower_load(intrin, state, NULL, var, offset, component, type); + /* Ignore interpolateAt() for flat variables - flat is flat. Lower + * interpolateAtVertex() for explicit variables.
+ */ + if (var->data.interpolation == INTERP_MODE_FLAT || + var->data.interpolation == INTERP_MODE_EXPLICIT) { + nir_ssa_def *vertex_index = NULL; + + if (var->data.interpolation == INTERP_MODE_EXPLICIT) { + assert(intrin->intrinsic == nir_intrinsic_interp_deref_at_vertex); + vertex_index = intrin->src[1].ssa; + } + + return lower_load(intrin, state, vertex_index, var, offset, component, type); + } + + /* None of the supported APIs allow interpolation on 64-bit things */ + assert(intrin->dest.is_ssa && intrin->dest.ssa.bit_size <= 32); nir_intrinsic_op bary_op; switch (intrin->intrinsic) { @@ -396,10 +545,11 @@ lower_interpolate_at(nir_intrinsic_instr *intrin, struct lower_io_state *state, nir_intrinsic_set_interp_mode(bary_setup, var->data.interpolation); if (intrin->intrinsic == nir_intrinsic_interp_deref_at_sample || - intrin->intrinsic == nir_intrinsic_interp_deref_at_offset) + intrin->intrinsic == nir_intrinsic_interp_deref_at_offset || + intrin->intrinsic == nir_intrinsic_interp_deref_at_vertex) nir_src_copy(&bary_setup->src[0], &intrin->src[1], bary_setup); - nir_builder_instr_insert(&state->builder, &bary_setup->instr); + nir_builder_instr_insert(b, &bary_setup->instr); nir_intrinsic_instr *load = nir_intrinsic_instr_create(state->builder.shader, @@ -409,10 +559,21 @@ lower_interpolate_at(nir_intrinsic_instr *intrin, struct lower_io_state *state, nir_intrinsic_set_base(load, var->data.driver_location); nir_intrinsic_set_component(load, component); + nir_io_semantics semantics = {0}; + semantics.location = var->data.location; + semantics.num_slots = get_number_of_slots(state, var); + nir_intrinsic_set_io_semantics(load, semantics); + load->src[0] = nir_src_for_ssa(&bary_setup->dest.ssa); load->src[1] = nir_src_for_ssa(offset); - return load; + assert(intrin->dest.is_ssa); + nir_ssa_dest_init(&load->instr, &load->dest, + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, NULL); + nir_builder_instr_insert(b, &load->instr); + + return 
&load->dest.ssa; } static bool @@ -432,25 +593,12 @@ nir_lower_io_block(nir_block *block, switch (intrin->intrinsic) { case nir_intrinsic_load_deref: case nir_intrinsic_store_deref: - case nir_intrinsic_deref_atomic_add: - case nir_intrinsic_deref_atomic_imin: - case nir_intrinsic_deref_atomic_umin: - case nir_intrinsic_deref_atomic_imax: - case nir_intrinsic_deref_atomic_umax: - case nir_intrinsic_deref_atomic_and: - case nir_intrinsic_deref_atomic_or: - case nir_intrinsic_deref_atomic_xor: - case nir_intrinsic_deref_atomic_exchange: - case nir_intrinsic_deref_atomic_comp_swap: - case nir_intrinsic_deref_atomic_fadd: - case nir_intrinsic_deref_atomic_fmin: - case nir_intrinsic_deref_atomic_fmax: - case nir_intrinsic_deref_atomic_fcomp_swap: /* We can lower the io for this nir instrinsic */ break; case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: /* We can optionally lower these to load_interpolated_input */ if (options->use_interpolated_input_intrinsics) break; @@ -462,16 +610,10 @@ nir_lower_io_block(nir_block *block, nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); nir_variable_mode mode = deref->mode; - + assert(util_is_power_of_two_nonzero(mode)); if ((state->modes & mode) == 0) continue; - if (mode != nir_var_shader_in && - mode != nir_var_shader_out && - mode != nir_var_mem_shared && - mode != nir_var_uniform) - continue; - nir_variable *var = nir_deref_instr_get_variable(deref); b->cursor = nir_before_instr(instr); @@ -489,7 +631,7 @@ nir_lower_io_block(nir_block *block, state->type_size, &component_offset, bindless_type_size); - nir_intrinsic_instr *replacement; + nir_ssa_def *replacement = NULL; switch (intrin->intrinsic) { case nir_intrinsic_load_deref: @@ -498,31 +640,14 @@ nir_lower_io_block(nir_block *block, break; case nir_intrinsic_store_deref: - replacement = lower_store(intrin, state, vertex_index, var, offset, - 
component_offset, deref->type); - break; - - case nir_intrinsic_deref_atomic_add: - case nir_intrinsic_deref_atomic_imin: - case nir_intrinsic_deref_atomic_umin: - case nir_intrinsic_deref_atomic_imax: - case nir_intrinsic_deref_atomic_umax: - case nir_intrinsic_deref_atomic_and: - case nir_intrinsic_deref_atomic_or: - case nir_intrinsic_deref_atomic_xor: - case nir_intrinsic_deref_atomic_exchange: - case nir_intrinsic_deref_atomic_comp_swap: - case nir_intrinsic_deref_atomic_fadd: - case nir_intrinsic_deref_atomic_fmin: - case nir_intrinsic_deref_atomic_fmax: - case nir_intrinsic_deref_atomic_fcomp_swap: - assert(vertex_index == NULL); - replacement = lower_atomic(intrin, state, var, offset); + lower_store(intrin, state, vertex_index, var, offset, + component_offset, deref->type); break; case nir_intrinsic_interp_deref_at_centroid: case nir_intrinsic_interp_deref_at_sample: case nir_intrinsic_interp_deref_at_offset: + case nir_intrinsic_interp_deref_at_vertex: assert(vertex_index == NULL); replacement = lower_interpolate_at(intrin, state, var, offset, component_offset, deref->type); @@ -532,19 +657,10 @@ nir_lower_io_block(nir_block *block, continue; } - if (nir_intrinsic_infos[intrin->intrinsic].has_dest) { - if (intrin->dest.is_ssa) { - nir_ssa_dest_init(&replacement->instr, &replacement->dest, - intrin->dest.ssa.num_components, - intrin->dest.ssa.bit_size, NULL); - nir_ssa_def_rewrite_uses(&intrin->dest.ssa, - nir_src_for_ssa(&replacement->dest.ssa)); - } else { - nir_dest_copy(&replacement->dest, &intrin->dest, &intrin->instr); - } + if (replacement) { + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, + nir_src_for_ssa(replacement)); } - - nir_instr_insert_before(&intrin->instr, &replacement->instr); nir_instr_remove(&intrin->instr); progress = true; } @@ -567,6 +683,10 @@ nir_lower_io_impl(nir_function_impl *impl, state.type_size = type_size; state.options = options; + ASSERTED nir_variable_mode supported_modes = + nir_var_shader_in | nir_var_shader_out | 
nir_var_uniform; + assert(!(modes & ~supported_modes)); + nir_foreach_block(block, impl) { progress |= nir_lower_io_block(block, &state); } @@ -578,6 +698,15 @@ nir_lower_io_impl(nir_function_impl *impl, return progress; } +/** Lower load/store_deref intrinsics on I/O variables to offset-based intrinsics + * + * This pass is intended to be used for cross-stage shader I/O and driver- + * managed uniforms to turn deref-based access into a simpler model using + * locations or offsets. For fragment shader inputs, it can optionally turn + * load_deref into an explicit interpolation using barycentrics coming from + * one of the load_barycentric_* intrinsics. This pass requires that all + * deref chains are complete and contain no casts. + */ bool nir_lower_io(nir_shader *shader, nir_variable_mode modes, int (*type_size)(const struct glsl_type *, bool), @@ -608,17 +737,23 @@ build_addr_iadd(nir_builder *b, nir_ssa_def *addr, nir_address_format addr_format, nir_ssa_def *offset) { assert(offset->num_components == 1); - assert(addr->bit_size == offset->bit_size); switch (addr_format) { case nir_address_format_32bit_global: case nir_address_format_64bit_global: case nir_address_format_32bit_offset: + assert(addr->bit_size == offset->bit_size); assert(addr->num_components == 1); return nir_iadd(b, addr, offset); + case nir_address_format_32bit_offset_as_64bit: + assert(addr->num_components == 1); + assert(offset->bit_size == 32); + return nir_u2u64(b, nir_iadd(b, nir_u2u32(b, addr), offset)); + case nir_address_format_64bit_bounded_global: assert(addr->num_components == 4); + assert(addr->bit_size == offset->bit_size); return nir_vec4(b, nir_channel(b, addr, 0), nir_channel(b, addr, 1), nir_channel(b, addr, 2), @@ -626,38 +761,133 @@ build_addr_iadd(nir_builder *b, nir_ssa_def *addr, case nir_address_format_32bit_index_offset: assert(addr->num_components == 2); + assert(addr->bit_size == offset->bit_size); return nir_vec2(b, nir_channel(b, addr, 0), nir_iadd(b, nir_channel(b, 
addr, 1), offset)); + + case nir_address_format_32bit_index_offset_pack64: + assert(addr->num_components == 1); + assert(offset->bit_size == 32); + return nir_pack_64_2x32_split(b, + nir_iadd(b, nir_unpack_64_2x32_split_x(b, addr), offset), + nir_unpack_64_2x32_split_y(b, addr)); + + case nir_address_format_vec2_index_32bit_offset: + assert(addr->num_components == 3); + assert(offset->bit_size == 32); + return nir_vec3(b, nir_channel(b, addr, 0), nir_channel(b, addr, 1), + nir_iadd(b, nir_channel(b, addr, 2), offset)); + case nir_address_format_logical: unreachable("Unsupported address format"); } unreachable("Invalid address format"); } +static unsigned +addr_get_offset_bit_size(nir_ssa_def *addr, nir_address_format addr_format) +{ + if (addr_format == nir_address_format_32bit_offset_as_64bit || + addr_format == nir_address_format_32bit_index_offset_pack64) + return 32; + return addr->bit_size; +} + static nir_ssa_def * build_addr_iadd_imm(nir_builder *b, nir_ssa_def *addr, nir_address_format addr_format, int64_t offset) { return build_addr_iadd(b, addr, addr_format, - nir_imm_intN_t(b, offset, addr->bit_size)); + nir_imm_intN_t(b, offset, + addr_get_offset_bit_size(addr, addr_format))); +} + +static nir_ssa_def * +build_addr_for_var(nir_builder *b, nir_variable *var, + nir_address_format addr_format) +{ + assert(var->data.mode & (nir_var_uniform | nir_var_mem_shared | + nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_constant)); + + const unsigned num_comps = nir_address_format_num_components(addr_format); + const unsigned bit_size = nir_address_format_bit_size(addr_format); + + switch (addr_format) { + case nir_address_format_32bit_global: + case nir_address_format_64bit_global: { + nir_ssa_def *base_addr; + switch (var->data.mode) { + case nir_var_shader_temp: + base_addr = nir_load_scratch_base_ptr(b, 0, num_comps, bit_size); + break; + + case nir_var_function_temp: + base_addr = nir_load_scratch_base_ptr(b, 1, num_comps, bit_size); + break; + + 
case nir_var_mem_constant: + base_addr = nir_load_constant_base_ptr(b, num_comps, bit_size); + break; + + default: + unreachable("Unsupported variable mode"); + } + + return build_addr_iadd_imm(b, base_addr, addr_format, + var->data.driver_location); + } + + case nir_address_format_32bit_offset: + assert(var->data.driver_location <= UINT32_MAX); + return nir_imm_int(b, var->data.driver_location); + + case nir_address_format_32bit_offset_as_64bit: + assert(var->data.driver_location <= UINT32_MAX); + return nir_imm_int64(b, var->data.driver_location); + + default: + unreachable("Unsupported address format"); + } } static nir_ssa_def * addr_to_index(nir_builder *b, nir_ssa_def *addr, nir_address_format addr_format) { - assert(addr_format == nir_address_format_32bit_index_offset); - assert(addr->num_components == 2); - return nir_channel(b, addr, 0); + switch (addr_format) { + case nir_address_format_32bit_index_offset: + assert(addr->num_components == 2); + return nir_channel(b, addr, 0); + case nir_address_format_32bit_index_offset_pack64: + return nir_unpack_64_2x32_split_y(b, addr); + case nir_address_format_vec2_index_32bit_offset: + assert(addr->num_components == 3); + return nir_channels(b, addr, 0x3); + default: unreachable("Invalid address format"); + } } static nir_ssa_def * addr_to_offset(nir_builder *b, nir_ssa_def *addr, nir_address_format addr_format) { - assert(addr_format == nir_address_format_32bit_index_offset); - assert(addr->num_components == 2); - return nir_channel(b, addr, 1); + switch (addr_format) { + case nir_address_format_32bit_index_offset: + assert(addr->num_components == 2); + return nir_channel(b, addr, 1); + case nir_address_format_32bit_index_offset_pack64: + return nir_unpack_64_2x32_split_x(b, addr); + case nir_address_format_vec2_index_32bit_offset: + assert(addr->num_components == 3); + return nir_channel(b, addr, 2); + case nir_address_format_32bit_offset: + return addr; + case nir_address_format_32bit_offset_as_64bit: + return 
nir_u2u32(b, addr); + default: + unreachable("Invalid address format"); + } } /** Returns true if the given address format resolves to a global address */ @@ -669,6 +899,13 @@ addr_format_is_global(nir_address_format addr_format) addr_format == nir_address_format_64bit_bounded_global; } +static bool +addr_format_is_offset(nir_address_format addr_format) +{ + return addr_format == nir_address_format_32bit_offset || + addr_format == nir_address_format_32bit_offset_as_64bit; +} + static nir_ssa_def * addr_to_global(nir_builder *b, nir_ssa_def *addr, nir_address_format addr_format) @@ -685,7 +922,10 @@ addr_to_global(nir_builder *b, nir_ssa_def *addr, nir_u2u64(b, nir_channel(b, addr, 3))); case nir_address_format_32bit_index_offset: + case nir_address_format_32bit_index_offset_pack64: + case nir_address_format_vec2_index_32bit_offset: case nir_address_format_32bit_offset: + case nir_address_format_32bit_offset_as_64bit: case nir_address_format_logical: unreachable("Cannot get a 64-bit address with this address format"); } @@ -731,10 +971,32 @@ build_explicit_io_load(nir_builder *b, nir_intrinsic_instr *intrin, assert(addr_format_is_global(addr_format)); op = nir_intrinsic_load_global; break; - case nir_var_shader_in: - assert(addr_format_is_global(addr_format)); + case nir_var_uniform: + assert(addr_format_is_offset(addr_format)); + assert(b->shader->info.stage == MESA_SHADER_KERNEL); op = nir_intrinsic_load_kernel_input; break; + case nir_var_mem_shared: + assert(addr_format_is_offset(addr_format)); + op = nir_intrinsic_load_shared; + break; + case nir_var_shader_temp: + case nir_var_function_temp: + if (addr_format_is_offset(addr_format)) { + op = nir_intrinsic_load_scratch; + } else { + assert(addr_format_is_global(addr_format)); + op = nir_intrinsic_load_global; + } + break; + case nir_var_mem_constant: + if (addr_format_is_offset(addr_format)) { + op = nir_intrinsic_load_constant; + } else { + assert(addr_format_is_global(addr_format)); + op = 
nir_intrinsic_load_global_constant; + } + break; default: unreachable("Unsupported explicit IO variable mode"); } @@ -743,47 +1005,75 @@ build_explicit_io_load(nir_builder *b, nir_intrinsic_instr *intrin, if (addr_format_is_global(addr_format)) { load->src[0] = nir_src_for_ssa(addr_to_global(b, addr, addr_format)); + } else if (addr_format_is_offset(addr_format)) { + assert(addr->num_components == 1); + load->src[0] = nir_src_for_ssa(addr_to_offset(b, addr, addr_format)); } else { load->src[0] = nir_src_for_ssa(addr_to_index(b, addr, addr_format)); load->src[1] = nir_src_for_ssa(addr_to_offset(b, addr, addr_format)); } - if (mode != nir_var_mem_ubo && mode != nir_var_shader_in) + if (nir_intrinsic_has_access(load)) nir_intrinsic_set_access(load, nir_intrinsic_access(intrin)); + if (op == nir_intrinsic_load_constant) { + nir_intrinsic_set_base(load, 0); + nir_intrinsic_set_range(load, b->shader->constant_data_size); + } + + unsigned bit_size = intrin->dest.ssa.bit_size; + if (bit_size == 1) { + /* TODO: Make the native bool bit_size an option. */ + bit_size = 32; + } + /* TODO: We should try and provide a better alignment. For OpenCL, we need * to plumb the alignment through from SPIR-V when we have one. */ - nir_intrinsic_set_align(load, intrin->dest.ssa.bit_size / 8, 0); + nir_intrinsic_set_align(load, bit_size / 8, 0); assert(intrin->dest.is_ssa); load->num_components = num_components; nir_ssa_dest_init(&load->instr, &load->dest, num_components, - intrin->dest.ssa.bit_size, intrin->dest.ssa.name); + bit_size, intrin->dest.ssa.name); - assert(load->dest.ssa.bit_size % 8 == 0); + assert(bit_size % 8 == 0); + nir_ssa_def *result; if (addr_format_needs_bounds_check(addr_format)) { /* The Vulkan spec for robustBufferAccess gives us quite a few options * as to what we can do with an OOB read. Unfortunately, returning * undefined values isn't one of them so we return an actual zero. 
*/ - nir_ssa_def *zero = nir_imm_zero(b, load->num_components, - load->dest.ssa.bit_size); + nir_ssa_def *zero = nir_imm_zero(b, load->num_components, bit_size); - const unsigned load_size = - (load->dest.ssa.bit_size / 8) * load->num_components; + const unsigned load_size = (bit_size / 8) * load->num_components; nir_push_if(b, addr_is_in_bounds(b, addr, addr_format, load_size)); nir_builder_instr_insert(b, &load->instr); nir_pop_if(b, NULL); - return nir_if_phi(b, &load->dest.ssa, zero); + result = nir_if_phi(b, &load->dest.ssa, zero); } else { nir_builder_instr_insert(b, &load->instr); - return &load->dest.ssa; + result = &load->dest.ssa; } + + if (intrin->dest.ssa.bit_size == 1) { + /* For shared, we can go ahead and use NIR's and/or the back-end's + * standard encoding for booleans rather than forcing a 0/1 boolean. + * This should save an instruction or two. + */ + if (mode == nir_var_mem_shared || + mode == nir_var_shader_temp || + mode == nir_var_function_temp) + result = nir_b2b1(b, result); + else + result = nir_i2b(b, result); + } + + return result; } static void @@ -805,15 +1095,46 @@ build_explicit_io_store(nir_builder *b, nir_intrinsic_instr *intrin, assert(addr_format_is_global(addr_format)); op = nir_intrinsic_store_global; break; + case nir_var_mem_shared: + assert(addr_format_is_offset(addr_format)); + op = nir_intrinsic_store_shared; + break; + case nir_var_shader_temp: + case nir_var_function_temp: + if (addr_format_is_offset(addr_format)) { + op = nir_intrinsic_store_scratch; + } else { + assert(addr_format_is_global(addr_format)); + op = nir_intrinsic_store_global; + } + break; default: unreachable("Unsupported explicit IO variable mode"); } nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, op); + if (value->bit_size == 1) { + /* For shared, we can go ahead and use NIR's and/or the back-end's + * standard encoding for booleans rather than forcing a 0/1 boolean. + * This should save an instruction or two. 
+ * + * TODO: Make the native bool bit_size an option. + */ + if (mode == nir_var_mem_shared || + mode == nir_var_shader_temp || + mode == nir_var_function_temp) + value = nir_b2b32(b, value); + else + value = nir_b2i(b, value, 32); + } + store->src[0] = nir_src_for_ssa(value); if (addr_format_is_global(addr_format)) { store->src[1] = nir_src_for_ssa(addr_to_global(b, addr, addr_format)); + } else if (addr_format_is_offset(addr_format)) { + assert(addr->num_components == 1); + store->src[1] = nir_src_for_ssa(addr_to_offset(b, addr, addr_format)); } else { store->src[1] = nir_src_for_ssa(addr_to_index(b, addr, addr_format)); store->src[2] = nir_src_for_ssa(addr_to_offset(b, addr, addr_format)); @@ -821,7 +1142,8 @@ build_explicit_io_store(nir_builder *b, nir_intrinsic_instr *intrin, nir_intrinsic_set_write_mask(store, write_mask); - nir_intrinsic_set_access(store, nir_intrinsic_access(intrin)); + if (nir_intrinsic_has_access(store)) + nir_intrinsic_set_access(store, nir_intrinsic_access(intrin)); /* TODO: We should try and provide a better alignment. For OpenCL, we need * to plumb the alignment through from SPIR-V when we have one. 
@@ -866,6 +1188,10 @@ build_explicit_io_atomic(nir_builder *b, nir_intrinsic_instr *intrin, assert(addr_format_is_global(addr_format)); op = global_atomic_for_deref(intrin->intrinsic); break; + case nir_var_mem_shared: + assert(addr_format_is_offset(addr_format)); + op = shared_atomic_for_deref(intrin->intrinsic); + break; default: unreachable("Unsupported explicit IO variable mode"); } @@ -875,6 +1201,9 @@ build_explicit_io_atomic(nir_builder *b, nir_intrinsic_instr *intrin, unsigned src = 0; if (addr_format_is_global(addr_format)) { atomic->src[src++] = nir_src_for_ssa(addr_to_global(b, addr, addr_format)); + } else if (addr_format_is_offset(addr_format)) { + assert(addr->num_components == 1); + atomic->src[src++] = nir_src_for_ssa(addr_to_offset(b, addr, addr_format)); } else { atomic->src[src++] = nir_src_for_ssa(addr_to_index(b, addr, addr_format)); atomic->src[src++] = nir_src_for_ssa(addr_to_offset(b, addr, addr_format)); @@ -886,7 +1215,7 @@ build_explicit_io_atomic(nir_builder *b, nir_intrinsic_instr *intrin, /* Global atomics don't have access flags because they assume that the * address may be non-uniform. 
*/ - if (!addr_format_is_global(addr_format)) + if (nir_intrinsic_has_access(atomic)) nir_intrinsic_set_access(atomic, nir_intrinsic_access(intrin)); assert(intrin->dest.ssa.num_components == 1); @@ -918,33 +1247,24 @@ nir_explicit_io_address_from_deref(nir_builder *b, nir_deref_instr *deref, assert(deref->dest.is_ssa); switch (deref->deref_type) { case nir_deref_type_var: - assert(deref->mode == nir_var_shader_in); - return nir_imm_intN_t(b, deref->var->data.driver_location, - deref->dest.ssa.bit_size); + return build_addr_for_var(b, deref->var, addr_format); case nir_deref_type_array: { - nir_deref_instr *parent = nir_deref_instr_parent(deref); - - unsigned stride = glsl_get_explicit_stride(parent->type); - if ((glsl_type_is_matrix(parent->type) && - glsl_matrix_type_is_row_major(parent->type)) || - (glsl_type_is_vector(parent->type) && stride == 0)) - stride = type_scalar_size_bytes(parent->type); - + unsigned stride = nir_deref_instr_array_stride(deref); assert(stride > 0); nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1); - index = nir_i2i(b, index, base_addr->bit_size); + index = nir_i2i(b, index, addr_get_offset_bit_size(base_addr, addr_format)); return build_addr_iadd(b, base_addr, addr_format, - nir_imul_imm(b, index, stride)); + nir_amul_imm(b, index, stride)); } case nir_deref_type_ptr_as_array: { nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1); - index = nir_i2i(b, index, base_addr->bit_size); - unsigned stride = nir_deref_instr_ptr_as_array_stride(deref); + index = nir_i2i(b, index, addr_get_offset_bit_size(base_addr, addr_format)); + unsigned stride = nir_deref_instr_array_stride(deref); return build_addr_iadd(b, base_addr, addr_format, - nir_imul_imm(b, index, stride)); + nir_amul_imm(b, index, stride)); } case nir_deref_type_array_wildcard: @@ -1024,6 +1344,105 @@ nir_lower_explicit_io_instr(nir_builder *b, nir_instr_remove(&intrin->instr); } +bool +nir_get_explicit_deref_align(nir_deref_instr *deref, + bool 
default_to_type_align, + uint32_t *align_mul, + uint32_t *align_offset) +{ + if (deref->deref_type == nir_deref_type_var) { + /* If we see a variable, align_mul is effectively infinite because we + * know the offset exactly (up to the offset of the base pointer for the + * given variable mode). We have to pick something so we choose 256B + * as an arbitrary alignment which seems high enough for any reasonable + * wide-load use-case. Back-ends should clamp alignments down if 256B + * is too large for some reason. + */ + *align_mul = 256; + *align_offset = deref->var->data.driver_location % 256; + return true; + } + + /* If we're a cast deref that has an alignment, use that. */ + if (deref->deref_type == nir_deref_type_cast && deref->cast.align_mul > 0) { + *align_mul = deref->cast.align_mul; + *align_offset = deref->cast.align_offset; + return true; + } + + /* Otherwise, we need to compute the alignment based on the parent */ + nir_deref_instr *parent = nir_deref_instr_parent(deref); + if (parent == NULL) { + assert(deref->deref_type == nir_deref_type_cast); + if (default_to_type_align) { + /* If we don't have a parent, assume the type's alignment, if any. 
*/ + unsigned type_align = glsl_get_explicit_alignment(deref->type); + if (type_align == 0) + return false; + + *align_mul = type_align; + *align_offset = 0; + return true; + } else { + return false; + } + } + + uint32_t parent_mul, parent_offset; + if (!nir_get_explicit_deref_align(parent, default_to_type_align, + &parent_mul, &parent_offset)) + return false; + + switch (deref->deref_type) { + case nir_deref_type_var: + unreachable("Handled above"); + + case nir_deref_type_array: + case nir_deref_type_array_wildcard: + case nir_deref_type_ptr_as_array: { + const unsigned stride = nir_deref_instr_array_stride(deref); + if (stride == 0) + return false; + + if (deref->deref_type != nir_deref_type_array_wildcard && + nir_src_is_const(deref->arr.index)) { + unsigned offset = nir_src_as_uint(deref->arr.index) * stride; + *align_mul = parent_mul; + *align_offset = (parent_offset + offset) % parent_mul; + } else { + /* If this is a wildcard or an indirect deref, we have to go with the + * power-of-two gcd. + */ + *align_mul = MIN3(parent_mul, + 1 << (ffs(parent_offset) - 1), + 1 << (ffs(stride) - 1)); + *align_offset = 0; + } + return true; + } + + case nir_deref_type_struct: { + const int offset = glsl_get_struct_field_offset(parent->type, + deref->strct.index); + if (offset < 0) + return false; + + *align_mul = parent_mul; + *align_offset = (parent_offset + offset) % parent_mul; + return true; + } + + case nir_deref_type_cast: + /* We handled the explicit alignment case above. */ + assert(deref->cast.align_mul == 0); + *align_mul = parent_mul; + *align_offset = parent_offset; + return true; + } + + unreachable("Invalid deref_instr_type"); +} + static void lower_explicit_io_deref(nir_builder *b, nir_deref_instr *deref, nir_address_format addr_format) @@ -1033,8 +1452,8 @@ lower_explicit_io_deref(nir_builder *b, nir_deref_instr *deref, * one deref which could break our list walking since we walk the list * backwards. 
*/ - assert(list_empty(&deref->dest.ssa.if_uses)); - if (list_empty(&deref->dest.ssa.uses)) { + assert(list_is_empty(&deref->dest.ssa.if_uses)); + if (list_is_empty(&deref->dest.ssa.uses)) { nir_instr_remove(&deref->instr); return; } @@ -1049,6 +1468,8 @@ lower_explicit_io_deref(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *addr = nir_explicit_io_address_from_deref(b, deref, base_addr, addr_format); + assert(addr->bit_size == deref->dest.ssa.bit_size); + assert(addr->num_components == deref->dest.ssa.num_components); nir_instr_remove(&deref->instr); nir_ssa_def_rewrite_uses(&deref->dest.ssa, nir_src_for_ssa(addr)); @@ -1075,7 +1496,6 @@ lower_explicit_io_array_length(nir_builder *b, nir_intrinsic_instr *intrin, unsigned stride = glsl_get_explicit_stride(deref->type); assert(stride > 0); - assert(addr_format == nir_address_format_32bit_index_offset); nir_ssa_def *addr = &deref->dest.ssa; nir_ssa_def *index = addr_to_index(b, addr, addr_format); nir_ssa_def *offset = addr_to_offset(b, addr, addr_format); @@ -1176,6 +1596,29 @@ nir_lower_explicit_io_impl(nir_function_impl *impl, nir_variable_mode modes, return progress; } +/** Lower explicitly laid out I/O access to byte offset/address intrinsics + * + * This pass is intended to be used for any I/O which touches memory external + * to the shader or which is directly visible to the client. It requires that + * all data types in the given modes have a explicit stride/offset decorations + * to tell it exactly how to calculate the offset/address for the given load, + * store, or atomic operation. If the offset/stride information does not come + * from the client explicitly (as with shared variables in GL or Vulkan), + * nir_lower_vars_to_explicit_types() can be used to add them. + * + * Unlike nir_lower_io, this pass is fully capable of handling incomplete + * pointer chains which may contain cast derefs. 
It does so by walking the + * deref chain backwards and simply replacing each deref, one at a time, with + * the appropriate address calculation. The pass takes a nir_address_format + * parameter which describes how the offset or address is to be represented + * during calculations. By ensuring that the address is always in a + * consistent format, pointers can safely be conjured from thin air by the + * driver, stored to variables, passed through phis, etc. + * + * The one exception to the simple algorithm described above is for handling + * row-major matrices in which case we may look down one additional level of + * the deref chain. + */ bool nir_lower_explicit_io(nir_shader *shader, nir_variable_mode modes, nir_address_format addr_format) @@ -1191,6 +1634,219 @@ nir_lower_explicit_io(nir_shader *shader, nir_variable_mode modes, return progress; } +static bool +nir_lower_vars_to_explicit_types_impl(nir_function_impl *impl, + nir_variable_mode modes, + glsl_type_size_align_func type_info) +{ + bool progress = false; + + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_deref) + continue; + + nir_deref_instr *deref = nir_instr_as_deref(instr); + if (!(deref->mode & modes)) + continue; + + unsigned size, alignment; + const struct glsl_type *new_type = + glsl_get_explicit_type_for_size_align(deref->type, type_info, &size, &alignment); + if (new_type != deref->type) { + progress = true; + deref->type = new_type; + } + if (deref->deref_type == nir_deref_type_cast) { + /* See also glsl_type::get_explicit_type_for_size_align() */ + unsigned new_stride = align(size, alignment); + if (new_stride != deref->cast.ptr_stride) { + deref->cast.ptr_stride = new_stride; + progress = true; + } + } + } + } + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs | + nir_metadata_loop_analysis); + } + + return progress; +} + +static bool 
+lower_vars_to_explicit(nir_shader *shader, + struct exec_list *vars, nir_variable_mode mode, + glsl_type_size_align_func type_info) +{ + bool progress = false; + unsigned offset; + switch (mode) { + case nir_var_function_temp: + case nir_var_shader_temp: + offset = shader->scratch_size; + break; + case nir_var_mem_shared: + offset = 0; + break; + case nir_var_mem_constant: + offset = shader->constant_data_size; + break; + default: + unreachable("Unsupported mode"); + } + nir_foreach_variable_in_list(var, vars) { + if (var->data.mode != mode) + continue; + + unsigned size, align; + const struct glsl_type *explicit_type = + glsl_get_explicit_type_for_size_align(var->type, type_info, &size, &align); + + if (explicit_type != var->type) + var->type = explicit_type; + + var->data.driver_location = ALIGN_POT(offset, align); + offset = var->data.driver_location + size; + progress = true; + } + + switch (mode) { + case nir_var_shader_temp: + case nir_var_function_temp: + shader->scratch_size = offset; + break; + case nir_var_mem_shared: + shader->info.cs.shared_size = offset; + shader->shared_size = offset; + break; + case nir_var_mem_constant: + shader->constant_data_size = offset; + break; + default: + unreachable("Unsupported mode"); + } + + return progress; +} + +bool +nir_lower_vars_to_explicit_types(nir_shader *shader, + nir_variable_mode modes, + glsl_type_size_align_func type_info) +{ + /* TODO: Situations which need to be handled to support more modes: + * - row-major matrices + * - compact shader inputs/outputs + * - interface types + */ + ASSERTED nir_variable_mode supported = nir_var_mem_shared | + nir_var_shader_temp | nir_var_function_temp; + assert(!(modes & ~supported) && "unsupported"); + + bool progress = false; + + if (modes & nir_var_mem_shared) + progress |= lower_vars_to_explicit(shader, &shader->variables, nir_var_mem_shared, type_info); + if (modes & nir_var_shader_temp) + progress |= lower_vars_to_explicit(shader, &shader->variables, 
nir_var_shader_temp, type_info); + + nir_foreach_function(function, shader) { + if (function->impl) { + if (modes & nir_var_function_temp) + progress |= lower_vars_to_explicit(shader, &function->impl->locals, nir_var_function_temp, type_info); + + progress |= nir_lower_vars_to_explicit_types_impl(function->impl, modes, type_info); + } + } + + return progress; +} + +static void +write_constant(void *dst, const nir_constant *c, const struct glsl_type *type) +{ + if (glsl_type_is_vector_or_scalar(type)) { + const unsigned num_components = glsl_get_vector_elements(type); + const unsigned bit_size = glsl_get_bit_size(type); + if (bit_size == 1) { + /* Booleans are special-cased to be 32-bit + * + * TODO: Make the native bool bit_size an option. + */ + for (unsigned i = 0; i < num_components; i++) { + int32_t b32 = -(int)c->values[i].b; + memcpy((char *)dst + i * 4, &b32, 4); + } + } else { + assert(bit_size >= 8 && bit_size % 8 == 0); + const unsigned byte_size = bit_size / 8; + for (unsigned i = 0; i < num_components; i++) { + /* Annoyingly, thanks to packed structs, we can't make any + * assumptions about the alignment of dst. To avoid any strange + * issues with unaligned writes, we always use memcpy. 
+ */ + memcpy((char *)dst + i * byte_size, &c->values[i], byte_size); + } + } + } else if (glsl_type_is_array_or_matrix(type)) { + const unsigned array_len = glsl_get_length(type); + const unsigned stride = glsl_get_explicit_stride(type); + assert(stride > 0); + const struct glsl_type *elem_type = glsl_get_array_element(type); + for (unsigned i = 0; i < array_len; i++) + write_constant((char *)dst + i * stride, c->elements[i], elem_type); + } else { + assert(glsl_type_is_struct_or_ifc(type)); + const unsigned num_fields = glsl_get_length(type); + for (unsigned i = 0; i < num_fields; i++) { + const int field_offset = glsl_get_struct_field_offset(type, i); + assert(field_offset >= 0); + const struct glsl_type *field_type = glsl_get_struct_field(type, i); + write_constant((char *)dst + field_offset, c->elements[i], field_type); + } + } +} + +bool +nir_lower_mem_constant_vars(nir_shader *shader, + glsl_type_size_align_func type_info) +{ + bool progress = false; + + unsigned old_constant_data_size = shader->constant_data_size; + if (lower_vars_to_explicit(shader, &shader->variables, + nir_var_mem_constant, type_info)) { + assert(shader->constant_data_size > old_constant_data_size); + shader->constant_data = rerzalloc_size(shader, shader->constant_data, + old_constant_data_size, + shader->constant_data_size); + + nir_foreach_variable_with_modes(var, shader, nir_var_mem_constant) { + write_constant((char *)shader->constant_data + + var->data.driver_location, + var->constant_initializer, var->type); + } + progress = true; + } + + nir_foreach_function(function, shader) { + if (!function->impl) + continue; + + if (nir_lower_vars_to_explicit_types_impl(function->impl, + nir_var_mem_constant, + type_info)) + progress = true; + } + + return progress; +} + /** * Return the offset source for a load/store intrinsic. 
*/ @@ -1203,11 +1859,41 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr) case nir_intrinsic_load_shared: case nir_intrinsic_load_uniform: case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: case nir_intrinsic_load_scratch: case nir_intrinsic_load_fs_input_interp_deltas: + case nir_intrinsic_shared_atomic_add: + case nir_intrinsic_shared_atomic_and: + case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_exchange: + case nir_intrinsic_shared_atomic_fadd: + case nir_intrinsic_shared_atomic_fcomp_swap: + case nir_intrinsic_shared_atomic_fmax: + case nir_intrinsic_shared_atomic_fmin: + case nir_intrinsic_shared_atomic_imax: + case nir_intrinsic_shared_atomic_imin: + case nir_intrinsic_shared_atomic_or: + case nir_intrinsic_shared_atomic_umax: + case nir_intrinsic_shared_atomic_umin: + case nir_intrinsic_shared_atomic_xor: + case nir_intrinsic_global_atomic_add: + case nir_intrinsic_global_atomic_and: + case nir_intrinsic_global_atomic_comp_swap: + case nir_intrinsic_global_atomic_exchange: + case nir_intrinsic_global_atomic_fadd: + case nir_intrinsic_global_atomic_fcomp_swap: + case nir_intrinsic_global_atomic_fmax: + case nir_intrinsic_global_atomic_fmin: + case nir_intrinsic_global_atomic_imax: + case nir_intrinsic_global_atomic_imin: + case nir_intrinsic_global_atomic_or: + case nir_intrinsic_global_atomic_umax: + case nir_intrinsic_global_atomic_umin: + case nir_intrinsic_global_atomic_xor: return &instr->src[0]; case nir_intrinsic_load_ubo: case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_input_vertex: case nir_intrinsic_load_per_vertex_input: case nir_intrinsic_load_per_vertex_output: case nir_intrinsic_load_interpolated_input: @@ -1215,6 +1901,20 @@ nir_get_io_offset_src(nir_intrinsic_instr *instr) case nir_intrinsic_store_shared: case nir_intrinsic_store_global: case nir_intrinsic_store_scratch: + case nir_intrinsic_ssbo_atomic_add: + case nir_intrinsic_ssbo_atomic_imin: + case 
nir_intrinsic_ssbo_atomic_umin: + case nir_intrinsic_ssbo_atomic_imax: + case nir_intrinsic_ssbo_atomic_umax: + case nir_intrinsic_ssbo_atomic_and: + case nir_intrinsic_ssbo_atomic_or: + case nir_intrinsic_ssbo_atomic_xor: + case nir_intrinsic_ssbo_atomic_exchange: + case nir_intrinsic_ssbo_atomic_comp_swap: + case nir_intrinsic_ssbo_atomic_fadd: + case nir_intrinsic_ssbo_atomic_fmin: + case nir_intrinsic_ssbo_atomic_fmax: + case nir_intrinsic_ssbo_atomic_fcomp_swap: return &instr->src[1]; case nir_intrinsic_store_ssbo: case nir_intrinsic_store_per_vertex_output: @@ -1253,7 +1953,10 @@ nir_address_format_null_value(nir_address_format addr_format) [nir_address_format_64bit_global] = {{0}}, [nir_address_format_64bit_bounded_global] = {{0}}, [nir_address_format_32bit_index_offset] = {{.u32 = ~0}, {.u32 = ~0}}, + [nir_address_format_32bit_index_offset_pack64] = {{.u64 = ~0ull}}, + [nir_address_format_vec2_index_32bit_offset] = {{.u32 = ~0}, {.u32 = ~0}, {.u32 = ~0}}, [nir_address_format_32bit_offset] = {{.u32 = ~0}}, + [nir_address_format_32bit_offset_as_64bit] = {{.u64 = ~0ull}}, [nir_address_format_logical] = {{.u32 = ~0}}, }; @@ -1270,9 +1973,18 @@ nir_build_addr_ieq(nir_builder *b, nir_ssa_def *addr0, nir_ssa_def *addr1, case nir_address_format_64bit_global: case nir_address_format_64bit_bounded_global: case nir_address_format_32bit_index_offset: + case nir_address_format_vec2_index_32bit_offset: case nir_address_format_32bit_offset: return nir_ball_iequal(b, addr0, addr1); + case nir_address_format_32bit_offset_as_64bit: + assert(addr0->num_components == 1 && addr1->num_components == 1); + return nir_ieq(b, nir_u2u32(b, addr0), nir_u2u32(b, addr1)); + + case nir_address_format_32bit_index_offset_pack64: + assert(addr0->num_components == 1 && addr1->num_components == 1); + return nir_ball_iequal(b, nir_unpack_64_2x32(b, addr0), nir_unpack_64_2x32(b, addr1)); + case nir_address_format_logical: unreachable("Unsupported address format"); } @@ -1288,10 +2000,16 @@ 
nir_build_addr_isub(nir_builder *b, nir_ssa_def *addr0, nir_ssa_def *addr1, case nir_address_format_32bit_global: case nir_address_format_64bit_global: case nir_address_format_32bit_offset: + case nir_address_format_32bit_index_offset_pack64: assert(addr0->num_components == 1); assert(addr1->num_components == 1); return nir_isub(b, addr0, addr1); + case nir_address_format_32bit_offset_as_64bit: + assert(addr0->num_components == 1); + assert(addr1->num_components == 1); + return nir_u2u64(b, nir_isub(b, nir_u2u32(b, addr0), nir_u2u32(b, addr1))); + case nir_address_format_64bit_bounded_global: return nir_isub(b, addr_to_global(b, addr0, addr_format), addr_to_global(b, addr1, addr_format)); @@ -1302,6 +2020,12 @@ nir_build_addr_isub(nir_builder *b, nir_ssa_def *addr0, nir_ssa_def *addr1, /* Assume the same buffer index. */ return nir_isub(b, nir_channel(b, addr0, 1), nir_channel(b, addr1, 1)); + case nir_address_format_vec2_index_32bit_offset: + assert(addr0->num_components == 3); + assert(addr1->num_components == 3); + /* Assume the same buffer index. 
*/ + return nir_isub(b, nir_channel(b, addr0, 2), nir_channel(b, addr1, 2)); + case nir_address_format_logical: unreachable("Unsupported address format"); } @@ -1327,6 +2051,17 @@ is_output(nir_intrinsic_instr *intrin) intrin->intrinsic == nir_intrinsic_store_per_vertex_output; } +static bool is_dual_slot(nir_intrinsic_instr *intrin) +{ + if (intrin->intrinsic == nir_intrinsic_store_output || + intrin->intrinsic == nir_intrinsic_store_per_vertex_output) { + return nir_src_bit_size(intrin->src[0]) == 64 && + nir_src_num_components(intrin->src[0]) >= 3; + } + + return nir_dest_bit_size(intrin->dest) == 64 && + nir_dest_num_components(intrin->dest) >= 3; +} /** * This pass adds constant offsets to instr->const_index[0] for input/output @@ -1353,7 +2088,16 @@ add_const_offset_to_base_block(nir_block *block, nir_builder *b, nir_src *offset = nir_get_io_offset_src(intrin); if (nir_src_is_const(*offset)) { - intrin->const_index[0] += nir_src_as_uint(*offset); + unsigned off = nir_src_as_uint(*offset); + + nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) + off); + + nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); + sem.location += off; + /* non-indirect indexing should reduce num_slots */ + sem.num_slots = is_dual_slot(intrin) ? 2 : 1; + nir_intrinsic_set_io_semantics(intrin, sem); + b->cursor = nir_before_instr(&intrin->instr); nir_instr_rewrite_src(&intrin->instr, offset, nir_src_for_ssa(nir_imm_int(b, 0)));