From: Rob Clark
Date: Thu, 9 Nov 2017 15:57:55 +0000 (-0500)
Subject: freedreno/ir3: image support
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=9edfc369c04d131b664f6c94a0e249a81a5c0da5;p=mesa.git

freedreno/ir3: image support

Signed-off-by: Rob Clark
---

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 640805a4f68..bd3e0d0cd4a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -254,6 +254,12 @@ compile_init(struct ir3_compiler *compiler,
 		constoff += align(cnt, 4) / 4;
 	}
 
+	if (so->const_layout.image_dims.count > 0) {
+		unsigned cnt = so->const_layout.image_dims.count;
+		so->constbase.image_dims = constoff;
+		constoff += align(cnt, 4) / 4;
+	}
+
 	unsigned num_driver_params = 0;
 	if (so->type == SHADER_VERTEX) {
 		num_driver_params = IR3_DP_VS_COUNT;
@@ -1575,6 +1581,270 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 	return atomic;
 }
 
+/* Images get mapped into SSBO/image state (for store/atomic) and texture
+ * state block (for load).  To simplify things, invert the image id and
+ * map it from end of state block, ie. image 0 becomes num-1, image 1
+ * becomes num-2, etc.  This potentially avoids needing to re-emit texture
+ * state when switching shaders.
+ *
+ * TODO is max # of samplers and SSBOs the same?  This shouldn't be hard-
+ * coded.  Also, since all the gl shader stages (ie. everything but CS)
+ * share the same SSBO/image state block, this might require some more
+ * logic if we supported images in anything other than FS..
+ */
+static unsigned
+get_image_slot(struct ir3_context *ctx, const nir_variable *var)
+{
+	/* TODO figure out real limit per generation, and don't hardcode: */
+	const unsigned max_samplers = 16;
+	/* slots count down from the top, so the image id must be in range: */
+	debug_assert(var->data.driver_location < max_samplers);
+	return max_samplers - var->data.driver_location - 1;
+}
+
+/* Number of coordinate components needed to address a texel of the image,
+ * derived from its sampler dimensionality.
+ */
+static unsigned
+get_image_coords(const nir_variable *var)
+{
+	switch (glsl_get_sampler_dim(glsl_without_array(var->type))) {
+	case GLSL_SAMPLER_DIM_1D:
+	case GLSL_SAMPLER_DIM_BUF:
+		return 1;
+	case GLSL_SAMPLER_DIM_2D:
+	case GLSL_SAMPLER_DIM_RECT:
+	case GLSL_SAMPLER_DIM_EXTERNAL:
+	case GLSL_SAMPLER_DIM_MS:
+		return 2;
+	case GLSL_SAMPLER_DIM_3D:
+	case GLSL_SAMPLER_DIM_CUBE:
+		return 3;
+	default:
+		unreachable("bad sampler dim");
+		return 0;
+	}
+}
+
+/* Map the image's GLSL result type onto the matching 32b ir3 type. */
+static type_t
+get_image_type(const nir_variable *var)
+{
+	switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) {
+	case GLSL_TYPE_UINT:
+		return TYPE_U32;
+	case GLSL_TYPE_INT:
+		return TYPE_S32;
+	case GLSL_TYPE_FLOAT:
+		return TYPE_F32;
+	default:
+		unreachable("bad sampler type.");
+		return 0;
+	}
+}
+
+/* Build the offset operand used by image store/atomic instructions:
+ *
+ *    offset = coords.x * cpp + coords.y * y_pitch + coords.z * z_pitch
+ *
+ * using the per-image consts at constbase.image_dims.  With !byteoff the
+ * result is converted to a dword offset.  Returned as a two component
+ * collect whose second component is zero.
+ */
+static struct ir3_instruction *
+get_image_offset(struct ir3_context *ctx, const nir_variable *var,
+		struct ir3_instruction * const *coords, bool byteoff)
+{
+	struct ir3_block *b = ctx->block;
+	struct ir3_instruction *offset;
+	unsigned ncoords = get_image_coords(var);
+
+	/* to calculate the byte offset (yes, uggg) we need (up to) three
+	 * const values to know the bytes per pixel, and y and z stride:
+	 */
+	unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+		ctx->so->const_layout.image_dims.off[var->data.driver_location];
+
+	debug_assert(ctx->so->const_layout.image_dims.mask &
+			(1 << var->data.driver_location));
+
+	/* offset = coords.x * bytes_per_pixel: */
+	offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0);
+	if (ncoords > 1) {
+		/* offset += coords.y * y_pitch: */
+		offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0,
+				coords[1], 0, offset, 0);
+	}
+	if (ncoords > 2) {
+		/* offset += coords.z * z_pitch: */
+		offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0,
+				coords[2], 0, offset, 0);
+	}
+
+	if (!byteoff) {
+		/* Some cases, like atomics, seem to use dword offset instead
+		 * of byte offsets.. blob just puts an extra shr.b in there
+		 * in those cases:
+		 */
+		offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+	}
+
+	return create_collect(b, (struct ir3_instruction*[]){
+		offset,
+		create_immed(b, 0),
+	}, 2);
+}
+
+/* src[] = { coord, sample_index }. const_index[] = {} */
+static void
+emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *b = ctx->block;
+	const nir_variable *var = intr->variables[0]->var;
+	struct ir3_instruction *sam;
+	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+	unsigned ncoords = get_image_coords(var);
+	unsigned tex_idx = get_image_slot(ctx, var);
+	type_t type = get_image_type(var);
+	unsigned flags = 0;
+
+	if (ncoords == 3)
+		flags |= IR3_INSTR_3D;
+
+	sam = ir3_SAM(b, OPC_ISAM, type, TGSI_WRITEMASK_XYZW, flags,
+			tex_idx, tex_idx, create_collect(b, coords, ncoords), NULL);
+
+	split_dest(b, dst, sam, 0, 4);
+}
+
+/* src[] = { coord, sample_index, value }. const_index[] = {} */
+static void
+emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	const nir_variable *var = intr->variables[0]->var;
+	struct ir3_instruction *stib, *offset;
+	struct ir3_instruction * const *value = get_src(ctx, &intr->src[2]);
+	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+	unsigned ncoords = get_image_coords(var);
+	unsigned tex_idx = get_image_slot(ctx, var);
+
+	/* src0 is value
+	 * src1 is coords
+	 * src2 is 64b byte offset
+	 */
+
+	offset = get_image_offset(ctx, var, coords, true);
+
+	/* NOTE: stib seems to take byte offset, but stgb.typed can be used
+	 * too and takes a dword offset.. not quite sure yet why blob uses
+	 * one over the other in various cases.
+	 */
+
+	stib = ir3_STIB(b, create_immed(b, tex_idx), 0,
+			create_collect(b, value, 4), 0,
+			create_collect(b, coords, ncoords), 0,
+			offset, 0);
+	stib->cat6.iim_val = 4;
+	stib->cat6.d = ncoords;
+	stib->cat6.type = get_image_type(var);
+	stib->cat6.typed = true;
+	mark_write(ctx, stib);
+
+	array_insert(b, b->keeps, stib);
+}
+
+/* Query the image size with getsize (lod 0); the first ncoords components
+ * of the result are written to dst.
+ */
+static void
+emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+		struct ir3_instruction **dst)
+{
+	struct ir3_block *b = ctx->block;
+	const nir_variable *var = intr->variables[0]->var;
+	unsigned ncoords = get_image_coords(var);
+	unsigned tex_idx = get_image_slot(ctx, var);
+	struct ir3_instruction *sam, *lod;
+	unsigned flags = 0;
+
+	if (ncoords == 3)
+		flags = IR3_INSTR_3D;
+
+	lod = create_immed(b, 0);
+	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
+			tex_idx, tex_idx, lod, NULL);
+
+	split_dest(b, dst, sam, 0, ncoords);
+}
+
+/* src[] = { coord, sample_index, value, compare }. const_index[] = {} */
+static struct ir3_instruction *
+emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir3_block *b = ctx->block;
+	const nir_variable *var = intr->variables[0]->var;
+	struct ir3_instruction *atomic, *image, *src0, *src1, *src2;
+	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+	unsigned ncoords = get_image_coords(var);
+
+	image = create_immed(b, get_image_slot(ctx, var));
+
+	/* src0 is value (or uvec2(value, compare))
+	 * src1 is coords
+	 * src2 is 64b byte offset
+	 */
+	src0 = get_src(ctx, &intr->src[2])[0];
+	src1 = create_collect(b, coords, ncoords);
+	src2 = get_image_offset(ctx, var, coords, false);
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_image_atomic_add:
+		atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_image_atomic_min:
+		atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_image_atomic_max:
+		atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_image_atomic_and:
+		atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_image_atomic_or:
+		atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_image_atomic_xor:
+		atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_image_atomic_exchange:
+		atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	case nir_intrinsic_image_atomic_comp_swap:
+		/* for cmpxchg, src0 is [ui]vec2(data, compare): */
+		src0 = create_collect(b, (struct ir3_instruction*[]){
+			src0,
+			get_src(ctx, &intr->src[3])[0],
+		}, 2);
+		atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+		break;
+	default:
+		unreachable("boo");
+	}
+
+	atomic->cat6.iim_val = 1;
+	atomic->cat6.d = ncoords;
+	atomic->cat6.type = get_image_type(var);
+	atomic->cat6.typed = true;
+	mark_write(ctx, atomic);
+
+	/* even if nothing consumes the result, we can't DCE the instruction: */
+	array_insert(b, b->keeps, atomic);
+
+	return atomic;
+}
+
 static void
 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
@@ -1747,6 +2017,25 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 	case nir_intrinsic_shared_atomic_comp_swap:
 		dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
 		break;
+	case nir_intrinsic_image_load:
+		emit_intrinsic_load_image(ctx, intr, dst);
+		break;
+	case nir_intrinsic_image_store:
+		emit_intrinsic_store_image(ctx, intr);
+		break;
+	case nir_intrinsic_image_size:
+		emit_intrinsic_image_size(ctx, intr, dst);
+		break;
+	case nir_intrinsic_image_atomic_add:
+	case nir_intrinsic_image_atomic_min:
+	case nir_intrinsic_image_atomic_max:
+	case nir_intrinsic_image_atomic_and:
+	case nir_intrinsic_image_atomic_or:
+	case nir_intrinsic_image_atomic_xor:
+	case nir_intrinsic_image_atomic_exchange:
+	case nir_intrinsic_image_atomic_comp_swap:
+		dst[0] = emit_intrinsic_atomic_image(ctx, intr);
+		break;
 	case nir_intrinsic_barrier:
 	case nir_intrinsic_memory_barrier:
 	case nir_intrinsic_group_memory_barrier:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index a206837ef84..3f12b68ada1 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -187,6 +187,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 			 */
 			ctx->has_samp = true;
 			regmask_set(&needs_sy, n->regs[0]);
+		} else if (n->opc == OPC_RESINFO) {
+			regmask_set(&needs_ss, n->regs[0]);
+			ir3_NOP(block)->flags |= IR3_INSTR_SS;
 		} else if (is_load(n)) {
 			/* seems like ldlv needs (ss) bit instead??  which is odd but
 			 * makes a bunch of flat-varying tests start working on a4xx.
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
index 19d05b462e5..7dd24e5f4ee 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -237,6 +237,27 @@ ir3_nir_scan_driver_consts(nir_shader *shader,
 						layout->ssbo_size.count;
 					layout->ssbo_size.count += 1; /* one const per */
 					break;
+				/* image stores and image atomics both compute a raw
+				 * offset from the image dimensions, so each needs the
+				 * three dims consts allocated:
+				 */
+				case nir_intrinsic_image_store:
+				case nir_intrinsic_image_atomic_add:
+				case nir_intrinsic_image_atomic_min:
+				case nir_intrinsic_image_atomic_max:
+				case nir_intrinsic_image_atomic_and:
+				case nir_intrinsic_image_atomic_or:
+				case nir_intrinsic_image_atomic_xor:
+				case nir_intrinsic_image_atomic_exchange:
+				case nir_intrinsic_image_atomic_comp_swap:
+					idx = intr->variables[0]->var->data.driver_location;
+					if (layout->image_dims.mask & (1 << idx))
+						break;
+					layout->image_dims.mask |= (1 << idx);
+					layout->image_dims.off[idx] =
+						layout->image_dims.count;
+					layout->image_dims.count += 3; /* three const per */
+					break;
 				default:
 					break;
 				}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 26f291de894..61a336ed7dd 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -627,6 +627,48 @@ emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
 	}
 }
 
+/* Upload the per-image dimension consts (cpp, y pitch, z pitch) that the
+ * shader uses to compute raw offsets for image store/atomic.  Only images
+ * flagged in const_layout.image_dims.mask are emitted.
+ */
+static void
+emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
+		struct fd_ringbuffer *ring, struct fd_shaderimg_stateobj *si)
+{
+	uint32_t offset = v->constbase.image_dims;
+	if (v->constlen > offset) {
+		uint32_t dims[align(v->const_layout.image_dims.count, 4)];
+		unsigned mask = v->const_layout.image_dims.mask;
+
+		/* zero-fill so that padding, and the pitch slots of buffer
+		 * images, are not uploaded as uninitialized stack data:
+		 */
+		for (unsigned i = 0; i < ARRAY_SIZE(dims); i++)
+			dims[i] = 0;
+
+		while (mask) {
+			struct pipe_image_view *img;
+			struct fd_resource *rsc;
+			unsigned index = u_bit_scan(&mask);
+			unsigned off = v->const_layout.image_dims.off[index];
+
+			img = &si->si[index];
+			rsc = fd_resource(img->resource);
+
+			dims[off + 0] = rsc->cpp;
+			if (img->resource->target != PIPE_BUFFER) {
+				unsigned lvl = img->u.tex.level;
+				dims[off + 1] = rsc->slices[lvl].pitch * rsc->cpp;
+				dims[off + 2] = rsc->slices[lvl].size0;
+			}
+		}
+
+		fd_wfi(ctx->batch, ring);
+		ctx->emit_const(ring, v->type, offset * 4,
+				0, ARRAY_SIZE(dims), dims, NULL);
+	}
+}
+
 static void
 emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
 		struct fd_ringbuffer *ring)
@@ -752,6 +794,11 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 		struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[t];
 		emit_ssbo_sizes(ctx, v, ring, sb);
 	}
+
+	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE)) {
+		struct fd_shaderimg_stateobj *si = &ctx->shaderimg[t];
+		emit_image_dims(ctx, v, ring, si);
+	}
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index dd68e69d16c..3886cce5571 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -63,6 +63,9 @@ enum ir3_driver_param {
  *
  *  + SSBO sizes: only needed if shader has a get_buffer_size intrinsic
  *    for a given SSBO
+ *
+ *  + Image dimensions: needed to calculate pixel offset, but only for
+ *    images that have an image_store or image atomic intrinsic
  */
 struct ir3_driver_const_layout {
 	struct {
@@ -74,6 +77,17 @@ struct ir3_driver_const_layout {
 		 */
 		uint32_t off[PIPE_MAX_SHADER_BUFFERS];
 	} ssbo_size;
+
+	struct {
+		uint32_t mask;  /* bitmask of images with image_store/atomic */
+		uint32_t count; /* number of consts allocated */
+		/* three const allocated per image with image_store/atomic:
+		 *  + cpp         (bytes per pixel)
+		 *  + pitch       (y pitch)
+		 *  + array_pitch (z pitch)
+		 */
+		uint32_t off[PIPE_MAX_SHADER_IMAGES];
+	} image_dims;
 };
 
 /* Configuration key used to identify a shader variant.. different
@@ -295,6 +309,7 @@ struct ir3_shader_variant {
 		unsigned ubo;
 		/* NOTE that a3xx might need a section for SSBO addresses too */
 		unsigned ssbo_sizes;
+		unsigned image_dims;
 		unsigned driver_param;
 		unsigned tfbo;
 		unsigned immediate;