freedreno/ir3: image support
authorRob Clark <robdclark@gmail.com>
Thu, 9 Nov 2017 15:57:55 +0000 (10:57 -0500)
committerRob Clark <robdclark@gmail.com>
Sun, 12 Nov 2017 17:28:59 +0000 (12:28 -0500)
Signed-off-by: Rob Clark <robdclark@gmail.com>
src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
src/gallium/drivers/freedreno/ir3/ir3_legalize.c
src/gallium/drivers/freedreno/ir3/ir3_nir.c
src/gallium/drivers/freedreno/ir3/ir3_shader.c
src/gallium/drivers/freedreno/ir3/ir3_shader.h

index 640805a4f68f980f6ed1c1248b8ac74f2071e658..bd3e0d0cd4a224c0b53ea0caab57c9925e94f4ea 100644 (file)
@@ -254,6 +254,12 @@ compile_init(struct ir3_compiler *compiler,
                constoff += align(cnt, 4) / 4;
        }
 
+       if (so->const_layout.image_dims.count > 0) {
+               unsigned cnt = so->const_layout.image_dims.count;
+               so->constbase.image_dims = constoff;
+               constoff += align(cnt, 4) / 4;
+       }
+
        unsigned num_driver_params = 0;
        if (so->type == SHADER_VERTEX) {
                num_driver_params = IR3_DP_VS_COUNT;
@@ -1575,6 +1581,254 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
        return atomic;
 }
 
+/* Images get mapped into SSBO/image state (for store/atomic) and texture
+ * state block (for load).  To simplify things, invert the image id and
+ * map it from end of state block, ie. image 0 becomes num-1, image 1
+ * becomes num-2, etc.  This potentially avoids needing to re-emit texture
+ * state when switching shaders.
+ *
+ * TODO: is the max # of samplers the same as the max # of SSBOs?  This
+ * shouldn't be hard-coded.  Also, since all the gl shader stages (ie.
+ * everything but CS) share the same SSBO/image state block, this might
+ * require some more logic if we supported images in anything other
+ * than FS..
+ */
+/* Map an image variable to its texture/SSBO slot.  Per the comment above,
+ * the image index is inverted so that images are allocated from the end
+ * of the state block (image 0 -> max-1, image 1 -> max-2, ...).
+ */
+static unsigned
+get_image_slot(struct ir3_context *ctx, const nir_variable *var)
+{
+       /* TODO figure out real limit per generation, and don't hardcode: */
+       const unsigned max_samplers = 16;
+       return max_samplers - var->data.driver_location - 1;
+}
+
+/* Return the number of coordinate components needed to address the given
+ * image: 1 for 1D/buffer, 2 for 2D-ish dims, 3 for 3D/cube (cube images
+ * are addressed as a 2D array, ie. face is the third coordinate).
+ */
+static unsigned
+get_image_coords(const nir_variable *var)
+{
+       switch (glsl_get_sampler_dim(glsl_without_array(var->type))) {
+       case GLSL_SAMPLER_DIM_1D:
+       case GLSL_SAMPLER_DIM_BUF:
+               return 1;
+       case GLSL_SAMPLER_DIM_2D:
+       case GLSL_SAMPLER_DIM_RECT:
+       case GLSL_SAMPLER_DIM_EXTERNAL:
+       case GLSL_SAMPLER_DIM_MS:
+               return 2;
+       case GLSL_SAMPLER_DIM_3D:
+       case GLSL_SAMPLER_DIM_CUBE:
+               return 3;
+       default:
+               unreachable("bad sampler dim");
+               return 0;  /* quiet compilers where unreachable() is a no-op */
+       }
+}
+
+/* Translate the image's GLSL sampler result type to the ir3 register
+ * type used for loads/stores/atomics on that image.
+ */
+static type_t
+get_image_type(const nir_variable *var)
+{
+       switch (glsl_get_sampler_result_type(glsl_without_array(var->type))) {
+       case GLSL_TYPE_UINT:
+               return TYPE_U32;
+       case GLSL_TYPE_INT:
+               return TYPE_S32;
+       case GLSL_TYPE_FLOAT:
+               return TYPE_F32;
+       default:
+               unreachable("bad sampler type.");
+               return 0;  /* quiet compilers where unreachable() is a no-op */
+       }
+}
+
+/* Compute the linear offset of a texel from its coordinates, using the
+ * per-image dims consts (cpp / y pitch / z pitch) uploaded by the driver
+ * (see image_dims in ir3_driver_const_layout).  Returns a 2-component
+ * collect holding a 64b offset (high 32 bits are always 0), as expected
+ * by the cat6 store/atomic instructions.  If !byteoff the byte offset is
+ * converted to a dword offset (shr by 2), matching blob behavior for
+ * atomics.
+ */
+static struct ir3_instruction *
+get_image_offset(struct ir3_context *ctx, const nir_variable *var,
+               struct ir3_instruction * const *coords, bool byteoff)
+{
+       struct ir3_block *b = ctx->block;
+       struct ir3_instruction *offset;
+       unsigned ncoords = get_image_coords(var);
+
+       /* to calculate the byte offset (yes, uggg) we need (up to) three
+        * const values to know the bytes per pixel, and y and z stride:
+        */
+       unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+               ctx->so->const_layout.image_dims.off[var->data.driver_location];
+
+       /* the dims consts for this image must have been allocated by
+        * ir3_nir_scan_driver_consts:
+        */
+       debug_assert(ctx->so->const_layout.image_dims.mask &
+                       (1 << var->data.driver_location));
+
+       /* offset = coords.x * bytes_per_pixel: */
+       offset = ir3_MUL_S(b, coords[0], 0, create_uniform(ctx, cb + 0), 0);
+       if (ncoords > 1) {
+               /* offset += coords.y * y_pitch: */
+               offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 1), 0,
+                               coords[1], 0, offset, 0);
+       }
+       if (ncoords > 2) {
+               /* offset += coords.z * z_pitch: */
+               offset = ir3_MAD_S24(b, create_uniform(ctx, cb + 2), 0,
+                               coords[2], 0, offset, 0);
+       }
+
+       if (!byteoff) {
+               /* Some cases, like atomics, seem to use dword offset instead
+                * of byte offsets.. blob just puts an extra shr.b in there
+                * in those cases:
+                */
+               offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
+       }
+
+       /* pad offset out to 64b (high word zero): */
+       return create_collect(b, (struct ir3_instruction*[]){
+               offset,
+               create_immed(b, 0),
+       }, 2);
+}
+
+/* src[] = { coord, sample_index }. const_index[] = {}
+ *
+ * Image loads go through the texture path (isam), using the inverted
+ * slot from get_image_slot() as both tex and samp index.  NOTE(review):
+ * sample_index (src[1]) is currently ignored — confirm MS images are
+ * not expected to work yet.
+ */
+static void
+emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       struct ir3_block *b = ctx->block;
+       const nir_variable *var = intr->variables[0]->var;
+       struct ir3_instruction *sam;
+       struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+       unsigned ncoords = get_image_coords(var);
+       unsigned tex_idx = get_image_slot(ctx, var);
+       type_t type = get_image_type(var);
+       unsigned flags = 0;
+
+       if (ncoords == 3)
+               flags |= IR3_INSTR_3D;
+
+       /* always fetch all four components; split_dest wires up the ones
+        * the shader actually consumes:
+        */
+       sam = ir3_SAM(b, OPC_ISAM, type, TGSI_WRITEMASK_XYZW, flags,
+                       tex_idx, tex_idx, create_collect(b, coords, ncoords), NULL);
+
+       split_dest(b, dst, sam, 0, 4);
+}
+
+/* src[] = { coord, sample_index, value }. const_index[] = {}
+ *
+ * Image stores use the typed stib (store-image) cat6 instruction, which
+ * needs both the coords and a 64b byte offset computed from the dims
+ * consts.  The result has no dest, so the instruction must be added to
+ * the block's keeps to survive DCE.
+ */
+static void
+emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       const nir_variable *var = intr->variables[0]->var;
+       struct ir3_instruction *stib, *offset;
+       struct ir3_instruction * const *value = get_src(ctx, &intr->src[2]);
+       struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+       unsigned ncoords = get_image_coords(var);
+       unsigned tex_idx = get_image_slot(ctx, var);
+
+       /* src0 is value
+        * src1 is coords
+        * src2 is 64b byte offset
+        */
+
+       offset = get_image_offset(ctx, var, coords, true);
+
+       /* NOTE: stib seems to take byte offset, but stgb.typed can be used
+        * too and takes a dword offset.. not quite sure yet why blob uses
+        * one over the other in various cases.
+        */
+
+       /* value is always stored as a full vec4: */
+       stib = ir3_STIB(b, create_immed(b, tex_idx), 0,
+                       create_collect(b, value, 4), 0,
+                       create_collect(b, coords, ncoords), 0,
+                       offset, 0);
+       stib->cat6.iim_val = 4;     /* # of components stored */
+       stib->cat6.d = ncoords;     /* coordinate dimensionality */
+       stib->cat6.type = get_image_type(var);
+       stib->cat6.typed = true;
+       mark_write(ctx, stib);
+
+       /* no dest, so keep the side-effecting instruction alive: */
+       array_insert(b, b->keeps, stib);
+}
+
+/* Query image dimensions via getsize (same instruction used for
+ * textureSize()), at lod 0.
+ */
+static void
+emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+               struct ir3_instruction **dst)
+{
+       struct ir3_block *b = ctx->block;
+       const nir_variable *var = intr->variables[0]->var;
+       unsigned ncoords = get_image_coords(var);
+       unsigned tex_idx = get_image_slot(ctx, var);
+       struct ir3_instruction *sam, *lod;
+       unsigned flags = 0;
+
+       if (ncoords == 3)
+               flags = IR3_INSTR_3D;
+
+       lod = create_immed(b, 0);
+       sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
+                       tex_idx, tex_idx, lod, NULL);
+
+       /* only the first ncoords components are meaningful: */
+       split_dest(b, dst, sam, 0, ncoords);
+}
+
+/* src[] = { coord, sample_index, value, compare }. const_index[] = {}
+ *
+ * Image atomics use the global cat6 atomic instructions with the image
+ * slot immediate, coords, and a dword (not byte) offset; see the note
+ * in get_image_offset().  Returns the pre-op value of the memory
+ * location.
+ */
+static struct ir3_instruction *
+emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
+{
+       struct ir3_block *b = ctx->block;
+       const nir_variable *var = intr->variables[0]->var;
+       struct ir3_instruction *atomic, *image, *src0, *src1, *src2;
+       struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
+       unsigned ncoords = get_image_coords(var);
+
+       image = create_immed(b, get_image_slot(ctx, var));
+
+       /* src0 is value (or uvec2(value, compare))
+        * src1 is coords
+        * src2 is 64b byte offset
+        */
+       src0 = get_src(ctx, &intr->src[2])[0];
+       src1 = create_collect(b, coords, ncoords);
+       src2 = get_image_offset(ctx, var, coords, false);
+
+       switch (intr->intrinsic) {
+       case nir_intrinsic_image_atomic_add:
+               atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_atomic_min:
+               atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_atomic_max:
+               atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_atomic_and:
+               atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_atomic_or:
+               atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_atomic_xor:
+               atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_atomic_exchange:
+               atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       case nir_intrinsic_image_atomic_comp_swap:
+               /* for cmpxchg, src0 is [ui]vec2(data, compare): */
+               src0 = create_collect(b, (struct ir3_instruction*[]){
+                       src0,
+                       get_src(ctx, &intr->src[3])[0],
+               }, 2);
+               atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
+               break;
+       default:
+               unreachable("boo");
+       }
+
+       atomic->cat6.iim_val = 1;   /* # of components */
+       atomic->cat6.d = ncoords;   /* coordinate dimensionality */
+       atomic->cat6.type = get_image_type(var);
+       atomic->cat6.typed = true;
+       mark_write(ctx, atomic);
+
+       /* even if nothing consume the result, we can't DCE the instruction: */
+       array_insert(b, b->keeps, atomic);
+
+       return atomic;
+}
+
 static void
 emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
@@ -1747,6 +2001,25 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
        case nir_intrinsic_shared_atomic_comp_swap:
                dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
                break;
+       case nir_intrinsic_image_load:
+               emit_intrinsic_load_image(ctx, intr, dst);
+               break;
+       case nir_intrinsic_image_store:
+               emit_intrinsic_store_image(ctx, intr);
+               break;
+       case nir_intrinsic_image_size:
+               emit_intrinsic_image_size(ctx, intr, dst);
+               break;
+       case nir_intrinsic_image_atomic_add:
+       case nir_intrinsic_image_atomic_min:
+       case nir_intrinsic_image_atomic_max:
+       case nir_intrinsic_image_atomic_and:
+       case nir_intrinsic_image_atomic_or:
+       case nir_intrinsic_image_atomic_xor:
+       case nir_intrinsic_image_atomic_exchange:
+       case nir_intrinsic_image_atomic_comp_swap:
+               dst[0] = emit_intrinsic_atomic_image(ctx, intr);
+               break;
        case nir_intrinsic_barrier:
        case nir_intrinsic_memory_barrier:
        case nir_intrinsic_group_memory_barrier:
index a206837ef8431dcd26aba15701b206f3a1941f90..3f12b68ada19838f06008afa978003177e6c08ff 100644 (file)
@@ -187,6 +187,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
                         */
                        ctx->has_samp = true;
                        regmask_set(&needs_sy, n->regs[0]);
+               } else if (n->opc == OPC_RESINFO) {
+                       regmask_set(&needs_ss, n->regs[0]);
+                       ir3_NOP(block)->flags |= IR3_INSTR_SS;
                } else if (is_load(n)) {
                        /* seems like ldlv needs (ss) bit instead??  which is odd but
                         * makes a bunch of flat-varying tests start working on a4xx.
index 19d05b462e5e5bde2528a9ec7c5b375618d311b5..7dd24e5f4eeb64f796f6530179a77cd7fd5f3dbc 100644 (file)
@@ -237,6 +237,15 @@ ir3_nir_scan_driver_consts(nir_shader *shader,
                                                layout->ssbo_size.count;
                                        layout->ssbo_size.count += 1; /* one const per */
                                        break;
+                               case nir_intrinsic_image_store:
+                                       idx = intr->variables[0]->var->data.driver_location;
+                                       if (layout->image_dims.mask & (1 << idx))
+                                               break;
+                                       layout->image_dims.mask |= (1 << idx);
+                                       /* record where this image's consts start; this must
+                                        * go in image_dims.off[] (not ssbo_size.off[]) since
+                                        * that is what get_image_offset()/emit_image_dims()
+                                        * read back:
+                                        */
+                                       layout->image_dims.off[idx] =
+                                               layout->image_dims.count;
+                                       layout->image_dims.count += 3; /* three const per */
+                                       break;
                                default:
                                        break;
                                }
index 26f291de89430e8d199fc53b89bdabba1dd4c9c0..61a336ed7ddb64a6dcc8800885c1eb69c43304fb 100644 (file)
@@ -627,6 +627,38 @@ emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
        }
 }
 
+/* Upload the per-image dims consts (cpp, y pitch, z pitch — see
+ * image_dims in ir3_driver_const_layout) for every image that the
+ * shader does an image_store/atomic to.
+ */
+static void
+emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
+               struct fd_ringbuffer *ring, struct fd_shaderimg_stateobj *si)
+{
+       uint32_t offset = v->constbase.image_dims;
+       /* also require count > 0 to avoid declaring a zero-length VLA
+        * (undefined behavior) when no image needs dims consts:
+        */
+       if ((v->const_layout.image_dims.count > 0) && (v->constlen > offset)) {
+               uint32_t dims[align(v->const_layout.image_dims.count, 4)];
+               unsigned mask = v->const_layout.image_dims.mask;
+
+               /* zero-fill so alignment padding and the unused pitch slots
+                * of PIPE_BUFFER images don't emit uninitialized stack
+                * garbage to the GPU:
+                */
+               for (unsigned i = 0; i < ARRAY_SIZE(dims); i++)
+                       dims[i] = 0;
+
+               while (mask) {
+                       struct pipe_image_view *img;
+                       struct fd_resource *rsc;
+                       unsigned index = u_bit_scan(&mask);
+                       unsigned off = v->const_layout.image_dims.off[index];
+
+                       /* NOTE(review): assumes an image is actually bound for
+                        * every bit in the mask (img->resource != NULL) — verify
+                        * state tracker guarantees this:
+                        */
+                       img = &si->si[index];
+                       rsc = fd_resource(img->resource);
+
+                       dims[off + 0] = rsc->cpp;
+                       if (img->resource->target != PIPE_BUFFER) {
+                               unsigned lvl = img->u.tex.level;
+                               dims[off + 1] = rsc->slices[lvl].pitch * rsc->cpp;
+                               dims[off + 2] = rsc->slices[lvl].size0;
+                       }
+               }
+
+               fd_wfi(ctx->batch, ring);
+               ctx->emit_const(ring, v->type, offset * 4,
+                       0, ARRAY_SIZE(dims), dims, NULL);
+       }
+}
+
 static void
 emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
                struct fd_ringbuffer *ring)
@@ -752,6 +784,11 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
                struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[t];
                emit_ssbo_sizes(ctx, v, ring, sb);
        }
+
+       if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE)) {
+               struct fd_shaderimg_stateobj *si = &ctx->shaderimg[t];
+               emit_image_dims(ctx, v, ring, si);
+       }
 }
 
 void
index dd68e69d16cae4afa08b1f4e354feac7720ec7b7..3886cce55715d116fd1e5c403463045ec5faf811 100644 (file)
@@ -63,6 +63,9 @@ enum ir3_driver_param {
  *
  *   + SSBO sizes: only needed if shader has a get_buffer_size intrinsic
  *     for a given SSBO
+ *
+ *   + Image dimensions: needed to calculate pixel offset, but only for
+ *     images that have an image_store intrinsic
  */
 struct ir3_driver_const_layout {
        struct {
@@ -74,6 +77,17 @@ struct ir3_driver_const_layout {
                 */
                uint32_t off[PIPE_MAX_SHADER_BUFFERS];
        } ssbo_size;
+
+       struct {
+               uint32_t mask;  /* bitmask of images that have image_store */
+               uint32_t count; /* number of consts allocated */
+               /* three const allocated per image which has image_store:
+                *  + cpp         (bytes per pixel)
+                *  + pitch       (y pitch)
+                *  + array_pitch (z pitch)
+                */
+               uint32_t off[PIPE_MAX_SHADER_IMAGES];
+       } image_dims;
 };
 
 /* Configuration key used to identify a shader variant.. different
@@ -295,6 +309,7 @@ struct ir3_shader_variant {
                unsigned ubo;
                /* NOTE that a3xx might need a section for SSBO addresses too */
                unsigned ssbo_sizes;
+               unsigned image_dims;
                unsigned driver_param;
                unsigned tfbo;
                unsigned immediate;