vc4: Add kernel support for branching in shader validation.
author Eric Anholt <eric@anholt.net>
Tue, 15 Mar 2016 20:53:02 +0000 (13:53 -0700)
committer Eric Anholt <eric@anholt.net>
Wed, 13 Jul 2016 00:42:39 +0000 (17:42 -0700)
We're already checking that branch instructions are within the
contents of the shader and the proper PROG_END sequence is present.
The other thing we need in the presence of branching is to verify that
the shader doesn't overflow past the end of the uniforms stream.

To do that, we require that any basic block reading uniforms have the
following instructions at its start:

load_imm temp, <offset within uniform stream>
add unif_addr, temp, unif

The instructions are generated by userspace, and the kernel verifies
that the load_imm is of the expected offset, and that the add adds it
to a uniform.  We track which uniform in the stream that is, and at
draw call time fix up the uniform stream to have the address of the
start of the shader's uniforms for that draw call.

Signed-off-by: Eric Anholt <eric@anholt.net>
src/gallium/drivers/vc4/kernel/vc4_drv.h
src/gallium/drivers/vc4/kernel/vc4_validate.c
src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c

index 3dccdf377013e25cb5f7c6b4598426490c255dd1..90f45397d83110fa21324544bd63e442388e2d2f 100644 (file)
@@ -147,6 +147,9 @@ struct vc4_validated_shader_info
        uint32_t uniforms_src_size;
        uint32_t num_texture_samples;
        struct vc4_texture_sample_info *texture_samples;
+
+       uint32_t num_uniform_addr_offsets;
+       uint32_t *uniform_addr_offsets;
 };
 
 /* vc4_validate.c */
index c9e3934ab4b1a14bf544ec523038eb42d19dab73..4ef01108b797886c9ddd4b8e694ba29c0451d567 100644 (file)
@@ -800,7 +800,7 @@ validate_gl_shader_rec(struct drm_device *dev,
                uint32_t src_offset = *(uint32_t *)(pkt_u + o);
                uint32_t *texture_handles_u;
                void *uniform_data_u;
-               uint32_t tex;
+               uint32_t tex, uni;
 
                *(uint32_t *)(pkt_v + o) = bo[i]->paddr + src_offset;
 
@@ -838,6 +838,17 @@ validate_gl_shader_rec(struct drm_device *dev,
                        }
                }
 
+               /* Fill in the uniform slots that need this shader's
+                * start-of-uniforms address (used for resetting the uniform
+                * stream in the presence of control flow).
+                */
+               for (uni = 0;
+                    uni < validated_shader->num_uniform_addr_offsets;
+                    uni++) {
+                       uint32_t o = validated_shader->uniform_addr_offsets[uni];
+                       ((uint32_t *)exec->uniforms_v)[o] = exec->uniforms_p;
+               }
+
                *(uint32_t *)(pkt_v + o + 4) = exec->uniforms_p;
 
                exec->uniforms_u += validated_shader->uniforms_src_size;
index 0ea6d073a996ec99183ead2fec3c444156b1d5fa..82717ca554a7f988d3962ac3312e26b360bdf53d 100644 (file)
@@ -40,6 +40,8 @@
 #include "vc4_qpu.h"
 #include "vc4_qpu_defines.h"
 
+#define LIVE_REG_COUNT (32 + 32 + 4)
+
 struct vc4_shader_validation_state {
        /* Current IP being validated. */
        uint32_t ip;
@@ -58,8 +60,9 @@ struct vc4_shader_validation_state {
         *
         * This is used for the validation of direct address memory reads.
         */
-       uint32_t live_min_clamp_offsets[32 + 32 + 4];
-       bool live_max_clamp_regs[32 + 32 + 4];
+       uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
+       bool live_max_clamp_regs[LIVE_REG_COUNT];
+       uint32_t live_immediates[LIVE_REG_COUNT];
 
        /* Bitfield of which IPs are used as branch targets.
         *
@@ -67,6 +70,20 @@ struct vc4_shader_validation_state {
         * points and clearing the texturing/clamping state.
         */
        unsigned long *branch_targets;
+
+       /* Set when entering a basic block, and cleared when the uniform
+        * address update is found.  This is used to make sure that we don't
+        * read uniforms when the address is undefined.
+        */
+       bool needs_uniform_address_update;
+
+       /* Set when we find a backwards branch.  If the branch is backwards,
+        * the target is probably doing an address reset to read uniforms,
+        * and so we need to be sure that a uniforms address is present in the
+        * stream, even if the shader didn't need to read uniforms in later
+        * basic blocks.
+        */
+       bool needs_uniform_address_for_loop;
 };
 
 static uint32_t
@@ -228,8 +245,14 @@ check_tmu_write(struct vc4_validated_shader_info *validated_shader,
        /* Since direct uses a RADDR uniform reference, it will get counted in
         * check_instruction_reads()
         */
-       if (!is_direct)
+       if (!is_direct) {
+               if (validation_state->needs_uniform_address_update) {
+                       DRM_ERROR("Texturing with undefined uniform address\n");
+                       return false;
+               }
+
                validated_shader->uniforms_size += 4;
+       }
 
        if (submit) {
                if (!record_texture_sample(validated_shader,
@@ -243,6 +266,98 @@ check_tmu_write(struct vc4_validated_shader_info *validated_shader,
        return true;
 }
 
+static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
+{
+       uint32_t o = validated_shader->num_uniform_addr_offsets;
+       uint32_t num_uniforms = validated_shader->uniforms_size / 4;
+
+       validated_shader->uniform_addr_offsets =
+               krealloc(validated_shader->uniform_addr_offsets,
+                        (o + 1) *
+                        sizeof(*validated_shader->uniform_addr_offsets),
+                        GFP_KERNEL);
+       if (!validated_shader->uniform_addr_offsets)
+               return false;
+
+       validated_shader->uniform_addr_offsets[o] = num_uniforms;
+       validated_shader->num_uniform_addr_offsets++;
+
+       return true;
+}
+
+static bool
+validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
+                              struct vc4_shader_validation_state *validation_state,
+                              bool is_mul)
+{
+       uint64_t inst = validation_state->shader[validation_state->ip];
+       u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
+       u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+       u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+       u32 add_lri = raddr_add_a_to_live_reg_index(inst);
+       /* We want our reset to be pointing at whatever uniform follows the
+        * uniforms base address.
+        */
+       u32 expected_offset = validated_shader->uniforms_size + 4;
+
+       /* We only support absolute uniform address changes, and we
+        * require that they be in the current basic block before any
+        * of its uniform reads.
+        *
+        * One could potentially emit more efficient QPU code, by
+        * noticing that (say) an if statement does uniform control
+        * flow for all threads and that the if reads the same number
+        * of uniforms on each side.  However, this scheme is easy to
+        * validate so it's all we allow for now.
+        */
+
+       if (QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_NONE) {
+               DRM_ERROR("uniforms address change must be "
+                         "normal math\n");
+               return false;
+       }
+
+       if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
+               DRM_ERROR("Uniform address reset must be an ADD.\n");
+               return false;
+       }
+
+       if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
+               DRM_ERROR("Uniform address reset must be unconditional.\n");
+               return false;
+       }
+
+       if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
+           !(inst & QPU_PM)) {
+               DRM_ERROR("No packing allowed on uniforms reset\n");
+               return false;
+       }
+
+       if (add_lri == -1) {
+               DRM_ERROR("First argument of uniform address write must be "
+                         "an immediate value.\n");
+               return false;
+       }
+
+       if (validation_state->live_immediates[add_lri] != expected_offset) {
+               DRM_ERROR("Resetting uniforms with offset %db instead of %db\n",
+                         validation_state->live_immediates[add_lri],
+                         expected_offset);
+               return false;
+       }
+
+       if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+           !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
+               DRM_ERROR("Second argument of uniform address write must be "
+                         "a uniform.\n");
+               return false;
+       }
+
+       validation_state->needs_uniform_address_update = false;
+       validation_state->needs_uniform_address_for_loop = false;
+       return require_uniform_address_uniform(validated_shader);
+}
+
 static bool
 check_reg_write(struct vc4_validated_shader_info *validated_shader,
                struct vc4_shader_validation_state *validation_state,
@@ -252,14 +367,37 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader,
        uint32_t waddr = (is_mul ?
                          QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
                          QPU_GET_FIELD(inst, QPU_WADDR_ADD));
+       uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+       bool ws = inst & QPU_WS;
+       bool is_b = is_mul ^ ws;
+       u32 lri = waddr_to_live_reg_index(waddr, is_b);
+
+       if (lri != -1) {
+               uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
+               uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);
+
+               if (sig == QPU_SIG_LOAD_IMM &&
+                   QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
+                   ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
+                    (!is_mul && cond_add == QPU_COND_ALWAYS))) {
+                       validation_state->live_immediates[lri] =
+                               QPU_GET_FIELD(inst, QPU_LOAD_IMM);
+               } else {
+                       validation_state->live_immediates[lri] = ~0;
+               }
+       }
 
        switch (waddr) {
        case QPU_W_UNIFORMS_ADDRESS:
-               /* XXX: We'll probably need to support this for reladdr, but
-                * it's definitely a security-related one.
-                */
-               DRM_ERROR("uniforms address load unsupported\n");
-               return false;
+               if (is_b) {
+                       DRM_ERROR("relative uniforms address change "
+                                 "unsupported\n");
+                       return false;
+               }
+
+               return validate_uniform_address_write(validated_shader,
+                                                     validation_state,
+                                                     is_mul);
 
        case QPU_W_TLB_COLOR_MS:
        case QPU_W_TLB_COLOR_ALL:
@@ -407,9 +545,35 @@ check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
 }
 
 static bool
-check_instruction_reads(uint64_t inst,
-                       struct vc4_validated_shader_info *validated_shader)
+check_branch(uint64_t inst,
+            struct vc4_validated_shader_info *validated_shader,
+            struct vc4_shader_validation_state *validation_state,
+            int ip)
+{
+       int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
+       uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+       uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+
+       if ((int)branch_imm < 0)
+               validation_state->needs_uniform_address_for_loop = true;
+
+       /* We don't want to have to worry about validation of this, and
+        * there's no need for it.
+        */
+       if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
+               DRM_ERROR("branch instruction at %d wrote a register.\n",
+                         validation_state->ip);
+               return false;
+       }
+
+       return true;
+}
+
+static bool
+check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
+                       struct vc4_shader_validation_state *validation_state)
 {
+       uint64_t inst = validation_state->shader[validation_state->ip];
        uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
        uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
@@ -421,6 +585,12 @@ check_instruction_reads(uint64_t inst,
                 * already be OOM.
                 */
                validated_shader->uniforms_size += 4;
+
+               if (validation_state->needs_uniform_address_update) {
+                       DRM_ERROR("Uniform read with undefined uniform "
+                                 "address\n");
+                       return false;
+               }
        }
 
        return true;
@@ -517,6 +687,65 @@ vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
        return true;
 }
 
+/* Resets any known state for the shader, used when we may be branched to from
+ * multiple locations in the program (or at shader start).
+ */
+static void
+reset_validation_state(struct vc4_shader_validation_state *validation_state)
+{
+       int i;
+
+       for (i = 0; i < 8; i++)
+               validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;
+
+       for (i = 0; i < LIVE_REG_COUNT; i++) {
+               validation_state->live_min_clamp_offsets[i] = ~0;
+               validation_state->live_max_clamp_regs[i] = false;
+               validation_state->live_immediates[i] = ~0;
+       }
+}
+
+static bool
+texturing_in_progress(struct vc4_shader_validation_state *validation_state)
+{
+       return (validation_state->tmu_write_count[0] != 0 ||
+               validation_state->tmu_write_count[1] != 0);
+}
+
+static bool
+vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
+{
+       uint32_t ip = validation_state->ip;
+
+       if (!test_bit(ip, validation_state->branch_targets))
+               return true;
+
+       if (texturing_in_progress(validation_state)) {
+               DRM_ERROR("Branch target landed during TMU setup\n");
+               return false;
+       }
+
+       /* Reset our live values tracking, since this instruction may have
+        * multiple predecessors.
+        *
+        * One could potentially do analysis to determine that, for
+        * example, all predecessors have a live max clamp in the same
+        * register, but we don't bother with that.
+        */
+       reset_validation_state(validation_state);
+
+       /* Since we've entered a basic block from potentially multiple
+        * predecessors, we need the uniforms address to be updated before any
+        * uniforms are read.  We require that after any branch point, the next
+        * uniform to be loaded is a uniform address offset.  That uniform's
+        * offset will be marked by the uniform address register write
+        * validation, or by a one-off check at the end of the program.
+        */
+       validation_state->needs_uniform_address_update = true;
+
+       return true;
+}
+
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 {
@@ -525,16 +754,12 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
        uint32_t ip;
        struct vc4_validated_shader_info *validated_shader = NULL;
        struct vc4_shader_validation_state validation_state;
-       int i;
 
        memset(&validation_state, 0, sizeof(validation_state));
        validation_state.shader = shader_obj->vaddr;
        validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);
 
-       for (i = 0; i < 8; i++)
-               validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
-       for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
-               validation_state.live_min_clamp_offsets[i] = ~0;
+       reset_validation_state(&validation_state);
 
        validation_state.branch_targets =
                kcalloc(BITS_TO_LONGS(validation_state.max_ip),
@@ -555,6 +780,9 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 
                validation_state.ip = ip;
 
+               if (!vc4_handle_branch_target(&validation_state))
+                       goto fail;
+
                switch (sig) {
                case QPU_SIG_NONE:
                case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -570,7 +798,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                                goto fail;
                        }
 
-                       if (!check_instruction_reads(inst, validated_shader))
+                       if (!check_instruction_reads(validated_shader,
+                                                    &validation_state))
                                goto fail;
 
                        if (sig == QPU_SIG_PROG_END) {
@@ -588,6 +817,11 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                        }
                        break;
 
+               case QPU_SIG_BRANCH:
+                       if (!check_branch(inst, validated_shader,
+                                         &validation_state, ip))
+                               goto fail;
+                       break;
                default:
                        DRM_ERROR("Unsupported QPU signal %d at "
                                  "instruction %d\n", sig, ip);
@@ -608,6 +842,21 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                goto fail;
        }
 
+       /* If we did a backwards branch and we haven't emitted a uniforms
+        * reset since then, we still need the uniforms stream to have the
+        * uniforms address available so that the backwards branch can do its
+        * uniforms reset.
+        *
+        * We could potentially prove that the backwards branch doesn't
+        * contain any uses of uniforms until program exit, but that doesn't
+        * seem to be worth the trouble.
+        */
+       if (validation_state.needs_uniform_address_for_loop) {
+               if (!require_uniform_address_uniform(validated_shader))
+                       goto fail;
+               validated_shader->uniforms_size += 4;
+       }
+
        /* Again, no chance of integer overflow here because the worst case
         * scenario is 8 bytes of uniforms plus handles per 8-byte
         * instruction.