From: Eric Anholt <eric@anholt.net>
Date: Mon, 21 Jul 2014 18:27:35 +0000 (-0700)
Subject: vc4: Rewrite the kernel ABI to support texture uniform relocation.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a8f2bf0f51222a96a49dfb3d6f9b36d3e54d08cd;p=mesa.git

vc4: Rewrite the kernel ABI to support texture uniform relocation.

This required building a shader parser that would walk the program to find
where the texturing-related uniforms are in the uniforms stream.

Note that as of this commit, a new kernel is required for rendering on
actual VC4 hardware (currently the kernel commit is named "drm/vc4: Introduce
shader validation and better command stream validation.", but it is likely to
be squashed as part of an eventual merge of the kernel driver).
---

diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index ee351835896..414a64ab472 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -17,5 +17,6 @@ C_SOURCES := \
 	vc4_screen.c \
 	vc4_simulator.c \
 	vc4_simulator_validate.c \
+	vc4_simulator_validate_shaders.c \
 	vc4_state.c \
 	$()
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index a9fa7ef70f1..08e85ed6312 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -107,6 +107,8 @@ vc4_flush(struct pipe_context *pctx)
         submit.shader_records = vc4->shader_rec.base;
         submit.shader_record_len = vc4->shader_rec.next - vc4->shader_rec.base;
         submit.shader_record_count = vc4->shader_rec_count;
+        submit.uniforms = vc4->uniforms.base;
+        submit.uniforms_len = vc4->uniforms.next - vc4->uniforms.base;
 
         if (!(vc4_debug & VC4_DEBUG_NORAST)) {
                 int ret;
@@ -123,6 +125,7 @@ vc4_flush(struct pipe_context *pctx)
         vc4_reset_cl(&vc4->bcl);
         vc4_reset_cl(&vc4->rcl);
         vc4_reset_cl(&vc4->shader_rec);
+        vc4_reset_cl(&vc4->uniforms);
         vc4_reset_cl(&vc4->bo_handles);
 #ifdef USE_VC4_SIMULATOR
         vc4_reset_cl(&vc4->bo_pointers);
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index ee9ddcfd82b..010727ff4de 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -70,6 +70,7 @@ struct vc4_shader_uniform_info {
         enum quniform_contents *contents;
         uint32_t *data;
         uint32_t count;
+        uint32_t num_texture_samples;
 };
 
 struct vc4_compiled_shader {
@@ -120,6 +121,7 @@ struct vc4_context {
         struct vc4_cl bcl;
         struct vc4_cl rcl;
         struct vc4_cl shader_rec;
+        struct vc4_cl uniforms;
         struct vc4_cl bo_handles;
 #ifdef USE_VC4_SIMULATOR
         struct vc4_cl bo_pointers;
@@ -195,12 +197,11 @@ int vc4_simulator_flush(struct vc4_context *vc4,
                         struct drm_vc4_submit_cl *args,
                         struct vc4_surface *color_surf);
 
-void vc4_get_uniform_bo(struct vc4_context *vc4,
+void vc4_write_uniforms(struct vc4_context *vc4,
                         struct vc4_compiled_shader *shader,
                         struct vc4_constbuf_stateobj *cb,
                         struct vc4_texture_stateobj *texstate,
-                        int shader_index, struct vc4_bo **out_bo,
-                        uint32_t *out_offset);
+                        int shader_index);
 
 void vc4_flush(struct pipe_context *pctx);
 void vc4_emit_state(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index d5628d0d3ca..8559bf3b2fe 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -162,40 +162,38 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
 // Shader Record
 
-        struct vc4_bo *fs_ubo, *vs_ubo, *cs_ubo;
-        uint32_t fs_ubo_offset, vs_ubo_offset, cs_ubo_offset;
-        vc4_get_uniform_bo(vc4, vc4->prog.fs,
+        vc4_write_uniforms(vc4, vc4->prog.fs,
                            &vc4->constbuf[PIPE_SHADER_FRAGMENT],
                            &vc4->fragtex,
-                           0, &fs_ubo, &fs_ubo_offset);
-        vc4_get_uniform_bo(vc4, vc4->prog.vs,
+                           0);
+        vc4_write_uniforms(vc4, vc4->prog.vs,
                            &vc4->constbuf[PIPE_SHADER_VERTEX],
                            &vc4->verttex,
-                           0, &vs_ubo, &vs_ubo_offset);
-        vc4_get_uniform_bo(vc4, vc4->prog.vs,
+                           0);
+        vc4_write_uniforms(vc4, vc4->prog.vs,
                            &vc4->constbuf[PIPE_SHADER_VERTEX],
                            &vc4->verttex,
-                           1, &cs_ubo, &cs_ubo_offset);
+                           1);
 
-        cl_start_shader_reloc(&vc4->shader_rec, 6 + vtx->num_elements);
+        cl_start_shader_reloc(&vc4->shader_rec, 3 + vtx->num_elements);
         cl_u16(&vc4->shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING);
         cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */
         cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs);
         cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0);
-        cl_reloc(vc4, &vc4->shader_rec, fs_ubo, fs_ubo_offset);
+        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
 
         cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */
         cl_u8(&vc4->shader_rec, (1 << vtx->num_elements) - 1); /* vs attribute array bitfield */
         cl_u8(&vc4->shader_rec, 16 * vtx->num_elements); /* vs total attribute size */
         cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0);
-        cl_reloc(vc4, &vc4->shader_rec, vs_ubo, vs_ubo_offset);
+        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
 
         cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */
         cl_u8(&vc4->shader_rec, (1 << vtx->num_elements) - 1); /* cs attribute array bitfield */
         cl_u8(&vc4->shader_rec, 16 * vtx->num_elements); /* vs total attribute size */
         cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo,
                 vc4->prog.vs->coord_shader_offset);
-        cl_reloc(vc4, &vc4->shader_rec, cs_ubo, cs_ubo_offset);
+        cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */
 
         for (int i = 0; i < vtx->num_elements; i++) {
                 struct pipe_vertex_element *elem = &vtx->pipe[i];
diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h
index b958f1d03d0..cc4c735d881 100644
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -74,6 +74,21 @@ struct drm_vc4_submit_cl {
 	 */
 	void __user *shader_records;
 
+	/* Pointer to uniform data and texture handles for the textures
+	 * referenced by the shader.
+	 *
+	 * For each shader state record, there is a set of uniform data in the
+	 * order referenced by the record (FS, VS, then CS).  Each set of
+	 * uniform data has a uint32_t index into bo_handles per texture
+	 * sample operation, in the order the QPU_W_TMUn_S writes appear in
+	 * the program.  Following the texture BO handle indices is the actual
+	 * uniform data.
+	 *
+	 * The individual uniform state blocks don't have sizes passed in,
+	 * because the kernel has to determine the sizes anyway during shader
+	 * code validation.
+	 */
+	void __user *uniforms;
 	void __user *bo_handles;
 
 	/* Size in bytes of the binner command list. */
@@ -84,11 +99,13 @@ struct drm_vc4_submit_cl {
 	uint32_t shader_record_len;
 	/* Number of shader records.
 	 *
-	 * This could just be computed from the contents of shader_records,
-	 * but it keeps the kernel from having to resize various allocations
-	 * it makes.
+	 * This could just be computed from the contents of shader_records and
+	 * the address bits of references to them from the bin CL, but it
+	 * keeps the kernel from having to resize some allocations it makes.
 	 */
 	uint32_t shader_record_count;
+	/** Size in bytes of the uniform state. */
+	uint32_t uniforms_len;
 
 	/* Number of BO handles passed in (size is that times 4). */
 	uint32_t bo_handle_count;
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 15e1ff25b04..b7ed1bf60a0 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -57,6 +57,7 @@ struct tgsi_to_qir {
         enum quniform_contents *uniform_contents;
         uint32_t num_uniforms;
         uint32_t num_outputs;
+        uint32_t num_texture_samples;
 };
 
 struct vc4_key {
@@ -332,6 +333,7 @@ tgsi_to_qir_tex(struct tgsi_to_qir *trans,
                 qir_TEX_S(c, s, sampler_p1);
         }
 
+        trans->num_texture_samples++;
         qir_emit(c, qir_inst(QOP_TEX_RESULT, c->undef, c->undef, c->undef));
 
         for (int i = 0; i < 4; i++) {
@@ -938,6 +940,7 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader,
         uinfo->contents = malloc(count * sizeof(*uinfo->contents));
         memcpy(uinfo->contents, trans->uniform_contents,
                count * sizeof(*uinfo->contents));
+        uinfo->num_texture_samples = trans->num_texture_samples;
 }
 
 static void
@@ -1141,26 +1144,23 @@ static uint32_t translate_wrap(uint32_t p_wrap)
         }
 }
 
-static uint32_t
-get_texture_p0(struct vc4_texture_stateobj *texstate,
-               uint32_t tex_and_sampler)
+static void
+write_texture_p0(struct vc4_context *vc4,
+                 struct vc4_texture_stateobj *texstate,
+                 uint32_t tex_and_sampler)
 {
         uint32_t texi = (tex_and_sampler >> 0) & 0xff;
         struct pipe_sampler_view *texture = texstate->textures[texi];
         struct vc4_resource *rsc = vc4_resource(texture->texture);
 
-        return (texture->u.tex.last_level |
-#if USE_VC4_SIMULATOR
-                simpenrose_hw_addr(rsc->bo->map) /* XXX */
-#else
-                0 /* XXX */
-#endif
-                /* XXX: data type */);
+        cl_reloc(vc4, &vc4->uniforms, rsc->bo,
+                 texture->u.tex.last_level);
 }
 
-static uint32_t
-get_texture_p1(struct vc4_texture_stateobj *texstate,
-               uint32_t tex_and_sampler)
+static void
+write_texture_p1(struct vc4_context *vc4,
+                 struct vc4_texture_stateobj *texstate,
+                 uint32_t tex_and_sampler)
 {
         uint32_t texi = (tex_and_sampler >> 0) & 0xff;
         uint32_t sampi = (tex_and_sampler >> 8) & 0xff;
@@ -1176,14 +1176,15 @@ get_texture_p1(struct vc4_texture_stateobj *texstate,
                 [PIPE_TEX_FILTER_LINEAR] = 0,
         };
 
-        return ((1 << 31) /* XXX: data type */|
-                (texture->texture->height0 << 20) |
-                (texture->texture->width0 << 8) |
-                (imgfilter_map[sampler->mag_img_filter] << 7) |
-                ((imgfilter_map[sampler->min_img_filter] +
-                  mipfilter_map[sampler->min_mip_filter]) << 4) |
-                (translate_wrap(sampler->wrap_t) << 2) |
-                (translate_wrap(sampler->wrap_s) << 0));
+        cl_u32(&vc4->uniforms,
+               (1 << 31) /* XXX: data type */|
+               (texture->texture->height0 << 20) |
+               (texture->texture->width0 << 8) |
+               (imgfilter_map[sampler->mag_img_filter] << 7) |
+               ((imgfilter_map[sampler->min_img_filter] +
+                 mipfilter_map[sampler->min_mip_filter]) << 4) |
+               (translate_wrap(sampler->wrap_t) << 2) |
+               (translate_wrap(sampler->wrap_s) << 0));
 }
 
 static uint32_t
@@ -1203,56 +1204,57 @@ get_texrect_scale(struct vc4_texture_stateobj *texstate,
 }
 
 void
-vc4_get_uniform_bo(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
+vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                    struct vc4_constbuf_stateobj *cb,
                    struct vc4_texture_stateobj *texstate,
-                   int shader_index, struct vc4_bo **out_bo,
-                   uint32_t *out_offset)
+                   int shader_index)
 {
         struct vc4_shader_uniform_info *uinfo = &shader->uniforms[shader_index];
-        struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen,
-                                          MAX2(1, uinfo->count * 4), "ubo");
-        uint32_t *map = vc4_bo_map(ubo);
+        const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
+
+        cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples);
 
         for (int i = 0; i < uinfo->count; i++) {
 
                 switch (uinfo->contents[i]) {
                 case QUNIFORM_CONSTANT:
-                        map[i] = uinfo->data[i];
+                        cl_u32(&vc4->uniforms, uinfo->data[i]);
                         break;
                 case QUNIFORM_UNIFORM:
-                        map[i] = ((uint32_t *)cb->cb[0].user_buffer)[uinfo->data[i]];
+                        cl_u32(&vc4->uniforms,
+                               gallium_uniforms[uinfo->data[i]]);
                         break;
                 case QUNIFORM_VIEWPORT_X_SCALE:
-                        map[i] = fui(vc4->framebuffer.width * 16.0f / 2.0f);
+                        cl_u32(&vc4->uniforms, fui(vc4->framebuffer.width *
+                                                   16.0f / 2.0f));
                         break;
                 case QUNIFORM_VIEWPORT_Y_SCALE:
-                        map[i] = fui(vc4->framebuffer.height * -16.0f / 2.0f);
+                        cl_u32(&vc4->uniforms, fui(vc4->framebuffer.height *
+                                                   -16.0f / 2.0f));
                         break;
 
                 case QUNIFORM_TEXTURE_CONFIG_P0:
-                        map[i] = get_texture_p0(texstate, uinfo->data[i]);
+                        write_texture_p0(vc4, texstate, uinfo->data[i]);
                         break;
 
                 case QUNIFORM_TEXTURE_CONFIG_P1:
-                        map[i] = get_texture_p1(texstate, uinfo->data[i]);
+                        write_texture_p1(vc4, texstate, uinfo->data[i]);
                         break;
 
                 case QUNIFORM_TEXRECT_SCALE_X:
                 case QUNIFORM_TEXRECT_SCALE_Y:
-                        map[i] = get_texrect_scale(texstate,
-                                                   uinfo->contents[i],
-                                                   uinfo->data[i]);
+                        cl_u32(&vc4->uniforms,
+                               get_texrect_scale(texstate,
+                                                 uinfo->contents[i],
+                                                 uinfo->data[i]));
                         break;
                 }
 #if 0
+                uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4);
                 fprintf(stderr, "%p/%d: %d: 0x%08x (%f)\n",
-                        shader, shader_index, i, map[i], uif(map[i]));
+                        shader, shader_index, i, written_val, uif(written_val));
 #endif
         }
-
-        *out_bo = ubo;
-        *out_offset = 0;
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 2b59aa53f5a..0dada687911 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -63,9 +63,9 @@ drm_gem_cma_create(struct drm_device *dev, size_t size)
 }
 
 static int
-vc4_simulator_pin_bos(struct drm_device *dev, struct drm_vc4_submit_cl *args,
-                      struct exec_info *exec)
+vc4_simulator_pin_bos(struct drm_device *dev, struct exec_info *exec)
 {
+        struct drm_vc4_submit_cl *args = exec->args;
         struct vc4_context *vc4 = dev->vc4;
         struct vc4_bo **bos = vc4->bo_pointers.base;
 
@@ -84,8 +84,7 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct drm_vc4_submit_cl *args,
 }
 
 static int
-vc4_simulator_unpin_bos(struct drm_vc4_submit_cl *args,
-                        struct exec_info *exec)
+vc4_simulator_unpin_bos(struct exec_info *exec)
 {
         for (int i = 0; i < exec->bo_count; i++) {
                 struct drm_gem_cma_object *obj = exec->bo[i];
@@ -102,9 +101,9 @@ vc4_simulator_unpin_bos(struct drm_vc4_submit_cl *args,
 }
 
 static int
-vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
-		struct exec_info *exec)
+vc4_cl_validate(struct drm_device *dev, struct exec_info *exec)
 {
+	struct drm_vc4_submit_cl *args = exec->args;
 	void *temp = NULL;
 	void *bin, *render, *shader_rec;
 	int ret = 0;
@@ -112,12 +111,14 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
 	uint32_t render_offset = bin_offset + args->bin_cl_len;
 	uint32_t shader_rec_offset = roundup(render_offset +
 					     args->render_cl_len, 16);
-	uint32_t exec_size = shader_rec_offset + args->shader_record_len;
+	uint32_t uniforms_offset = shader_rec_offset + args->shader_record_len;
+	uint32_t exec_size = uniforms_offset + args->uniforms_len;
 	uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
 					  args->shader_record_count);
 
 	if (shader_rec_offset < render_offset ||
-	    exec_size < shader_rec_offset ||
+	    uniforms_offset < shader_rec_offset ||
+	    exec_size < uniforms_offset ||
 	    args->shader_record_count >= (UINT_MAX /
 					  sizeof(struct vc4_shader_state)) ||
 	    temp_size < exec_size) {
@@ -142,6 +143,7 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
 	bin = temp + bin_offset;
 	render = temp + render_offset;
 	shader_rec = temp + shader_rec_offset;
+	exec->uniforms_u = temp + uniforms_offset;
 	exec->shader_state = temp + exec_size;
 	exec->shader_state_size = args->shader_record_count;
 
@@ -164,6 +166,13 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
 		goto fail;
 	}
 
+	ret = copy_from_user(exec->uniforms_u, args->uniforms,
+			     args->uniforms_len);
+	if (ret) {
+		DRM_ERROR("Failed to copy in uniforms cl\n");
+		goto fail;
+	}
+
 	exec->exec_bo = drm_gem_cma_create(dev, exec_size);
 #if 0
 	if (IS_ERR(exec->exec_bo)) {
@@ -180,6 +189,10 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args,
 	exec->ct1ea = exec->ct1ca + args->render_cl_len;
 	exec->shader_paddr = exec->exec_bo->paddr + shader_rec_offset;
 
+	exec->uniforms_v = exec->exec_bo->vaddr + uniforms_offset;
+	exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset;
+	exec->uniforms_size = args->uniforms_len;
+
 	ret = vc4_validate_cl(dev,
 			      exec->exec_bo->vaddr + bin_offset,
 			      bin,
@@ -243,18 +256,20 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args,
                 }
         }
 
-        ret = vc4_simulator_pin_bos(dev, args, &exec);
+        exec.args = args;
+
+        ret = vc4_simulator_pin_bos(dev, &exec);
         if (ret)
                 return ret;
 
-        ret = vc4_cl_validate(dev, args, &exec);
+        ret = vc4_cl_validate(dev, &exec);
         if (ret)
                 return ret;
 
         simpenrose_do_binning(exec.ct0ca, exec.ct0ea);
         simpenrose_do_rendering(exec.ct1ca, exec.ct1ea);
 
-        ret = vc4_simulator_unpin_bos(args, &exec);
+        ret = vc4_simulator_unpin_bos(&exec);
         if (ret)
                 return ret;
 
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.c b/src/gallium/drivers/vc4/vc4_simulator_validate.c
index 14701b171c7..a67e2345b11 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.c
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.c
@@ -347,6 +347,30 @@ vc4_validate_cl(struct drm_device *dev,
 	return 0;
 }
 
+static bool
+reloc_tex(struct exec_info *exec,
+	  void *uniform_data_u,
+	  struct vc4_texture_sample_info *sample,
+	  uint32_t texture_handle_index)
+
+{
+	struct drm_gem_cma_object *tex;
+	uint32_t unvalidated_p0 = *(uint32_t *)(uniform_data_u +
+						sample->p_offset[0]);
+	uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0];
+
+	if (texture_handle_index >= exec->bo_count) {
+		DRM_ERROR("texture handle index %d >= %d\n",
+			  texture_handle_index, exec->bo_count);
+		return false;
+	}
+	tex = exec->bo[texture_handle_index];
+
+	*validated_p0 = tex->paddr + unvalidated_p0;
+
+	return true;
+}
+
 static int
 validate_shader_rec(struct drm_device *dev,
 		    struct exec_info *exec,
@@ -358,45 +382,54 @@ validate_shader_rec(struct drm_device *dev,
 	uint32_t *src_handles = unvalidated;
 	void *src_pkt;
 	void *dst_pkt = validated;
-	static const int gl_bo_offsets[] = {
-		4, 8, /* fs code, ubo */
-		16, 20, /* vs code, ubo */
-		28, 32, /* cs code, ubo */
+	enum shader_rec_reloc_type {
+		RELOC_CODE,
+		RELOC_VBO,
+	};
+	struct shader_rec_reloc {
+		enum shader_rec_reloc_type type;
+		uint32_t offset;
+	};
+	static const struct shader_rec_reloc gl_relocs[] = {
+		{ RELOC_CODE, 4 },  /* fs */
+		{ RELOC_CODE, 16 }, /* vs */
+		{ RELOC_CODE, 28 }, /* cs */
 	};
-	static const int nv_bo_offsets[] = {
-		4, 8, /* fs code, ubo */
-		12, /* vbo */
+	static const struct shader_rec_reloc nv_relocs[] = {
+		{ RELOC_CODE, 4 }, /* fs */
+		{ RELOC_VBO, 12 }
 	};
-	struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_bo_offsets) + 8];
-	const int *bo_offsets;
-	uint32_t nr_attributes = 0, nr_bo, packet_size;
+	const struct shader_rec_reloc *relocs;
+	struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8];
+	uint32_t nr_attributes = 0, nr_relocs, packet_size;
 	int i;
+	struct vc4_validated_shader_info *validated_shader = NULL;
 
 	if (state->packet == VC4_PACKET_NV_SHADER_STATE) {
-		bo_offsets = nv_bo_offsets;
-		nr_bo = ARRAY_SIZE(nv_bo_offsets);
+		relocs = nv_relocs;
+		nr_relocs = ARRAY_SIZE(nv_relocs);
 
 		packet_size = 16;
 	} else {
-		bo_offsets = gl_bo_offsets;
-		nr_bo = ARRAY_SIZE(gl_bo_offsets);
+		relocs = gl_relocs;
+		nr_relocs = ARRAY_SIZE(gl_relocs);
 
 		nr_attributes = state->addr & 0x7;
 		if (nr_attributes == 0)
 			nr_attributes = 8;
 		packet_size = 36 + nr_attributes * 8;
 	}
-	if ((nr_bo + nr_attributes) * 4 + packet_size > len) {
+	if ((nr_relocs + nr_attributes) * 4 + packet_size > len) {
 		DRM_ERROR("overflowed shader packet read "
 			  "(handles %d, packet %d, len %d)\n",
-			  (nr_bo + nr_attributes) * 4, packet_size, len);
+			  (nr_relocs + nr_attributes) * 4, packet_size, len);
 		return -EINVAL;
 	}
 
-	src_pkt = unvalidated + 4 * (nr_bo + nr_attributes);
+	src_pkt = unvalidated + 4 * (nr_relocs + nr_attributes);
 	memcpy(dst_pkt, src_pkt, packet_size);
 
-	for (i = 0; i < nr_bo + nr_attributes; i++) {
+	for (i = 0; i < nr_relocs + nr_attributes; i++) {
 		if (src_handles[i] >= exec->bo_count) {
 			DRM_ERROR("shader rec bo index %d > %d\n",
 				  src_handles[i], exec->bo_count);
@@ -405,21 +438,73 @@ validate_shader_rec(struct drm_device *dev,
 		bo[i] = exec->bo[src_handles[i]];
 	}
 
-	for (i = 0; i < nr_bo; i++) {
-		/* XXX: validation */
-		uint32_t o = bo_offsets[i];
-		*(uint32_t *)(dst_pkt + o) =
-			bo[i]->paddr + *(uint32_t *)(src_pkt + o);
+	for (i = 0; i < nr_relocs; i++) {
+		uint32_t o = relocs[i].offset;
+		uint32_t src_offset = *(uint32_t *)(src_pkt + o);
+		*(uint32_t *)(dst_pkt + o) = bo[i]->paddr + src_offset;
+		uint32_t *texture_handles_u;
+		void *uniform_data_u;
+		uint32_t tex;
+
+		switch (relocs[i].type) {
+		case RELOC_CODE:
+			kfree(validated_shader);
+			validated_shader = vc4_validate_shader(bo[i],
+							       src_offset);
+			if (!validated_shader)
+				goto fail;
+
+			if (validated_shader->uniforms_src_size >
+			    exec->uniforms_size) {
+				DRM_ERROR("Uniforms src buffer overflow\n");
+				goto fail;
+			}
+
+			texture_handles_u = exec->uniforms_u;
+			uniform_data_u = (texture_handles_u +
+					  validated_shader->num_texture_samples);
+
+			memcpy(exec->uniforms_v, uniform_data_u,
+			       validated_shader->uniforms_size);
+
+			for (tex = 0;
+			     tex < validated_shader->num_texture_samples;
+			     tex++) {
+				if (!reloc_tex(exec,
+					       uniform_data_u,
+					       &validated_shader->texture_samples[tex],
+					       texture_handles_u[tex])) {
+					goto fail;
+				}
+			}
+
+			*(uint32_t *)(dst_pkt + o + 4) = exec->uniforms_p;
+
+			exec->uniforms_u += validated_shader->uniforms_src_size;
+			exec->uniforms_v += validated_shader->uniforms_size;
+			exec->uniforms_p += validated_shader->uniforms_size;
+
+			break;
+
+		case RELOC_VBO:
+			break;
+		}
 	}
 
 	for (i = 0; i < nr_attributes; i++) {
 		/* XXX: validation */
 		uint32_t o = 36 + i * 8;
 		*(uint32_t *)(dst_pkt + o) =
-			bo[nr_bo + i]->paddr + *(uint32_t *)(src_pkt + o);
+			bo[nr_relocs + i]->paddr + *(uint32_t *)(src_pkt + o);
 	}
 
+	kfree(validated_shader);
+
 	return 0;
+
+fail:
+	kfree(validated_shader);
+	return -EINVAL;
 }
 
 int
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 4a2a2181ab4..885a754a9d5 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -26,15 +26,20 @@
 
 #include <stdbool.h>
 #include <string.h>
+#include <stdlib.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <errno.h>
 
+#include "vc4_context.h"
+#include "vc4_qpu_defines.h"
+
 #define DRM_INFO(...) fprintf(stderr, __VA_ARGS__)
 #define DRM_ERROR(...) fprintf(stderr, __VA_ARGS__)
 #define kmalloc(size, arg) malloc(size)
+#define kcalloc(size, count, arg) calloc(size, count)
 #define kfree(ptr) free(ptr)
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#define krealloc(ptr, size, args) realloc(ptr, size)
 #define roundup(x, y) align(x, y)
 
 static inline int
@@ -64,6 +69,9 @@ struct drm_gem_cma_object {
 };
 
 struct exec_info {
+	/* Kernel-space copy of the ioctl arguments */
+	struct drm_vc4_submit_cl *args;
+
 	/* This is the array of BOs that were looked up at the start of exec.
 	 * Command validation will use indices into this array.
 	 */
@@ -79,9 +87,8 @@ struct exec_info {
 	uint32_t bo_index[2];
 	uint32_t max_width, max_height;
 
-	/**
-	 * This is the BO where we store the validated command lists
-	 * and shader records.
+	/* This is the BO where we store the validated command lists, shader
+	 * records, and uniforms.
 	 */
 	struct drm_gem_cma_object *exec_bo;
 
@@ -108,6 +115,50 @@ struct exec_info {
 	uint32_t ct0ca, ct0ea;
 	uint32_t ct1ca, ct1ea;
 	uint32_t shader_paddr;
+
+	/* Pointers to the uniform data.  These pointers are incremented, and
+	 * size decremented, as each batch of uniforms is uploaded.
+	 */
+	void *uniforms_u;
+	void *uniforms_v;
+	uint32_t uniforms_p;
+	uint32_t uniforms_size;
+};
+
+/**
+ * struct vc4_texture_sample_info - saves the offsets into the UBO for texture
+ * setup parameters.
+ *
+ * This will be used at draw time to relocate the reference to the texture
+ * contents in p0, and validate that the offset combined with
+ * width/height/stride/etc. from p1 and p2/p3 doesn't sample outside the BO.
+ * Note that the hardware treats unprovided config parameters as 0, so not all
+ * of them need to be set up for every texture sample, and we'll store ~0 as
+ * the offset to mark the unused ones.
+ *
+ * See the VC4 3D architecture guide page 41 ("Texture and Memory Lookup Unit
+ * Setup") for definitions of the texture parameters.
+ */
+struct vc4_texture_sample_info {
+	uint32_t p_offset[4];
+};
+
+/**
+ * struct vc4_validated_shader_info - information about validated shaders that
+ * needs to be used from command list validation.
+ *
+ * For a given shader, each time a shader state record references it, we need
+ * to verify that the shader doesn't read more uniforms than the shader state
+ * record's uniform BO pointer can provide, and we need to apply relocations
+ * and validate the shader state record's uniforms that define the texture
+ * samples.
+ */
+struct vc4_validated_shader_info
+{
+	uint32_t uniforms_size;
+	uint32_t uniforms_src_size;
+	uint32_t num_texture_samples;
+	struct vc4_texture_sample_info *texture_samples;
 };
 
 int vc4_validate_cl(struct drm_device *dev,
@@ -123,4 +174,8 @@ int vc4_validate_shader_recs(struct drm_device *dev,
                              uint32_t len,
                              struct exec_info *exec);
 
+struct vc4_validated_shader_info *
+vc4_validate_shader(struct drm_gem_cma_object *shader_obj,
+                    uint32_t start_offset);
+
 #endif /* VC4_SIMULATOR_VALIDATE_H */
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate_shaders.c b/src/gallium/drivers/vc4/vc4_simulator_validate_shaders.c
new file mode 100644
index 00000000000..c02deb406c7
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate_shaders.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Shader validator for VC4.
+ *
+ * The VC4 has no IOMMU between it and system memory.  So, a user with access
+ * to execute shaders could escalate privilege by overwriting system memory
+ * (using the VPM write address register in the general-purpose DMA mode) or
+ * reading system memory it shouldn't (reading it as a texture, or uniform
+ * data, or vertex data).
+ *
+ * This walks over a shader starting from some offset within a BO, ensuring
+ * that its accesses are appropriately bounded, and recording how many texture
+ * accesses are made and where so that we can do relocations for them in the
+ * uniform stream.
+ *
+ * The kernel API has shaders stored in user-mapped BOs.  The BOs will be
+ * forcibly unmapped from the process before validation, and any cache of
+ * validated state will be flushed if the mapping is faulted back in.
+ *
+ * Storing the shaders in BOs means that the validation process will be slow
+ * due to uncached reads, but since shaders are long-lived and shader BOs are
+ * never actually modified, this shouldn't be a problem.
+ */
+
+#include "vc4_simulator_validate.h"
+#include "vc4_qpu.h"
+#include "vc4_qpu_defines.h"
+
+struct vc4_shader_validation_state {
+	struct vc4_texture_sample_info tmu_setup[2]; /* pending offsets per TMU */
+	int tmu_write_count[2]; /* params written per TMU; reset on S write */
+};
+
+/* True when waddr lies in the QPU_W_TMU0_S..QPU_W_TMU1_B address range. */
+static bool
+is_tmu_write(uint32_t waddr)
+{
+	return !(waddr < QPU_W_TMU0_S || waddr > QPU_W_TMU1_B);
+}
+
+/* Returns whether a shader may write to waddr, logging each refusal.
+ * is_b is accepted but not examined by any of the cases below.
+ */
+static bool
+check_register_write(uint32_t waddr, bool is_b)
+{
+	bool allowed = true;
+
+	switch (waddr) {
+	case QPU_W_UNIFORMS_ADDRESS:
+		/* XXX: Likely needed for reladdr some day, but this one is
+		 * definitely security-related, so refuse it for now.
+		 */
+		DRM_ERROR("uniforms address load unsupported\n");
+		allowed = false;
+		break;
+
+	case QPU_W_VPM_ADDR:
+		DRM_ERROR("General VPM DMA unsupported\n");
+		allowed = false;
+		break;
+
+	case QPU_W_HOST_INT:
+	case QPU_W_TMU_NOSWAP:
+	case QPU_W_TLB_STENCIL_SETUP:
+	case QPU_W_TLB_ALPHA_MASK:
+	case QPU_W_MUTEX_RELEASE:
+		/* XXX: Not thought through yet, so unsupported for now. */
+		DRM_ERROR("Unsupported waddr %d\n", waddr);
+		allowed = false;
+		break;
+
+	case QPU_W_TLB_COLOR_MS:
+	case QPU_W_TLB_COLOR_ALL:
+	case QPU_W_TLB_Z:
+		/* XXX: Should be matched against the buffers set up by the
+		 * config packets; let through to get things up and running.
+		 */
+		break;
+
+	case QPU_W_TMU0_S:
+	case QPU_W_TMU0_T:
+	case QPU_W_TMU0_R:
+	case QPU_W_TMU0_B:
+	case QPU_W_TMU1_S:
+	case QPU_W_TMU1_T:
+	case QPU_W_TMU1_R:
+	case QPU_W_TMU1_B:
+		/* XXX: Texture uniform contents are not validated here yet. */
+		break;
+
+	case QPU_W_VPM:
+	case QPU_W_VPMVCD_SETUP:
+		/* VPM setup, DMA configuration included, is harmless alone:
+		 * only QPU_W_VPM_ADDR writes can trigger the unsafe DMA.
+		 */
+		break;
+	}
+
+	return allowed;
+}
+
+/* Appends the offsets gathered for one TMU to the shader's texture_samples
+ * array, padding unwritten slots with ~0.  False means krealloc() failed.
+ */
+static bool
+record_validated_texture_sample(struct vc4_validated_shader_info *validated_shader,
+				struct vc4_shader_validation_state *validation_state,
+				int tmu)
+{
+	const uint32_t index = validated_shader->num_texture_samples;
+	const int written = validation_state->tmu_write_count[tmu];
+	struct vc4_texture_sample_info *grown;
+	int p;
+
+	grown = krealloc(validated_shader->texture_samples,
+			 (index + 1) * sizeof(*grown), GFP_KERNEL);
+	if (!grown)
+		return false;
+
+	for (p = 0; p < 4; p++)
+		grown[index].p_offset[p] = p < written ?
+			validation_state->tmu_setup[tmu].p_offset[p] : ~0;
+
+	validated_shader->texture_samples = grown;
+	validated_shader->num_texture_samples = index + 1;
+	return true;
+}
+
+/* Notes the uniforms offset of each TMU parameter write and, once the S
+ * register is written, stores the finished sample and restarts the count.
+ */
+static bool
+check_tmu_writes(uint64_t inst,
+		 struct vc4_validated_shader_info *validated_shader,
+		 struct vc4_shader_validation_state *validation_state,
+		 uint32_t waddr)
+{
+	int tmu = waddr > QPU_W_TMU0_B ? 1 : 0;
+	int *count = &validation_state->tmu_write_count[tmu];
+
+	if (!is_tmu_write(waddr))
+		return true;
+	if (*count >= 4) {
+		DRM_ERROR("TMU%d got too many parameters before dispatch\n",
+			  tmu);
+		return false;
+	}
+
+	validation_state->tmu_setup[tmu].p_offset[(*count)++] =
+		validated_shader->uniforms_size;
+	validated_shader->uniforms_size += 4;
+
+	if (waddr != QPU_W_TMU0_S && waddr != QPU_W_TMU1_S)
+		return true;
+	if (!record_validated_texture_sample(validated_shader,
+					     validation_state, tmu))
+		return false;
+	*count = 0;
+	return true;
+}
+
+/* Checks the ADD and MUL write destinations of one instruction: TMU
+ * bookkeeping for each first, then the per-register allow list.
+ */
+static bool
+check_instruction_writes(uint64_t inst,
+			 struct vc4_validated_shader_info *validated_shader,
+			 struct vc4_shader_validation_state *validation_state)
+{
+	uint32_t add_dst = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t mul_dst = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+	bool ws = inst & QPU_WS;
+
+	if (is_tmu_write(add_dst) && is_tmu_write(mul_dst)) {
+		DRM_ERROR("ADD and MUL both set up textures\n");
+		return false;
+	}
+
+	if (!check_tmu_writes(inst, validated_shader, validation_state,
+			      add_dst) ||
+	    !check_tmu_writes(inst, validated_shader, validation_state,
+			      mul_dst))
+		return false;
+
+	if (!check_register_write(add_dst, ws))
+		return false;
+	return check_register_write(mul_dst, !ws);
+}
+
+/* Adds 4 to uniforms_size when the instruction reads QPU_R_UNIF on either
+ * read port, and rejects such a read alongside a TMU setup write.
+ */
+static bool
+check_instruction_reads(uint64_t inst,
+			struct vc4_validated_shader_info *validated_shader)
+{
+	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
+	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
+
+	if (raddr_a != QPU_R_UNIF && raddr_b != QPU_R_UNIF)
+		return true;
+
+	if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul)) {
+		DRM_ERROR("uniform read in the same instruction as "
+			  "texture setup\n");
+		return false;
+	}
+
+	/* Cannot overflow: at most 4 bytes per 8-byte instruction. */
+	validated_shader->uniforms_size += 4;
+
+	return true;
+}
+
+/* Validates the shader found start_offset bytes into shader_obj, counting
+ * the uniforms and texture samples it uses.  Returns a new info struct, or
+ * NULL when the shader is rejected or an allocation fails.
+ */
+struct vc4_validated_shader_info *
+vc4_validate_shader(struct drm_gem_cma_object *shader_obj,
+		    uint32_t start_offset)
+{
+	bool found_shader_end = false;
+	int shader_end_ip = 0;
+	uint32_t ip, max_ip;
+	uint64_t *shader;
+	struct vc4_validated_shader_info *validated_shader;
+	struct vc4_shader_validation_state validation_state;
+
+	memset(&validation_state, 0, sizeof(validation_state));
+
+	/* Compare against size - 8 instead of adding to start_offset, so the
+	 * sum cannot wrap around where size_t is only 32 bits wide.
+	 */
+	if (shader_obj->base.size < sizeof(uint64_t) ||
+	    start_offset > shader_obj->base.size - sizeof(uint64_t)) {
+		DRM_ERROR("shader starting at %d outside of BO sized %d\n",
+			  start_offset, shader_obj->base.size);
+		return NULL;
+	}
+	shader = shader_obj->vaddr + start_offset;
+	max_ip = (shader_obj->base.size - start_offset) / sizeof(uint64_t);
+
+	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
+	if (!validated_shader)
+		return NULL;
+
+	for (ip = 0; ip < max_ip; ip++) {
+		uint64_t inst = shader[ip];
+		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+		switch (sig) {
+		case QPU_SIG_NONE:
+		case QPU_SIG_WAIT_FOR_SCOREBOARD:
+		case QPU_SIG_SCOREBOARD_UNLOCK:
+		case QPU_SIG_LOAD_TMU0:
+		case QPU_SIG_LOAD_TMU1:
+			if (!check_instruction_writes(inst, validated_shader,
+						      &validation_state)) {
+				DRM_ERROR("Bad write at ip %d\n", ip);
+				goto fail;
+			}
+			if (!check_instruction_reads(inst, validated_shader))
+				goto fail;
+			break;
+
+		case QPU_SIG_LOAD_IMM:
+			if (!check_instruction_writes(inst, validated_shader,
+						      &validation_state)) {
+				DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip);
+				goto fail;
+			}
+			break;
+
+		case QPU_SIG_PROG_END:
+			found_shader_end = true;
+			shader_end_ip = ip;
+			break;
+
+		default:
+			DRM_ERROR("Unsupported QPU signal %d at "
+				  "instruction %d\n", sig, ip);
+			goto fail;
+		}
+
+		/* Two delay slots still execute after the end signal. */
+		if (found_shader_end && ip == shader_end_ip + 2)
+			break;
+	}
+
+	if (ip == max_ip) {
+		DRM_ERROR("shader starting at %d failed to terminate before "
+			  "shader BO end at %d\n",
+			  start_offset, shader_obj->base.size);
+		goto fail;
+	}
+
+	/* No integer overflow here either: the worst case is 8 bytes of
+	 * uniforms plus handles per 8-byte instruction.
+	 */
+	validated_shader->uniforms_src_size = validated_shader->uniforms_size +
+		4 * validated_shader->num_texture_samples;
+	return validated_shader;
+
+fail:
+	kfree(validated_shader->texture_samples); /* else leaked on reject */
+	kfree(validated_shader);
+	return NULL;
+}