From: Eric Anholt Date: Mon, 21 Jul 2014 18:27:35 +0000 (-0700) Subject: vc4: Rewrite the kernel ABI to support texture uniform relocation. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a8f2bf0f51222a96a49dfb3d6f9b36d3e54d08cd;p=mesa.git vc4: Rewrite the kernel ABI to support texture uniform relocation. This required building a shader parser that would walk the program to find where the texturing-related uniforms are in the uniforms stream. Note that as of this commit, a new kernel is required for rendering on actual VC4 hardware (currently that commit is named "drm/vc4: Introduce shader validation and better command stream validation.", but is likely to be squashed as part of an eventual merge of the kernel driver). --- diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index ee351835896..414a64ab472 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -17,5 +17,6 @@ C_SOURCES := \ vc4_screen.c \ vc4_simulator.c \ vc4_simulator_validate.c \ + vc4_simulator_validate_shaders.c \ vc4_state.c \ $() diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index a9fa7ef70f1..08e85ed6312 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -107,6 +107,8 @@ vc4_flush(struct pipe_context *pctx) submit.shader_records = vc4->shader_rec.base; submit.shader_record_len = vc4->shader_rec.next - vc4->shader_rec.base; submit.shader_record_count = vc4->shader_rec_count; + submit.uniforms = vc4->uniforms.base; + submit.uniforms_len = vc4->uniforms.next - vc4->uniforms.base; if (!(vc4_debug & VC4_DEBUG_NORAST)) { int ret; @@ -123,6 +125,7 @@ vc4_flush(struct pipe_context *pctx) vc4_reset_cl(&vc4->bcl); vc4_reset_cl(&vc4->rcl); vc4_reset_cl(&vc4->shader_rec); + vc4_reset_cl(&vc4->uniforms); vc4_reset_cl(&vc4->bo_handles); #ifdef USE_VC4_SIMULATOR vc4_reset_cl(&vc4->bo_pointers); diff --git 
a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index ee9ddcfd82b..010727ff4de 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -70,6 +70,7 @@ struct vc4_shader_uniform_info { enum quniform_contents *contents; uint32_t *data; uint32_t count; + uint32_t num_texture_samples; }; struct vc4_compiled_shader { @@ -120,6 +121,7 @@ struct vc4_context { struct vc4_cl bcl; struct vc4_cl rcl; struct vc4_cl shader_rec; + struct vc4_cl uniforms; struct vc4_cl bo_handles; #ifdef USE_VC4_SIMULATOR struct vc4_cl bo_pointers; @@ -195,12 +197,11 @@ int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args, struct vc4_surface *color_surf); -void vc4_get_uniform_bo(struct vc4_context *vc4, +void vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_constbuf_stateobj *cb, struct vc4_texture_stateobj *texstate, - int shader_index, struct vc4_bo **out_bo, - uint32_t *out_offset); + int shader_index); void vc4_flush(struct pipe_context *pctx); void vc4_emit_state(struct pipe_context *pctx); diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index d5628d0d3ca..8559bf3b2fe 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -162,40 +162,38 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) // Shader Record - struct vc4_bo *fs_ubo, *vs_ubo, *cs_ubo; - uint32_t fs_ubo_offset, vs_ubo_offset, cs_ubo_offset; - vc4_get_uniform_bo(vc4, vc4->prog.fs, + vc4_write_uniforms(vc4, vc4->prog.fs, &vc4->constbuf[PIPE_SHADER_FRAGMENT], &vc4->fragtex, - 0, &fs_ubo, &fs_ubo_offset); - vc4_get_uniform_bo(vc4, vc4->prog.vs, + 0); + vc4_write_uniforms(vc4, vc4->prog.vs, &vc4->constbuf[PIPE_SHADER_VERTEX], &vc4->verttex, - 0, &vs_ubo, &vs_ubo_offset); - vc4_get_uniform_bo(vc4, vc4->prog.vs, + 0); + vc4_write_uniforms(vc4, vc4->prog.vs, &vc4->constbuf[PIPE_SHADER_VERTEX], 
&vc4->verttex, - 1, &cs_ubo, &cs_ubo_offset); + 1); - cl_start_shader_reloc(&vc4->shader_rec, 6 + vtx->num_elements); + cl_start_shader_reloc(&vc4->shader_rec, 3 + vtx->num_elements); cl_u16(&vc4->shader_rec, VC4_SHADER_FLAG_ENABLE_CLIPPING); cl_u8(&vc4->shader_rec, 0); /* fs num uniforms (unused) */ cl_u8(&vc4->shader_rec, vc4->prog.fs->num_inputs); cl_reloc(vc4, &vc4->shader_rec, vc4->prog.fs->bo, 0); - cl_reloc(vc4, &vc4->shader_rec, fs_ubo, fs_ubo_offset); + cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ cl_u16(&vc4->shader_rec, 0); /* vs num uniforms */ cl_u8(&vc4->shader_rec, (1 << vtx->num_elements) - 1); /* vs attribute array bitfield */ cl_u8(&vc4->shader_rec, 16 * vtx->num_elements); /* vs total attribute size */ cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, 0); - cl_reloc(vc4, &vc4->shader_rec, vs_ubo, vs_ubo_offset); + cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ cl_u16(&vc4->shader_rec, 0); /* cs num uniforms */ cl_u8(&vc4->shader_rec, (1 << vtx->num_elements) - 1); /* cs attribute array bitfield */ cl_u8(&vc4->shader_rec, 16 * vtx->num_elements); /* vs total attribute size */ cl_reloc(vc4, &vc4->shader_rec, vc4->prog.vs->bo, vc4->prog.vs->coord_shader_offset); - cl_reloc(vc4, &vc4->shader_rec, cs_ubo, cs_ubo_offset); + cl_u32(&vc4->shader_rec, 0); /* UBO offset written by kernel */ for (int i = 0; i < vtx->num_elements; i++) { struct pipe_vertex_element *elem = &vtx->pipe[i]; diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h index b958f1d03d0..cc4c735d881 100644 --- a/src/gallium/drivers/vc4/vc4_drm.h +++ b/src/gallium/drivers/vc4/vc4_drm.h @@ -74,6 +74,21 @@ struct drm_vc4_submit_cl { */ void __user *shader_records; + /* Pointer to uniform data and texture handles for the textures + * referenced by the shader. + * + * For each shader state record, there is a set of uniform data in the + * order referenced by the record (FS, VS, then CS). 
Each set of + * uniform data has a uint32_t index into bo_handles per texture + * sample operation, in the order the QPU_W_TMUn_S writes appear in + * the program. Following the texture BO handle indices is the actual + * uniform data. + * + * The individual uniform state blocks don't have sizes passed in, + * because the kernel has to determine the sizes anyway during shader + * code validation. + */ + void __user *uniforms; void __user *bo_handles; /* Size in bytes of the binner command list. */ @@ -84,11 +99,13 @@ struct drm_vc4_submit_cl { uint32_t shader_record_len; /* Number of shader records. * - * This could just be computed from the contents of shader_records, - * but it keeps the kernel from having to resize various allocations - * it makes. + * This could just be computed from the contents of shader_records and + * the address bits of references to them from the bin CL, but it + * keeps the kernel from having to resize some allocations it makes. */ uint32_t shader_record_count; + /** Size in bytes of the uniform state. */ + uint32_t uniforms_len; /* Number of BO handles passed in (size is that times 4). 
*/ uint32_t bo_handle_count; diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 15e1ff25b04..b7ed1bf60a0 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -57,6 +57,7 @@ struct tgsi_to_qir { enum quniform_contents *uniform_contents; uint32_t num_uniforms; uint32_t num_outputs; + uint32_t num_texture_samples; }; struct vc4_key { @@ -332,6 +333,7 @@ tgsi_to_qir_tex(struct tgsi_to_qir *trans, qir_TEX_S(c, s, sampler_p1); } + trans->num_texture_samples++; qir_emit(c, qir_inst(QOP_TEX_RESULT, c->undef, c->undef, c->undef)); for (int i = 0; i < 4; i++) { @@ -938,6 +940,7 @@ copy_uniform_state_to_shader(struct vc4_compiled_shader *shader, uinfo->contents = malloc(count * sizeof(*uinfo->contents)); memcpy(uinfo->contents, trans->uniform_contents, count * sizeof(*uinfo->contents)); + uinfo->num_texture_samples = trans->num_texture_samples; } static void @@ -1141,26 +1144,23 @@ static uint32_t translate_wrap(uint32_t p_wrap) } } -static uint32_t -get_texture_p0(struct vc4_texture_stateobj *texstate, - uint32_t tex_and_sampler) +static void +write_texture_p0(struct vc4_context *vc4, + struct vc4_texture_stateobj *texstate, + uint32_t tex_and_sampler) { uint32_t texi = (tex_and_sampler >> 0) & 0xff; struct pipe_sampler_view *texture = texstate->textures[texi]; struct vc4_resource *rsc = vc4_resource(texture->texture); - return (texture->u.tex.last_level | -#if USE_VC4_SIMULATOR - simpenrose_hw_addr(rsc->bo->map) /* XXX */ -#else - 0 /* XXX */ -#endif - /* XXX: data type */); + cl_reloc(vc4, &vc4->uniforms, rsc->bo, + texture->u.tex.last_level); } -static uint32_t -get_texture_p1(struct vc4_texture_stateobj *texstate, - uint32_t tex_and_sampler) +static void +write_texture_p1(struct vc4_context *vc4, + struct vc4_texture_stateobj *texstate, + uint32_t tex_and_sampler) { uint32_t texi = (tex_and_sampler >> 0) & 0xff; uint32_t sampi = (tex_and_sampler >> 8) & 0xff; @@ -1176,14 +1176,15 
@@ get_texture_p1(struct vc4_texture_stateobj *texstate, [PIPE_TEX_FILTER_LINEAR] = 0, }; - return ((1 << 31) /* XXX: data type */| - (texture->texture->height0 << 20) | - (texture->texture->width0 << 8) | - (imgfilter_map[sampler->mag_img_filter] << 7) | - ((imgfilter_map[sampler->min_img_filter] + - mipfilter_map[sampler->min_mip_filter]) << 4) | - (translate_wrap(sampler->wrap_t) << 2) | - (translate_wrap(sampler->wrap_s) << 0)); + cl_u32(&vc4->uniforms, + (1 << 31) /* XXX: data type */| + (texture->texture->height0 << 20) | + (texture->texture->width0 << 8) | + (imgfilter_map[sampler->mag_img_filter] << 7) | + ((imgfilter_map[sampler->min_img_filter] + + mipfilter_map[sampler->min_mip_filter]) << 4) | + (translate_wrap(sampler->wrap_t) << 2) | + (translate_wrap(sampler->wrap_s) << 0)); } static uint32_t @@ -1203,56 +1204,57 @@ get_texrect_scale(struct vc4_texture_stateobj *texstate, } void -vc4_get_uniform_bo(struct vc4_context *vc4, struct vc4_compiled_shader *shader, +vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader, struct vc4_constbuf_stateobj *cb, struct vc4_texture_stateobj *texstate, - int shader_index, struct vc4_bo **out_bo, - uint32_t *out_offset) + int shader_index) { struct vc4_shader_uniform_info *uinfo = &shader->uniforms[shader_index]; - struct vc4_bo *ubo = vc4_bo_alloc(vc4->screen, - MAX2(1, uinfo->count * 4), "ubo"); - uint32_t *map = vc4_bo_map(ubo); + const uint32_t *gallium_uniforms = cb->cb[0].user_buffer; + + cl_start_shader_reloc(&vc4->uniforms, uinfo->num_texture_samples); for (int i = 0; i < uinfo->count; i++) { switch (uinfo->contents[i]) { case QUNIFORM_CONSTANT: - map[i] = uinfo->data[i]; + cl_u32(&vc4->uniforms, uinfo->data[i]); break; case QUNIFORM_UNIFORM: - map[i] = ((uint32_t *)cb->cb[0].user_buffer)[uinfo->data[i]]; + cl_u32(&vc4->uniforms, + gallium_uniforms[uinfo->data[i]]); break; case QUNIFORM_VIEWPORT_X_SCALE: - map[i] = fui(vc4->framebuffer.width * 16.0f / 2.0f); + cl_u32(&vc4->uniforms, 
fui(vc4->framebuffer.width * + 16.0f / 2.0f)); break; case QUNIFORM_VIEWPORT_Y_SCALE: - map[i] = fui(vc4->framebuffer.height * -16.0f / 2.0f); + cl_u32(&vc4->uniforms, fui(vc4->framebuffer.height * + -16.0f / 2.0f)); break; case QUNIFORM_TEXTURE_CONFIG_P0: - map[i] = get_texture_p0(texstate, uinfo->data[i]); + write_texture_p0(vc4, texstate, uinfo->data[i]); break; case QUNIFORM_TEXTURE_CONFIG_P1: - map[i] = get_texture_p1(texstate, uinfo->data[i]); + write_texture_p1(vc4, texstate, uinfo->data[i]); break; case QUNIFORM_TEXRECT_SCALE_X: case QUNIFORM_TEXRECT_SCALE_Y: - map[i] = get_texrect_scale(texstate, - uinfo->contents[i], - uinfo->data[i]); + cl_u32(&vc4->uniforms, + get_texrect_scale(texstate, + uinfo->contents[i], + uinfo->data[i])); break; } #if 0 + uint32_t written_val = *(uint32_t *)(vc4->uniforms.next - 4); fprintf(stderr, "%p/%d: %d: 0x%08x (%f)\n", - shader, shader_index, i, map[i], uif(map[i])); + shader, shader_index, i, written_val, uif(written_val)); #endif } - - *out_bo = ubo; - *out_offset = 0; } static void diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c index 2b59aa53f5a..0dada687911 100644 --- a/src/gallium/drivers/vc4/vc4_simulator.c +++ b/src/gallium/drivers/vc4/vc4_simulator.c @@ -63,9 +63,9 @@ drm_gem_cma_create(struct drm_device *dev, size_t size) } static int -vc4_simulator_pin_bos(struct drm_device *dev, struct drm_vc4_submit_cl *args, - struct exec_info *exec) +vc4_simulator_pin_bos(struct drm_device *dev, struct exec_info *exec) { + struct drm_vc4_submit_cl *args = exec->args; struct vc4_context *vc4 = dev->vc4; struct vc4_bo **bos = vc4->bo_pointers.base; @@ -84,8 +84,7 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct drm_vc4_submit_cl *args, } static int -vc4_simulator_unpin_bos(struct drm_vc4_submit_cl *args, - struct exec_info *exec) +vc4_simulator_unpin_bos(struct exec_info *exec) { for (int i = 0; i < exec->bo_count; i++) { struct drm_gem_cma_object *obj = exec->bo[i]; @@ 
-102,9 +101,9 @@ vc4_simulator_unpin_bos(struct drm_vc4_submit_cl *args, } static int -vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args, - struct exec_info *exec) +vc4_cl_validate(struct drm_device *dev, struct exec_info *exec) { + struct drm_vc4_submit_cl *args = exec->args; void *temp = NULL; void *bin, *render, *shader_rec; int ret = 0; @@ -112,12 +111,14 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args, uint32_t render_offset = bin_offset + args->bin_cl_len; uint32_t shader_rec_offset = roundup(render_offset + args->render_cl_len, 16); - uint32_t exec_size = shader_rec_offset + args->shader_record_len; + uint32_t uniforms_offset = shader_rec_offset + args->shader_record_len; + uint32_t exec_size = uniforms_offset + args->uniforms_len; uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) * args->shader_record_count); if (shader_rec_offset < render_offset || - exec_size < shader_rec_offset || + uniforms_offset < shader_rec_offset || + exec_size < uniforms_offset || args->shader_record_count >= (UINT_MAX / sizeof(struct vc4_shader_state)) || temp_size < exec_size) { @@ -142,6 +143,7 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args, bin = temp + bin_offset; render = temp + render_offset; shader_rec = temp + shader_rec_offset; + exec->uniforms_u = temp + uniforms_offset; exec->shader_state = temp + exec_size; exec->shader_state_size = args->shader_record_count; @@ -164,6 +166,13 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args, goto fail; } + ret = copy_from_user(exec->uniforms_u, args->uniforms, + args->uniforms_len); + if (ret) { + DRM_ERROR("Failed to copy in uniforms cl\n"); + goto fail; + } + exec->exec_bo = drm_gem_cma_create(dev, exec_size); #if 0 if (IS_ERR(exec->exec_bo)) { @@ -180,6 +189,10 @@ vc4_cl_validate(struct drm_device *dev, struct drm_vc4_submit_cl *args, exec->ct1ea = exec->ct1ca + args->render_cl_len; exec->shader_paddr = 
exec->exec_bo->paddr + shader_rec_offset; + exec->uniforms_v = exec->exec_bo->vaddr + uniforms_offset; + exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset; + exec->uniforms_size = args->uniforms_len; + ret = vc4_validate_cl(dev, exec->exec_bo->vaddr + bin_offset, bin, @@ -243,18 +256,20 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args, } } - ret = vc4_simulator_pin_bos(dev, args, &exec); + exec.args = args; + + ret = vc4_simulator_pin_bos(dev, &exec); if (ret) return ret; - ret = vc4_cl_validate(dev, args, &exec); + ret = vc4_cl_validate(dev, &exec); if (ret) return ret; simpenrose_do_binning(exec.ct0ca, exec.ct0ea); simpenrose_do_rendering(exec.ct1ca, exec.ct1ea); - ret = vc4_simulator_unpin_bos(args, &exec); + ret = vc4_simulator_unpin_bos(&exec); if (ret) return ret; diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.c b/src/gallium/drivers/vc4/vc4_simulator_validate.c index 14701b171c7..a67e2345b11 100644 --- a/src/gallium/drivers/vc4/vc4_simulator_validate.c +++ b/src/gallium/drivers/vc4/vc4_simulator_validate.c @@ -347,6 +347,30 @@ vc4_validate_cl(struct drm_device *dev, return 0; } +static bool +reloc_tex(struct exec_info *exec, + void *uniform_data_u, + struct vc4_texture_sample_info *sample, + uint32_t texture_handle_index) + +{ + struct drm_gem_cma_object *tex; + uint32_t unvalidated_p0 = *(uint32_t *)(uniform_data_u + + sample->p_offset[0]); + uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0]; + + if (texture_handle_index >= exec->bo_count) { + DRM_ERROR("texture handle index %d >= %d\n", + texture_handle_index, exec->bo_count); + return false; + } + tex = exec->bo[texture_handle_index]; + + *validated_p0 = tex->paddr + unvalidated_p0; + + return true; +} + static int validate_shader_rec(struct drm_device *dev, struct exec_info *exec, @@ -358,45 +382,54 @@ validate_shader_rec(struct drm_device *dev, uint32_t *src_handles = unvalidated; void *src_pkt; void *dst_pkt = validated; - static const 
int gl_bo_offsets[] = { - 4, 8, /* fs code, ubo */ - 16, 20, /* vs code, ubo */ - 28, 32, /* cs code, ubo */ + enum shader_rec_reloc_type { + RELOC_CODE, + RELOC_VBO, + }; + struct shader_rec_reloc { + enum shader_rec_reloc_type type; + uint32_t offset; + }; + static const struct shader_rec_reloc gl_relocs[] = { + { RELOC_CODE, 4 }, /* fs */ + { RELOC_CODE, 16 }, /* vs */ + { RELOC_CODE, 28 }, /* cs */ }; - static const int nv_bo_offsets[] = { - 4, 8, /* fs code, ubo */ - 12, /* vbo */ + static const struct shader_rec_reloc nv_relocs[] = { + { RELOC_CODE, 4 }, /* fs */ + { RELOC_VBO, 12 } }; - struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_bo_offsets) + 8]; - const int *bo_offsets; - uint32_t nr_attributes = 0, nr_bo, packet_size; + const struct shader_rec_reloc *relocs; + struct drm_gem_cma_object *bo[ARRAY_SIZE(gl_relocs) + 8]; + uint32_t nr_attributes = 0, nr_relocs, packet_size; int i; + struct vc4_validated_shader_info *validated_shader = NULL; if (state->packet == VC4_PACKET_NV_SHADER_STATE) { - bo_offsets = nv_bo_offsets; - nr_bo = ARRAY_SIZE(nv_bo_offsets); + relocs = nv_relocs; + nr_relocs = ARRAY_SIZE(nv_relocs); packet_size = 16; } else { - bo_offsets = gl_bo_offsets; - nr_bo = ARRAY_SIZE(gl_bo_offsets); + relocs = gl_relocs; + nr_relocs = ARRAY_SIZE(gl_relocs); nr_attributes = state->addr & 0x7; if (nr_attributes == 0) nr_attributes = 8; packet_size = 36 + nr_attributes * 8; } - if ((nr_bo + nr_attributes) * 4 + packet_size > len) { + if ((nr_relocs + nr_attributes) * 4 + packet_size > len) { DRM_ERROR("overflowed shader packet read " "(handles %d, packet %d, len %d)\n", - (nr_bo + nr_attributes) * 4, packet_size, len); + (nr_relocs + nr_attributes) * 4, packet_size, len); return -EINVAL; } - src_pkt = unvalidated + 4 * (nr_bo + nr_attributes); + src_pkt = unvalidated + 4 * (nr_relocs + nr_attributes); memcpy(dst_pkt, src_pkt, packet_size); - for (i = 0; i < nr_bo + nr_attributes; i++) { + for (i = 0; i < nr_relocs + nr_attributes; i++) { if 
(src_handles[i] >= exec->bo_count) { DRM_ERROR("shader rec bo index %d > %d\n", src_handles[i], exec->bo_count); @@ -405,21 +438,73 @@ validate_shader_rec(struct drm_device *dev, bo[i] = exec->bo[src_handles[i]]; } - for (i = 0; i < nr_bo; i++) { - /* XXX: validation */ - uint32_t o = bo_offsets[i]; - *(uint32_t *)(dst_pkt + o) = - bo[i]->paddr + *(uint32_t *)(src_pkt + o); + for (i = 0; i < nr_relocs; i++) { + uint32_t o = relocs[i].offset; + uint32_t src_offset = *(uint32_t *)(src_pkt + o); + *(uint32_t *)(dst_pkt + o) = bo[i]->paddr + src_offset; + uint32_t *texture_handles_u; + void *uniform_data_u; + uint32_t tex; + + switch (relocs[i].type) { + case RELOC_CODE: + kfree(validated_shader); + validated_shader = vc4_validate_shader(bo[i], + src_offset); + if (!validated_shader) + goto fail; + + if (validated_shader->uniforms_src_size > + exec->uniforms_size) { + DRM_ERROR("Uniforms src buffer overflow\n"); + goto fail; + } + + texture_handles_u = exec->uniforms_u; + uniform_data_u = (texture_handles_u + + validated_shader->num_texture_samples); + + memcpy(exec->uniforms_v, uniform_data_u, + validated_shader->uniforms_size); + + for (tex = 0; + tex < validated_shader->num_texture_samples; + tex++) { + if (!reloc_tex(exec, + uniform_data_u, + &validated_shader->texture_samples[tex], + texture_handles_u[tex])) { + goto fail; + } + } + + *(uint32_t *)(dst_pkt + o + 4) = exec->uniforms_p; + + exec->uniforms_u += validated_shader->uniforms_src_size; + exec->uniforms_v += validated_shader->uniforms_size; + exec->uniforms_p += validated_shader->uniforms_size; + + break; + + case RELOC_VBO: + break; + } } for (i = 0; i < nr_attributes; i++) { /* XXX: validation */ uint32_t o = 36 + i * 8; *(uint32_t *)(dst_pkt + o) = - bo[nr_bo + i]->paddr + *(uint32_t *)(src_pkt + o); + bo[nr_relocs + i]->paddr + *(uint32_t *)(src_pkt + o); } + kfree(validated_shader); + return 0; + +fail: + kfree(validated_shader); + return -EINVAL; } int diff --git 
a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h index 4a2a2181ab4..885a754a9d5 100644 --- a/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -26,15 +26,20 @@ #include #include +#include #include #include #include +#include "vc4_context.h" +#include "vc4_qpu_defines.h" + #define DRM_INFO(...) fprintf(stderr, __VA_ARGS__) #define DRM_ERROR(...) fprintf(stderr, __VA_ARGS__) #define kmalloc(size, arg) malloc(size) +#define kcalloc(size, count, arg) calloc(size, count) #define kfree(ptr) free(ptr) -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define krealloc(ptr, size, args) realloc(ptr, size) #define roundup(x, y) align(x, y) static inline int @@ -64,6 +69,9 @@ struct drm_gem_cma_object { }; struct exec_info { + /* Kernel-space copy of the ioctl arguments */ + struct drm_vc4_submit_cl *args; + /* This is the array of BOs that were looked up at the start of exec. * Command validation will use indices into this array. */ @@ -79,9 +87,8 @@ struct exec_info { uint32_t bo_index[2]; uint32_t max_width, max_height; - /** - * This is the BO where we store the validated command lists - * and shader records. + /* This is the BO where we store the validated command lists, shader + * records, and uniforms. */ struct drm_gem_cma_object *exec_bo; @@ -108,6 +115,50 @@ struct exec_info { uint32_t ct0ca, ct0ea; uint32_t ct1ca, ct1ea; uint32_t shader_paddr; + + /* Pointers to the uniform data. These pointers are incremented, and + * size decremented, as each batch of uniforms is uploaded. + */ + void *uniforms_u; + void *uniforms_v; + uint32_t uniforms_p; + uint32_t uniforms_size; +}; + +/** + * struct vc4_texture_sample_info - saves the offsets into the UBO for texture + * setup parameters. + * + * This will be used at draw time to relocate the reference to the texture + * contents in p0, and validate that the offset combined with + * width/height/stride/etc. 
from p1 and p2/p3 doesn't sample outside the BO. + * Note that the hardware treats unprovided config parameters as 0, so not all + * of them need to be set up for every texture sample, and we'll store ~0 as + * the offset to mark the unused ones. + * + * See the VC4 3D architecture guide page 41 ("Texture and Memory Lookup Unit + * Setup") for definitions of the texture parameters. + */ +struct vc4_texture_sample_info { + uint32_t p_offset[4]; +}; + +/** + * struct vc4_validated_shader_info - information about validated shaders that + * needs to be used from command list validation. + * + * For a given shader, each time a shader state record references it, we need + * to verify that the shader doesn't read more uniforms than the shader state + * record's uniform BO pointer can provide, and we need to apply relocations + * and validate the shader state record's uniforms that define the texture + * samples. + */ +struct vc4_validated_shader_info +{ + uint32_t uniforms_size; + uint32_t uniforms_src_size; + uint32_t num_texture_samples; + struct vc4_texture_sample_info *texture_samples; }; int vc4_validate_cl(struct drm_device *dev, @@ -123,4 +174,8 @@ int vc4_validate_shader_recs(struct drm_device *dev, uint32_t len, struct exec_info *exec); +struct vc4_validated_shader_info * +vc4_validate_shader(struct drm_gem_cma_object *shader_obj, + uint32_t start_offset); + #endif /* VC4_SIMULATOR_VALIDATE_H */ diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate_shaders.c b/src/gallium/drivers/vc4/vc4_simulator_validate_shaders.c new file mode 100644 index 00000000000..c02deb406c7 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_simulator_validate_shaders.c @@ -0,0 +1,334 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, 
modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * DOC: Shader validator for VC4. + * + * The VC4 has no IOMMU between it and system memory. So, a user with access + * to execute shaders could escalate privilege by overwriting system memory + * (using the VPM write address register in the general-purpose DMA mode) or + * reading system memory it shouldn't (reading it as a texture, or uniform + * data, or vertex data). + * + * This walks over a shader starting from some offset within a BO, ensuring + * that its accesses are appropriately bounded, and recording how many texture + * accesses are made and where so that we can do relocations for them in the + * uniform stream. + * + * The kernel API has shaders stored in user-mapped BOs. The BOs will be + * forcibly unmapped from the process before validation, and any cache of + * validated state will be flushed if the mapping is faulted back in. + * + * Storing the shaders in BOs means that the validation process will be slow + * due to uncached reads, but since shaders are long-lived and shader BOs are + * never actually modified, this shouldn't be a problem. 
+ */ + +#include "vc4_simulator_validate.h" +#include "vc4_qpu.h" +#include "vc4_qpu_defines.h" + +struct vc4_shader_validation_state { + struct vc4_texture_sample_info tmu_setup[2]; + int tmu_write_count[2]; +}; + +static bool +is_tmu_write(uint32_t waddr) +{ + return (waddr >= QPU_W_TMU0_S && + waddr <= QPU_W_TMU1_B); +} + +static bool +check_register_write(uint32_t waddr, bool is_b) +{ + switch (waddr) { + case QPU_W_UNIFORMS_ADDRESS: + /* XXX: We'll probably need to support this for reladdr, but + * it's definitely a security-related one. + */ + DRM_ERROR("uniforms address load unsupported\n"); + return false; + + case QPU_W_TLB_COLOR_MS: + case QPU_W_TLB_COLOR_ALL: + case QPU_W_TLB_Z: + /* XXX: We need to track which buffers get written by the + * shader, to make sure that we have those buffers set up by + * the config packets. But we need to pass them for now to + * get things up and running. + */ + return true; + + case QPU_W_TMU0_S: + case QPU_W_TMU0_T: + case QPU_W_TMU0_R: + case QPU_W_TMU0_B: + case QPU_W_TMU1_S: + case QPU_W_TMU1_T: + case QPU_W_TMU1_R: + case QPU_W_TMU1_B: + /* XXX: We need to track where the uniforms get loaded for + * texturing so that we can do relocations, and to validate + * those uniform contents. + */ + return true; + + case QPU_W_HOST_INT: + case QPU_W_TMU_NOSWAP: + case QPU_W_TLB_STENCIL_SETUP: + case QPU_W_TLB_ALPHA_MASK: + case QPU_W_MUTEX_RELEASE: + /* XXX: I haven't thought about these, so don't support them + * for now. + */ + DRM_ERROR("Unsupported waddr %d\n", waddr); + return false; + + case QPU_W_VPM_ADDR: + DRM_ERROR("General VPM DMA unsupported\n"); + return false; + + case QPU_W_VPM: + case QPU_W_VPMVCD_SETUP: + /* We allow VPM setup in general, even including VPM DMA + * configuration setup, because the (unsafe) DMA can only be + * triggered by QPU_W_VPM_ADDR writes. 
+ */ + return true; + } + + return true; +} + +static bool +record_validated_texture_sample(struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state, + int tmu) +{ + uint32_t s = validated_shader->num_texture_samples; + int i; + struct vc4_texture_sample_info *temp_samples; + + temp_samples = krealloc(validated_shader->texture_samples, + (s + 1) * sizeof(*temp_samples), + GFP_KERNEL); + if (!temp_samples) + return false; + + memcpy(temp_samples[s].p_offset, + validation_state->tmu_setup[tmu].p_offset, + validation_state->tmu_write_count[tmu] * sizeof(uint32_t)); + for (i = validation_state->tmu_write_count[tmu]; i < 4; i++) + temp_samples[s].p_offset[i] = ~0; + + validated_shader->num_texture_samples = s + 1; + validated_shader->texture_samples = temp_samples; + + return true; +} + +static bool +check_tmu_writes(uint64_t inst, + struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state, + uint32_t waddr) +{ + int tmu = waddr > QPU_W_TMU0_B; + + if (!is_tmu_write(waddr)) + return true; + + if (validation_state->tmu_write_count[tmu] >= 4) { + DRM_ERROR("TMU%d got too many parameters before dispatch\n", + tmu); + return false; + } + validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] = + validated_shader->uniforms_size; + validation_state->tmu_write_count[tmu]++; + validated_shader->uniforms_size += 4; + + if (waddr == QPU_W_TMU0_S || waddr == QPU_W_TMU1_S) { + if (!record_validated_texture_sample(validated_shader, + validation_state, tmu)) { + return false; + } + + validation_state->tmu_write_count[tmu] = 0; + } + + return true; +} + +static bool +check_instruction_writes(uint64_t inst, + struct vc4_validated_shader_info *validated_shader, + struct vc4_shader_validation_state *validation_state) +{ + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + bool ws = inst & QPU_WS; 
+ + if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) { + DRM_ERROR("ADD and MUL both set up textures\n"); + return false; + } + + if (!check_tmu_writes(inst, validated_shader, validation_state, + waddr_add)) { + return false; + } + + if (!check_tmu_writes(inst, validated_shader, validation_state, + waddr_mul)) { + return false; + } + + return (check_register_write(waddr_add, ws) && + check_register_write(waddr_mul, !ws)); +} + +static bool +check_instruction_reads(uint64_t inst, + struct vc4_validated_shader_info *validated_shader) +{ + uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD); + uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL); + uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); + uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + + if (raddr_a == QPU_R_UNIF || + raddr_b == QPU_R_UNIF) { + if (is_tmu_write(waddr_add) || is_tmu_write(waddr_mul)) { + DRM_ERROR("uniform read in the same instruction as " + "texture setup"); + return false; + } + + /* This can't overflow the uint32_t, because we're reading 8 + * bytes of instruction to increment by 4 here, so we'd + * already be OOM. 
+ */ + validated_shader->uniforms_size += 4; + } + + return true; +} + +struct vc4_validated_shader_info * +vc4_validate_shader(struct drm_gem_cma_object *shader_obj, + uint32_t start_offset) +{ + bool found_shader_end = false; + int shader_end_ip = 0; + uint32_t ip, max_ip; + uint64_t *shader; + struct vc4_validated_shader_info *validated_shader; + struct vc4_shader_validation_state validation_state; + + memset(&validation_state, 0, sizeof(validation_state)); + + if (start_offset + sizeof(uint64_t) > shader_obj->base.size) { + DRM_ERROR("shader starting at %d outside of BO sized %d\n", + start_offset, + shader_obj->base.size); + return NULL; + } + shader = shader_obj->vaddr + start_offset; + max_ip = (shader_obj->base.size - start_offset) / sizeof(uint64_t); + + validated_shader = kcalloc(sizeof(*validated_shader), 1, GFP_KERNEL); + if (!validated_shader) + return NULL; + + for (ip = 0; ip < max_ip; ip++) { + uint64_t inst = shader[ip]; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); + + switch (sig) { + case QPU_SIG_NONE: + case QPU_SIG_WAIT_FOR_SCOREBOARD: + case QPU_SIG_SCOREBOARD_UNLOCK: + case QPU_SIG_LOAD_TMU0: + case QPU_SIG_LOAD_TMU1: + if (!check_instruction_writes(inst, validated_shader, + &validation_state)) { + DRM_ERROR("Bad write at ip %d\n", ip); + goto fail; + } + + if (!check_instruction_reads(inst, validated_shader)) + goto fail; + + break; + + case QPU_SIG_LOAD_IMM: + if (!check_instruction_writes(inst, validated_shader, + &validation_state)) { + DRM_ERROR("Bad LOAD_IMM write at ip %d\n", ip); + goto fail; + } + break; + + case QPU_SIG_PROG_END: + found_shader_end = true; + shader_end_ip = ip; + break; + + default: + DRM_ERROR("Unsupported QPU signal %d at " + "instruction %d\n", sig, ip); + goto fail; + } + + /* There are two delay slots after program end is signaled + * that are still executed, then we're finished. 
+ */ + if (found_shader_end && ip == shader_end_ip + 2) + break; + } + + if (ip == max_ip) { + DRM_ERROR("shader starting at %d failed to terminate before " + "shader BO end at %d\n", + start_offset, + shader_obj->base.size); + goto fail; + } + + /* Again, no chance of integer overflow here because the worst case + * scenario is 8 bytes of uniforms plus handles per 8-byte + * instruction. + */ + validated_shader->uniforms_src_size = + (validated_shader->uniforms_size + + 4 * validated_shader->num_texture_samples); + + return validated_shader; + +fail: + kfree(validated_shader); + return NULL; +}