From: Alyssa Rosenzweig Date: Wed, 5 Aug 2020 20:16:00 +0000 (-0400) Subject: panfrost: Rename encoder/ to lib/ X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=1c62b5528ab09731388670202fa4a6ca5aa96534;p=mesa.git panfrost: Rename encoder/ to lib/ We'll want both encoding and decoding here, as a generic hardware interface library based on GenXML. Signed-off-by: Alyssa Rosenzweig Part-of: --- diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build index c3181e7fab3..89f877a5bb2 100644 --- a/src/gallium/drivers/panfrost/meson.build +++ b/src/gallium/drivers/panfrost/meson.build @@ -73,5 +73,5 @@ libpanfrost = static_library( driver_panfrost = declare_dependency( compile_args : compile_args_panfrost, - link_with : [libpanfrost, libpanfrostwinsys, libpanfrost_shared, libpanfrost_midgard, libpanfrost_bifrost, libpanfrost_decode, libpanfrost_encoder], + link_with : [libpanfrost, libpanfrostwinsys, libpanfrost_shared, libpanfrost_midgard, libpanfrost_bifrost, libpanfrost_decode, libpanfrost_lib], ) diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index 8be2bb6f6f1..1e313c999db 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -53,7 +53,7 @@ #include "pan_blend_shaders.h" #include "pan_cmdstream.h" #include "pan_util.h" -#include "pandecode/decode.h" +#include "decode.h" #include "util/pan_lower_framebuffer.h" struct midgard_tiler_descriptor diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c index d59aa22f049..10bca05b26a 100644 --- a/src/gallium/drivers/panfrost/pan_job.c +++ b/src/gallium/drivers/panfrost/pan_job.c @@ -36,7 +36,7 @@ #include "util/rounding.h" #include "pan_util.h" #include "pan_blending.h" -#include "pandecode/decode.h" +#include "decode.h" #include "panfrost-quirks.h" /* panfrost_bo_access is here to help us keep track of batch accesses to BOs diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c index 5d651ee44d1..e647a6c7f8b 100644 --- a/src/gallium/drivers/panfrost/pan_resource.c +++ b/src/gallium/drivers/panfrost/pan_resource.c @@ -47,7 +47,7 @@ #include "pan_resource.h" #include "pan_util.h" #include "pan_tiling.h" -#include "pandecode/decode.h" +#include "decode.h" #include "panfrost-quirks.h" static struct pipe_resource * diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index c2e4b9beb65..eed910fb3f8 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -48,7 +48,7 @@ #include "pan_resource.h" #include "pan_public.h" #include "pan_util.h" -#include "pandecode/decode.h" +#include "decode.h" #include "pan_context.h" #include "midgard/midgard_compile.h" diff --git a/src/panfrost/bifrost/test/bi_submit.c b/src/panfrost/bifrost/test/bi_submit.c index fac31b016e3..38539c09cbc 100644 --- a/src/panfrost/bifrost/test/bi_submit.c +++ b/src/panfrost/bifrost/test/bi_submit.c @@ -25,7 +25,7 @@ */ #include "bit.h" -#include "panfrost/pandecode/decode.h" +#include "panfrost/lib/decode.h" #include "drm-uapi/panfrost_drm.h" #include "panfrost/encoder/pan_encoder.h" diff --git a/src/panfrost/encoder/meson.build b/src/panfrost/encoder/meson.build deleted file mode 100644 index 754e7ce246c..00000000000 --- a/src/panfrost/encoder/meson.build +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright © 2018 Rob Clark -# Copyright © 2019 Collabora - -# 
Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -libpanfrost_encoder_files = files( - 'pan_encoder.h', - - 'pan_afbc.c', - 'pan_attributes.c', - 'pan_bo.c', - 'pan_blit.c', - 'pan_format.c', - 'pan_invocation.c', - 'pan_sampler.c', - 'pan_tiler.c', - 'pan_texture.c', - 'pan_scoreboard.c', - 'pan_scratch.c', - 'pan_pool.c', - 'pan_props.c', -) - -libpanfrost_encoder = static_library( - 'panfrost_encoder', - [libpanfrost_encoder_files], - include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], - c_args : [no_override_init_args], - gnu_symbol_visibility : 'hidden', - dependencies: [dep_libdrm, idep_nir], - build_by_default : false, -) diff --git a/src/panfrost/encoder/pan_afbc.c b/src/panfrost/encoder/pan_afbc.c deleted file mode 100644 index f1f62baffc9..00000000000 --- a/src/panfrost/encoder/pan_afbc.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Alyssa Rosenzweig - */ - -#include "pan_texture.h" - -/* Arm FrameBuffer Compression (AFBC) is a lossless compression scheme natively - * implemented in Mali GPUs (as well as many display controllers paired with - * Mali GPUs, etc). Where possible, Panfrost prefers to use AFBC for both - * rendering and texturing. 
In most cases, this is a performance-win due to a - * dramatic reduction in memory bandwidth and cache locality compared to a - * linear resources. - * - * AFBC divides the framebuffer into 16x16 tiles (other sizes possible, TODO: - * do we need to support this?). So, the width and height each must be aligned - * up to 16 pixels. This is inherently good for performance; note that for a 4 - * byte-per-pixel format like RGBA8888, that means that rows are 16*4=64 byte - * aligned, which is the cache-line size. - * - * For each AFBC-compressed resource, there is a single contiguous - * (CPU/GPU-shared) buffer. This buffer itself is divided into two parts: - * header and body, placed immediately after each other. - * - * The AFBC header contains 16 bytes of metadata per tile. - * - * The AFBC body is the same size as the original linear resource (padded to - * the nearest tile). Although the body comes immediately after the header, it - * must also be cache-line aligned, so there can sometimes be a bit of padding - * between the header and body. - * - * As an example, a 64x64 RGBA framebuffer contains 64/16 = 4 tiles horizontally and - * 4 tiles vertically. There are 4*4=16 tiles in total, each containing 16 - * bytes of metadata, so there is a 16*16=256 byte header. 64x64 is already - * tile aligned, so the body is 64*64 * 4 bytes per pixel = 16384 bytes of - * body. - * - * From userspace, Panfrost needs to be able to calculate these sizes. It - * explicitly does not and can not know the format of the data contained within - * this header and body. The GPU has native support for AFBC encode/decode. For - * an internal FBO or a framebuffer used for scanout with an AFBC-compatible - * winsys/display-controller, the buffer is maintained AFBC throughout flight, - * and the driver never needs to know the internal data. For edge cases where - * the driver really does need to read/write from the AFBC resource, we - * generate a linear staging buffer and use the GPU to blit AFBC<--->linear. - * TODO: Implement me. */ - -#define AFBC_TILE_WIDTH 16 -#define AFBC_TILE_HEIGHT 16 -#define AFBC_HEADER_BYTES_PER_TILE 16 -#define AFBC_CACHE_ALIGN 64 - -/* Is it possible to AFBC compress a particular format? Common formats (and - * YUV) are compressible. Some obscure formats are not and fallback on linear, - * at a performance hit. Also, if you need to disable AFBC entirely in the - * driver for debug/profiling, just always return false here. */ - -bool -panfrost_format_supports_afbc(enum pipe_format format) -{ - const struct util_format_description *desc = - util_format_description(format); - - /* sRGB cannot be AFBC, but it can be tiled. TODO: Verify. The blob - * does not do AFBC for SRGB8_ALPHA8, but it's not clear why it - * shouldn't be able to. 
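
As a concrete check of the layout rules described in the comment above, here is a minimal C sketch of the sizing arithmetic for the 64x64 RGBA8888 example (the helper name is mine; the constants restate the AFBC_* defines from this file):

#include <assert.h>

/* 16x16 tiles, 16 header bytes per tile, header rounded up to the 64-byte
 * cache line, body = tile-aligned linear size. */
static void
afbc_sizing_sketch(void)
{
        unsigned width = 64, height = 64, bpp = 4;   /* RGBA8888 */

        unsigned tiles_x = (width  + 15) / 16;
        unsigned tiles_y = (height + 15) / 16;

        unsigned header = tiles_x * tiles_y * 16;    /* 16 bytes of metadata per tile */
        header = (header + 63) & ~63u;               /* pad header to a cache line */

        unsigned body = (tiles_x * 16) * (tiles_y * 16) * bpp;

        assert(header == 256);     /* 4*4 tiles * 16 bytes, already aligned */
        assert(body == 16384);     /* 64 * 64 * 4, already tile aligned */
}
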
*/ - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) - return false; - - if (util_format_is_rgba8_variant(desc)) - return true; - - /* Only Z24S8 variants are compressible as Z/S */ - - if (panfrost_is_z24s8_variant(format)) - return true; - - /* Lookup special formats */ - switch (format) { - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_B8G8R8_UNORM: - case PIPE_FORMAT_R5G6B5_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - return true; - default: - return false; - } -} - -unsigned -panfrost_afbc_header_size(unsigned width, unsigned height) -{ - /* Align to tile */ - unsigned aligned_width = ALIGN_POT(width, AFBC_TILE_WIDTH); - unsigned aligned_height = ALIGN_POT(height, AFBC_TILE_HEIGHT); - - /* Compute size in tiles, rather than pixels */ - unsigned tile_count_x = aligned_width / AFBC_TILE_WIDTH; - unsigned tile_count_y = aligned_height / AFBC_TILE_HEIGHT; - unsigned tile_count = tile_count_x * tile_count_y; - - /* Multiply to find the header size */ - unsigned header_bytes = tile_count * AFBC_HEADER_BYTES_PER_TILE; - - /* Align and go */ - return ALIGN_POT(header_bytes, AFBC_CACHE_ALIGN); - -} diff --git a/src/panfrost/encoder/pan_attributes.c b/src/panfrost/encoder/pan_attributes.c deleted file mode 100644 index d0d79486185..00000000000 --- a/src/panfrost/encoder/pan_attributes.c +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include "util/u_math.h" -#include "panfrost-job.h" -#include "pan_encoder.h" - -/* This file handles attribute descriptors (mali_attr_meta). The - * bulk of the complexity is from instancing. See mali_job for - * notes on how this works. But basically, for small vertex - * counts, we have a lookup table, and for large vertex counts, - * we look at the high bits as a heuristic. This has to match - * exactly how the hardware calculates this (which is why the - * algorithm is so weird) or else instancing will break. 
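
To make the lookup-table/heuristic split concrete, the following sketch records a few values hand-traced from the two functions below; nothing here comes from hardware documentation, and the function name is mine:

#include <assert.h>
#include "pan_encoder.h"

static void
padded_vertex_count_sketch(void)
{
        /* Small path (lookup table): below 20, only 11, 13, 15 and 19 are
         * padded, each to the next even count. */
        assert(panfrost_padded_vertex_count(11) == 12);
        assert(panfrost_padded_vertex_count(17) == 17);

        /* Large path (top-nibble heuristic): rounds up to a count of the
         * form {9, 10, 12, 14, 16} * 2^k. */
        assert(panfrost_padded_vertex_count(100) == 112);   /* 14 * 2^3 */
        assert(panfrost_padded_vertex_count(120) == 128);   /* 16 * 2^3 */
}
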
*/ - -/* Given an odd number (of the form 2k + 1), compute k */ -#define ODD(odd) ((odd - 1) >> 1) - -static unsigned -panfrost_small_padded_vertex_count(unsigned idx) -{ - if (idx == 11 || idx == 13 || idx == 15 || idx == 19) - return idx + 1; - else - return idx; -} - -static unsigned -panfrost_large_padded_vertex_count(uint32_t vertex_count) -{ - /* First, we have to find the highest set one */ - unsigned highest = 32 - __builtin_clz(vertex_count); - - /* Using that, we mask out the highest 4-bits */ - unsigned n = highest - 4; - unsigned nibble = (vertex_count >> n) & 0xF; - - /* Great, we have the nibble. Now we can just try possibilities. Note - * that we don't care about the bottom most bit in most cases, and we - * know the top bit must be 1 */ - - unsigned middle_two = (nibble >> 1) & 0x3; - - switch (middle_two) { - case 0b00: - if (!(nibble & 1)) - return (1 << n) * 9; - else - return (1 << (n + 1)) * 5; - case 0b01: - return (1 << (n + 2)) * 3; - case 0b10: - return (1 << (n + 1)) * 7; - case 0b11: - return (1 << (n + 4)); - default: - return 0; /* unreachable */ - } -} - -unsigned -panfrost_padded_vertex_count(unsigned vertex_count) -{ - if (vertex_count < 20) - return panfrost_small_padded_vertex_count(vertex_count); - else - return panfrost_large_padded_vertex_count(vertex_count); -} - -/* The much, much more irritating case -- instancing is enabled. See - * panfrost_job.h for notes on how this works */ - -static unsigned -panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags) -{ - /* We have a NPOT divisor. Here's the fun one (multipling by - * the inverse and shifting) */ - - /* floor(log2(d)) */ - unsigned shift = util_logbase2(hw_divisor); - - /* m = ceil(2^(32 + shift) / d) */ - uint64_t shift_hi = 32 + shift; - uint64_t t = 1ll << shift_hi; - double t_f = t; - double hw_divisor_d = hw_divisor; - double m_f = ceil(t_f / hw_divisor_d); - unsigned m = m_f; - - /* Default case */ - uint32_t magic_divisor = m; - - /* e = 2^(shift + 32) % d */ - uint64_t e = t % hw_divisor; - - /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob - * seems to use a different condition */ - if (e <= (1ll << shift)) { - magic_divisor = m - 1; - *extra_flags = 1; - } - - /* Top flag implicitly set */ - assert(magic_divisor & (1u << 31)); - magic_divisor &= ~(1u << 31); - *o_shift = shift; - - return magic_divisor; -} - -unsigned -panfrost_vertex_instanced( - unsigned padded_count, - unsigned instance_shift, unsigned instance_odd, - unsigned divisor, - union mali_attr *attrs) -{ - /* Depending if there is an instance divisor or not, packing varies. - * When there is a divisor, the hardware-level divisor is actually the - * product of the instance divisor and the padded count */ - - unsigned hw_divisor = padded_count * divisor; - - if (divisor == 0) { - /* Per-vertex attributes use the MODULO mode. 
First, compute - * the modulus */ - - attrs->elements |= MALI_ATTR_MODULO; - attrs->shift = instance_shift; - attrs->extra_flags = instance_odd; - - return 1; - } else if (util_is_power_of_two_or_zero(hw_divisor)) { - /* If there is a divisor but the hardware divisor works out to - * a power of two (not terribly exceptional), we can use an - * easy path (just shifting) */ - - attrs->elements |= MALI_ATTR_POT_DIVIDE; - attrs->shift = __builtin_ctz(hw_divisor); - - return 1; - } else { - unsigned shift = 0, extra_flags = 0; - - attrs[1].magic_divisor = - panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags); - - /* Upload to two different slots */ - - attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE; - attrs[0].shift = shift; - attrs[0].extra_flags = extra_flags; - - attrs[1].unk = 0x20; - attrs[1].zero = 0; - attrs[1].divisor = divisor; - - return 2; - } -} - -/* Records for gl_VertexID and gl_InstanceID use a slightly special encoding, - * but the idea is the same */ - -void -panfrost_vertex_id( - unsigned padded_count, - union mali_attr *attr) -{ - /* We factor the padded count as shift/odd and that's it */ - - attr->elements = MALI_ATTR_VERTEXID; - attr->shift = __builtin_ctz(padded_count); - attr->extra_flags = padded_count >> (attr->shift + 1); - attr->stride = attr->size = 0; -} - -void -panfrost_instance_id( - unsigned padded_count, - union mali_attr *attr) -{ - attr->elements = MALI_ATTR_INSTANCEID; - attr->stride = 0; - attr->extra_flags = 0; - attr->size = 0; - - /* POT records have just a shift directly with an off-by-one for - * unclear reasons. NPOT records have a magic divisor smushed into the - * stride field (which is unused for these special records) */ - - if (util_is_power_of_two_or_zero(padded_count)) { - attr->shift = __builtin_ctz(padded_count) - 1; - } else { - unsigned shift = 0, flags = 0; - - attr->stride = panfrost_compute_magic_divisor(padded_count, &shift, &flags); - attr->shift = shift; - attr->extra_flags = flags; - } -} - diff --git a/src/panfrost/encoder/pan_blit.c b/src/panfrost/encoder/pan_blit.c deleted file mode 100644 index ece664bb5b9..00000000000 --- a/src/panfrost/encoder/pan_blit.c +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright (C) 2020 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
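
One way to convince yourself of the magic-divisor math in pan_attributes.c above: for divisors where e = 2^(32 + shift) mod d exceeds 2^shift (so the round-down flag stays clear), the multiply-and-shift must reproduce exact integer division. A brute-force sketch for d = 11, with values hand-derived; the function name is mine:

#include <assert.h>
#include <stdint.h>

static void
magic_divisor_sketch(void)
{
        const uint32_t d = 11;            /* NPOT divisor with e = 10 > 2^shift */
        const unsigned shift = 3;         /* floor(log2(11)) */
        const uint64_t t = 1ull << (32 + shift);
        const uint32_t m = (uint32_t)((t + d - 1) / d);   /* ceil(t / d) */

        assert(t % d > (1ull << shift));  /* round-down adjustment not needed */
        assert(m & (1u << 31));           /* top bit set, stored implicitly */

        /* Sampled sweep of the 32-bit vertex range, stepping by a prime. */
        for (uint64_t v = 0; v <= UINT32_MAX; v += 4093)
                assert((uint32_t)((v * m) >> (32 + shift)) == v / d);
}
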
- * - * Authors: - * Alyssa Rosenzweig - */ - -#include -#include -#include "pan_encoder.h" -#include "pan_pool.h" -#include "pan_scoreboard.h" -#include "pan_texture.h" -#include "panfrost-quirks.h" -#include "../midgard/midgard_compile.h" -#include "compiler/nir/nir_builder.h" -#include "util/u_math.h" - -/* On Midgard, the native blit infrastructure (via MFBD preloads) is broken or - * missing in many cases. We instead use software paths as fallbacks to - * implement blits, which are done as TILER jobs. No vertex shader is - * necessary since we can supply screen-space coordinates directly. - * - * This is primarily designed as a fallback for preloads but could be extended - * for other clears/blits if needed in the future. */ - -static void -panfrost_build_blit_shader(panfrost_program *program, unsigned gpu_id, gl_frag_result loc, nir_alu_type T, bool ms) -{ - bool is_colour = loc >= FRAG_RESULT_DATA0; - - nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_FRAGMENT, &midgard_nir_options, NULL); - nir_function *fn = nir_function_create(shader, "main"); - nir_function_impl *impl = nir_function_impl_create(fn); - - nir_variable *c_src = nir_variable_create(shader, nir_var_shader_in, glsl_vector_type(GLSL_TYPE_FLOAT, 2), "coord"); - nir_variable *c_out = nir_variable_create(shader, nir_var_shader_out, glsl_vector_type( - GLSL_TYPE_FLOAT, is_colour ? 4 : 1), "out"); - - c_src->data.location = VARYING_SLOT_TEX0; - c_out->data.location = loc; - - nir_builder _b; - nir_builder *b = &_b; - nir_builder_init(b, impl); - b->cursor = nir_before_block(nir_start_block(impl)); - - nir_ssa_def *coord = nir_load_var(b, c_src); - - nir_tex_instr *tex = nir_tex_instr_create(shader, ms ? 3 : 1); - - tex->dest_type = T; - - if (ms) { - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(nir_f2i32(b, coord)); - tex->coord_components = 2; - - tex->src[1].src_type = nir_tex_src_ms_index; - tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b)); - - tex->src[2].src_type = nir_tex_src_lod; - tex->src[2].src = nir_src_for_ssa(nir_imm_int(b, 0)); - tex->sampler_dim = GLSL_SAMPLER_DIM_MS; - tex->op = nir_texop_txf_ms; - } else { - tex->op = nir_texop_tex; - - tex->src[0].src_type = nir_tex_src_coord; - tex->src[0].src = nir_src_for_ssa(coord); - tex->coord_components = 2; - - tex->sampler_dim = GLSL_SAMPLER_DIM_2D; - } - - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); - nir_builder_instr_insert(b, &tex->instr); - - if (is_colour) - nir_store_var(b, c_out, &tex->dest.ssa, 0xFF); - else - nir_store_var(b, c_out, nir_channel(b, &tex->dest.ssa, 0), 0xFF); - - midgard_compile_shader_nir(shader, program, false, 0, gpu_id, false, true); -} - -/* Compile and upload all possible blit shaders ahead-of-time to reduce draw - * time overhead. There's only ~30 of them at the moment, so this is fine */ - -void -panfrost_init_blit_shaders(struct panfrost_device *dev) -{ - static const struct { - gl_frag_result loc; - unsigned types; - } shader_descs[] = { - { FRAG_RESULT_DEPTH, 1 << PAN_BLIT_FLOAT }, - { FRAG_RESULT_STENCIL, 1 << PAN_BLIT_UINT }, - { FRAG_RESULT_DATA0, ~0 }, - { FRAG_RESULT_DATA1, ~0 }, - { FRAG_RESULT_DATA2, ~0 }, - { FRAG_RESULT_DATA3, ~0 }, - { FRAG_RESULT_DATA4, ~0 }, - { FRAG_RESULT_DATA5, ~0 }, - { FRAG_RESULT_DATA6, ~0 }, - { FRAG_RESULT_DATA7, ~0 } - }; - - nir_alu_type nir_types[PAN_BLIT_NUM_TYPES] = { - nir_type_float, - nir_type_uint, - nir_type_int - }; - - /* Total size = # of shaders * bytes per shader. 
There are - * shaders for each RT (so up to DATA7 -- overestimate is - * okay) and up to NUM_TYPES variants of each, * 2 for multisampling - * variants. These shaders are simple enough that they should be less - * than 8 quadwords each (again, overestimate is fine). */ - - unsigned offset = 0; - unsigned total_size = (FRAG_RESULT_DATA7 * PAN_BLIT_NUM_TYPES) - * (8 * 16) * 2; - - dev->blit_shaders.bo = panfrost_bo_create(dev, total_size, PAN_BO_EXECUTE); - - /* Don't bother generating multisampling variants if we don't actually - * support multisampling */ - bool has_ms = !(dev->quirks & MIDGARD_SFBD); - - for (unsigned ms = 0; ms <= has_ms; ++ms) { - for (unsigned i = 0; i < ARRAY_SIZE(shader_descs); ++i) { - unsigned loc = shader_descs[i].loc; - - for (enum pan_blit_type T = 0; T < PAN_BLIT_NUM_TYPES; ++T) { - if (!(shader_descs[i].types & (1 << T))) - continue; - - panfrost_program program; - panfrost_build_blit_shader(&program, dev->gpu_id, loc, - nir_types[T], ms); - - assert(offset + program.compiled.size < total_size); - memcpy(dev->blit_shaders.bo->cpu + offset, program.compiled.data, program.compiled.size); - - dev->blit_shaders.loads[loc][T][ms] = (dev->blit_shaders.bo->gpu + offset) | program.first_tag; - offset += ALIGN_POT(program.compiled.size, 64); - util_dynarray_fini(&program.compiled); - } - } - } -} - -/* Add a shader-based load on Midgard (draw-time for GL). Shaders are - * precached */ - -void -panfrost_load_midg( - struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - mali_ptr blend_shader, - mali_ptr fbd, - mali_ptr coordinates, unsigned vertex_count, - struct pan_image *image, - unsigned loc) -{ - unsigned width = u_minify(image->width0, image->first_level); - unsigned height = u_minify(image->height0, image->first_level); - - struct mali_viewport viewport = { - .clip_minx = -INFINITY, - .clip_miny = -INFINITY, - .clip_maxx = INFINITY, - .clip_maxy = INFINITY, - .clip_minz = 0.0, - .clip_maxz = 1.0, - - .viewport0 = { 0, 0 }, - .viewport1 = { MALI_POSITIVE(width), MALI_POSITIVE(height) } - }; - - union mali_attr varying = { - .elements = coordinates | MALI_ATTR_LINEAR, - .stride = 4 * sizeof(float), - .size = 4 * sizeof(float) * vertex_count, - }; - - struct mali_attr_meta varying_meta = { - .index = 0, - .unknown1 = 2, - .swizzle = (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3), - .format = MALI_RGBA32F - }; - - struct mali_stencil_test stencil = { - .mask = 0xFF, - .func = MALI_FUNC_ALWAYS, - .sfail = MALI_STENCIL_REPLACE, - .dpfail = MALI_STENCIL_REPLACE, - .dppass = MALI_STENCIL_REPLACE, - }; - - union midgard_blend replace = { - .equation = { - .rgb_mode = 0x122, - .alpha_mode = 0x122, - .color_mask = MALI_MASK_R | MALI_MASK_G | MALI_MASK_B | MALI_MASK_A, - } - }; - - if (blend_shader) - replace.shader = blend_shader; - - /* Determine the sampler type needed. Stencil is always sampled as - * UINT. Pure (U)INT is always (U)INT. Everything else is FLOAT. */ - - enum pan_blit_type T = - (loc == FRAG_RESULT_STENCIL) ? PAN_BLIT_UINT : - (util_format_is_pure_uint(image->format)) ? PAN_BLIT_UINT : - (util_format_is_pure_sint(image->format)) ? 
PAN_BLIT_INT : - PAN_BLIT_FLOAT; - - bool ms = image->nr_samples > 1; - - struct mali_shader_meta shader_meta = { - .shader = pool->dev->blit_shaders.loads[loc][T][ms], - .sampler_count = 1, - .texture_count = 1, - .varying_count = 1, - .midgard1 = { - .flags_lo = 0x20, - .work_count = 4, - }, - .coverage_mask = 0xF, - .unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10, - .unknown2_4 = 0x4e0, - .stencil_mask_front = ~0, - .stencil_mask_back = ~0, - .stencil_front = stencil, - .stencil_back = stencil, - .blend = { - .shader = blend_shader - } - }; - - if (ms) - shader_meta.unknown2_3 |= MALI_HAS_MSAA | MALI_PER_SAMPLE; - else - shader_meta.unknown2_4 |= MALI_NO_MSAA; - - assert(shader_meta.shader); - - if (pool->dev->quirks & MIDGARD_SFBD) { - shader_meta.unknown2_4 |= (0x10 | MALI_NO_DITHER); - shader_meta.blend = replace; - - if (loc < FRAG_RESULT_DATA0) - shader_meta.blend.equation.color_mask = 0x0; - } - - if (loc == FRAG_RESULT_DEPTH) { - shader_meta.midgard1.flags_lo |= MALI_WRITES_Z; - shader_meta.unknown2_3 |= MALI_DEPTH_WRITEMASK; - } else if (loc == FRAG_RESULT_STENCIL) { - shader_meta.midgard1.flags_hi |= MALI_WRITES_S; - shader_meta.unknown2_4 |= MALI_STENCIL_TEST; - } else { - shader_meta.midgard1.flags_lo |= MALI_EARLY_Z; - } - - /* Create the texture descriptor. We partially compute the base address - * ourselves to account for layer, such that the texture descriptor - * itself is for a 2D texture with array size 1 even for 3D/array - * textures, removing the need to separately key the blit shaders for - * 2D and 3D variants */ - - struct panfrost_transfer texture_t = panfrost_pool_alloc(pool, sizeof(struct mali_texture_descriptor) + sizeof(mali_ptr) * 2 * MAX2(image->nr_samples, 1)); - - panfrost_new_texture(texture_t.cpu, - image->width0, image->height0, - MAX2(image->nr_samples, 1), 1, - image->format, MALI_TEX_2D, - image->layout, - image->first_level, image->last_level, - 0, 0, - image->nr_samples, - 0, - (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9), - image->bo->gpu + image->first_layer * - panfrost_get_layer_stride(image->slices, - image->type == MALI_TEX_3D, - image->cubemap_stride, image->first_level), - image->slices); - - struct mali_sampler_descriptor sampler = { - .filter_mode = MALI_SAMP_MAG_NEAREST | MALI_SAMP_MIN_NEAREST, - .wrap_s = MALI_WRAP_CLAMP_TO_EDGE, - .wrap_t = MALI_WRAP_CLAMP_TO_EDGE, - .wrap_r = MALI_WRAP_CLAMP_TO_EDGE, - }; - - struct panfrost_transfer shader_meta_t = panfrost_pool_alloc(pool, sizeof(shader_meta) + 8 * sizeof(struct midgard_blend_rt)); - memcpy(shader_meta_t.cpu, &shader_meta, sizeof(shader_meta)); - - for (unsigned i = 0; i < 8; ++i) { - void *dest = shader_meta_t.cpu + sizeof(shader_meta) + sizeof(struct midgard_blend_rt) * i; - - if (loc == (FRAG_RESULT_DATA0 + i)) { - struct midgard_blend_rt blend_rt = { - .flags = 0x200 | MALI_BLEND_NO_DITHER, - .blend = replace, - }; - - if (util_format_is_srgb(image->format)) - blend_rt.flags |= MALI_BLEND_SRGB; - - if (blend_shader) { - blend_rt.flags |= MALI_BLEND_MRT_SHADER; - blend_rt.blend.shader = blend_shader; - } - - memcpy(dest, &blend_rt, sizeof(struct midgard_blend_rt)); - } else { - memset(dest, 0x0, sizeof(struct midgard_blend_rt)); - } - } - - struct midgard_payload_vertex_tiler payload = { - .prefix = { - .draw_mode = MALI_TRIANGLES, - .unknown_draw = 0x3000, - .index_count = MALI_POSITIVE(vertex_count) - }, - .postfix = { - .gl_enables = 0x7, - .position_varying = coordinates, - .textures = 
panfrost_pool_upload(pool, &texture_t.gpu, sizeof(texture_t.gpu)), - .sampler_descriptor = panfrost_pool_upload(pool, &sampler, sizeof(sampler)), - .shader = shader_meta_t.gpu, - .varyings = panfrost_pool_upload(pool, &varying, sizeof(varying)), - .varying_meta = panfrost_pool_upload(pool, &varying_meta, sizeof(varying_meta)), - .viewport = panfrost_pool_upload(pool, &viewport, sizeof(viewport)), - .shared_memory = fbd - } - }; - - panfrost_pack_work_groups_compute(&payload.prefix, 1, vertex_count, 1, 1, 1, 1, true); - payload.prefix.workgroups_x_shift_3 = 6; - - panfrost_new_job(pool, scoreboard, JOB_TYPE_TILER, false, 0, &payload, sizeof(payload), true); -} diff --git a/src/panfrost/encoder/pan_bo.c b/src/panfrost/encoder/pan_bo.c deleted file mode 100644 index 71bc109060d..00000000000 --- a/src/panfrost/encoder/pan_bo.c +++ /dev/null @@ -1,514 +0,0 @@ -/* - * Copyright 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors (Collabora): - * Alyssa Rosenzweig - */ -#include -#include -#include -#include -#include -#include "drm-uapi/panfrost_drm.h" - -#include "pan_bo.h" -#include "pan_util.h" -#include "../pandecode/public.h" - -#include "os/os_mman.h" - -#include "util/u_inlines.h" -#include "util/u_math.h" - -/* This file implements a userspace BO cache. Allocating and freeing - * GPU-visible buffers is very expensive, and even the extra kernel roundtrips - * adds more work than we would like at this point. So caching BOs in userspace - * solves both of these problems and does not require kernel updates. - * - * Cached BOs are sorted into a bucket based on rounding their size down to the - * nearest power-of-two. Each bucket contains a linked list of free panfrost_bo - * objects. Putting a BO into the cache is accomplished by adding it to the - * corresponding bucket. Getting a BO from the cache consists of finding the - * appropriate bucket and sorting. A cache eviction is a kernel-level free of a - * BO and removing it from the bucket. We special case evicting all BOs from - * the cache, since that's what helpful in practice and avoids extra logic - * around the linked list. 
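
The bucketing scheme described above is easy to restate: round the BO size down to a power of two, clamp into the cache's 4KB..4MB range, and rebase to zero. A sketch with hand-checked examples follows; the bounds restate MIN/MAX_BO_CACHE_BUCKET from pan_device.h and the names here are mine:

#include <assert.h>

static unsigned
bucket_index_sketch(unsigned size)
{
        unsigned idx = 31 - __builtin_clz(size);   /* floor(log2(size)) */
        if (idx > 22)
                idx = 22;       /* huge allocations share the 4MB bucket */
        return idx - 12;        /* callers pre-round sizes to >= 4096 */
}

static void
bucket_index_examples(void)
{
        assert(bucket_index_sketch(4096) == 0);         /* 4KB page     */
        assert(bucket_index_sketch(3u << 20) == 9);     /* 3MB -> 2^21  */
        assert(bucket_index_sketch(100u << 20) == 10);  /* clamped, 4MB */
}
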
- */ - -static struct panfrost_bo * -panfrost_bo_alloc(struct panfrost_device *dev, size_t size, - uint32_t flags) -{ - struct drm_panfrost_create_bo create_bo = { .size = size }; - struct panfrost_bo *bo; - int ret; - - if (dev->kernel_version->version_major > 1 || - dev->kernel_version->version_minor >= 1) { - if (flags & PAN_BO_GROWABLE) - create_bo.flags |= PANFROST_BO_HEAP; - if (!(flags & PAN_BO_EXECUTE)) - create_bo.flags |= PANFROST_BO_NOEXEC; - } - - ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); - if (ret) { - fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); - return NULL; - } - - bo = pan_lookup_bo(dev, create_bo.handle); - assert(!memcmp(bo, &((struct panfrost_bo){}), sizeof(*bo))); - - bo->size = create_bo.size; - bo->gpu = create_bo.offset; - bo->gem_handle = create_bo.handle; - bo->flags = flags; - bo->dev = dev; - return bo; -} - -static void -panfrost_bo_free(struct panfrost_bo *bo) -{ - struct drm_gem_close gem_close = { .handle = bo->gem_handle }; - int ret; - - ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); - if (ret) { - fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); - assert(0); - } - - /* BO will be freed with the sparse array, but zero to indicate free */ - memset(bo, 0, sizeof(*bo)); -} - -/* Returns true if the BO is ready, false otherwise. - * access_type is encoding the type of access one wants to ensure is done. - * Waiting is always done for writers, but if wait_readers is set then readers - * are also waited for. - */ -bool -panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) -{ - struct drm_panfrost_wait_bo req = { - .handle = bo->gem_handle, - .timeout_ns = timeout_ns, - }; - int ret; - - /* If the BO has been exported or imported we can't rely on the cached - * state, we need to call the WAIT_BO ioctl. - */ - if (!(bo->flags & PAN_BO_SHARED)) { - /* If ->gpu_access is 0, the BO is idle, no need to wait. */ - if (!bo->gpu_access) - return true; - - /* If the caller only wants to wait for writers and no - * writes are pending, we don't have to wait. - */ - if (!wait_readers && !(bo->gpu_access & PAN_BO_ACCESS_WRITE)) - return true; - } - - /* The ioctl returns >= 0 value when the BO we are waiting for is ready - * -1 otherwise. - */ - ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); - if (ret != -1) { - /* Set gpu_access to 0 so that the next call to bo_wait() - * doesn't have to call the WAIT_BO ioctl. - */ - bo->gpu_access = 0; - return true; - } - - /* If errno is not ETIMEDOUT or EBUSY that means the handle we passed - * is invalid, which shouldn't happen here. 
- */ - assert(errno == ETIMEDOUT || errno == EBUSY); - return false; -} - -/* Helper to calculate the bucket index of a BO */ - -static unsigned -pan_bucket_index(unsigned size) -{ - /* Round down to POT to compute a bucket index */ - - unsigned bucket_index = util_logbase2(size); - - /* Clamp the bucket index; all huge allocations will be - * sorted into the largest bucket */ - - bucket_index = MIN2(bucket_index, MAX_BO_CACHE_BUCKET); - - /* The minimum bucket size must equal the minimum allocation - * size; the maximum we clamped */ - - assert(bucket_index >= MIN_BO_CACHE_BUCKET); - assert(bucket_index <= MAX_BO_CACHE_BUCKET); - - /* Reindex from 0 */ - return (bucket_index - MIN_BO_CACHE_BUCKET); -} - -static struct list_head * -pan_bucket(struct panfrost_device *dev, unsigned size) -{ - return &dev->bo_cache.buckets[pan_bucket_index(size)]; -} - -/* Tries to fetch a BO of sufficient size with the appropriate flags from the - * BO cache. If it succeeds, it returns that BO and removes the BO from the - * cache. If it fails, it returns NULL signaling the caller to allocate a new - * BO. */ - -static struct panfrost_bo * -panfrost_bo_cache_fetch(struct panfrost_device *dev, - size_t size, uint32_t flags, bool dontwait) -{ - pthread_mutex_lock(&dev->bo_cache.lock); - struct list_head *bucket = pan_bucket(dev, size); - struct panfrost_bo *bo = NULL; - - /* Iterate the bucket looking for something suitable */ - list_for_each_entry_safe(struct panfrost_bo, entry, bucket, - bucket_link) { - if (entry->size < size || entry->flags != flags) - continue; - - if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, - PAN_BO_ACCESS_RW)) - continue; - - struct drm_panfrost_madvise madv = { - .handle = entry->gem_handle, - .madv = PANFROST_MADV_WILLNEED, - }; - int ret; - - /* This one works, splice it out of the cache */ - list_del(&entry->bucket_link); - list_del(&entry->lru_link); - - ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); - if (!ret && !madv.retained) { - panfrost_bo_free(entry); - continue; - } - /* Let's go! */ - bo = entry; - break; - } - pthread_mutex_unlock(&dev->bo_cache.lock); - - return bo; -} - -static void -panfrost_bo_cache_evict_stale_bos(struct panfrost_device *dev) -{ - struct timespec time; - - clock_gettime(CLOCK_MONOTONIC, &time); - list_for_each_entry_safe(struct panfrost_bo, entry, - &dev->bo_cache.lru, lru_link) { - /* We want all entries that have been used more than 1 sec - * ago to be dropped, others can be kept. - * Note the <= 2 check and not <= 1. It's here to account for - * the fact that we're only testing ->tv_sec, not ->tv_nsec. - * That means we might keep entries that are between 1 and 2 - * seconds old, but we don't really care, as long as unused BOs - * are dropped at some point. - */ - if (time.tv_sec - entry->last_used <= 2) - break; - - list_del(&entry->bucket_link); - list_del(&entry->lru_link); - panfrost_bo_free(entry); - } -} - -/* Tries to add a BO to the cache. 
Returns if it was - * successful */ - -static bool -panfrost_bo_cache_put(struct panfrost_bo *bo) -{ - struct panfrost_device *dev = bo->dev; - - if (bo->flags & PAN_BO_SHARED) - return false; - - pthread_mutex_lock(&dev->bo_cache.lock); - struct list_head *bucket = pan_bucket(dev, MAX2(bo->size, 4096)); - struct drm_panfrost_madvise madv; - struct timespec time; - - madv.handle = bo->gem_handle; - madv.madv = PANFROST_MADV_DONTNEED; - madv.retained = 0; - - drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); - - /* Add us to the bucket */ - list_addtail(&bo->bucket_link, bucket); - - /* Add us to the LRU list and update the last_used field. */ - list_addtail(&bo->lru_link, &dev->bo_cache.lru); - clock_gettime(CLOCK_MONOTONIC, &time); - bo->last_used = time.tv_sec; - - /* Let's do some cleanup in the BO cache while we hold the - * lock. - */ - panfrost_bo_cache_evict_stale_bos(dev); - pthread_mutex_unlock(&dev->bo_cache.lock); - - return true; -} - -/* Evicts all BOs from the cache. Called during context - * destroy or during low-memory situations (to free up - * memory that may be unused by us just sitting in our - * cache, but still reserved from the perspective of the - * OS) */ - -void -panfrost_bo_cache_evict_all( - struct panfrost_device *dev) -{ - pthread_mutex_lock(&dev->bo_cache.lock); - for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) { - struct list_head *bucket = &dev->bo_cache.buckets[i]; - - list_for_each_entry_safe(struct panfrost_bo, entry, bucket, - bucket_link) { - list_del(&entry->bucket_link); - list_del(&entry->lru_link); - panfrost_bo_free(entry); - } - } - pthread_mutex_unlock(&dev->bo_cache.lock); -} - -void -panfrost_bo_mmap(struct panfrost_bo *bo) -{ - struct drm_panfrost_mmap_bo mmap_bo = { .handle = bo->gem_handle }; - int ret; - - if (bo->cpu) - return; - - ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo); - if (ret) { - fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n"); - assert(0); - } - - bo->cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, - bo->dev->fd, mmap_bo.offset); - if (bo->cpu == MAP_FAILED) { - fprintf(stderr, "mmap failed: %p %m\n", bo->cpu); - assert(0); - } -} - -static void -panfrost_bo_munmap(struct panfrost_bo *bo) -{ - if (!bo->cpu) - return; - - if (os_munmap((void *) (uintptr_t)bo->cpu, bo->size)) { - perror("munmap"); - abort(); - } - - bo->cpu = NULL; -} - -struct panfrost_bo * -panfrost_bo_create(struct panfrost_device *dev, size_t size, - uint32_t flags) -{ - struct panfrost_bo *bo; - - /* Kernel will fail (confusingly) with EPERM otherwise */ - assert(size > 0); - - /* To maximize BO cache usage, don't allocate tiny BOs */ - size = MAX2(size, 4096); - - /* GROWABLE BOs cannot be mmapped */ - if (flags & PAN_BO_GROWABLE) - assert(flags & PAN_BO_INVISIBLE); - - /* Before creating a BO, we first want to check the cache but without - * waiting for BO readiness (BOs in the cache can still be referenced - * by jobs that are not finished yet). - * If the cached allocation fails we fall back on fresh BO allocation, - * and if that fails too, we try one more time to allocate from the - * cache, but this time we accept to wait. - */ - bo = panfrost_bo_cache_fetch(dev, size, flags, true); - if (!bo) - bo = panfrost_bo_alloc(dev, size, flags); - if (!bo) - bo = panfrost_bo_cache_fetch(dev, size, flags, false); - - if (!bo) - fprintf(stderr, "BO creation failed\n"); - - assert(bo); - - /* Only mmap now if we know we need to. 
For CPU-invisible buffers, we - * never map since we don't care about their contents; they're purely - * for GPU-internal use. But we do trace them anyway. */ - - if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) - panfrost_bo_mmap(bo); - - p_atomic_set(&bo->refcnt, 1); - - if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { - if (flags & PAN_BO_INVISIBLE) - pandecode_inject_mmap(bo->gpu, NULL, bo->size, NULL); - else if (!(flags & PAN_BO_DELAY_MMAP)) - pandecode_inject_mmap(bo->gpu, bo->cpu, bo->size, NULL); - } - - return bo; -} - -void -panfrost_bo_reference(struct panfrost_bo *bo) -{ - if (bo) { - ASSERTED int count = p_atomic_inc_return(&bo->refcnt); - assert(count != 1); - } -} - -void -panfrost_bo_unreference(struct panfrost_bo *bo) -{ - if (!bo) - return; - - /* Don't return to cache if there are still references */ - if (p_atomic_dec_return(&bo->refcnt)) - return; - - struct panfrost_device *dev = bo->dev; - - pthread_mutex_lock(&dev->bo_map_lock); - - /* Someone might have imported this BO while we were waiting for the - * lock, let's make sure it's still not referenced before freeing it. - */ - if (p_atomic_read(&bo->refcnt) == 0) { - /* When the reference count goes to zero, we need to cleanup */ - panfrost_bo_munmap(bo); - - /* Rather than freeing the BO now, we'll cache the BO for later - * allocations if we're allowed to. - */ - if (!panfrost_bo_cache_put(bo)) - panfrost_bo_free(bo); - - } - pthread_mutex_unlock(&dev->bo_map_lock); -} - -struct panfrost_bo * -panfrost_bo_import(struct panfrost_device *dev, int fd) -{ - struct panfrost_bo *bo; - struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; - ASSERTED int ret; - unsigned gem_handle; - - ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); - assert(!ret); - - pthread_mutex_lock(&dev->bo_map_lock); - bo = pan_lookup_bo(dev, gem_handle); - - if (!bo->dev) { - get_bo_offset.handle = gem_handle; - ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); - assert(!ret); - - bo->dev = dev; - bo->gpu = (mali_ptr) get_bo_offset.offset; - bo->size = lseek(fd, 0, SEEK_END); - bo->flags = PAN_BO_SHARED; - bo->gem_handle = gem_handle; - assert(bo->size > 0); - p_atomic_set(&bo->refcnt, 1); - // TODO map and unmap on demand? - panfrost_bo_mmap(bo); - } else { - /* bo->refcnt == 0 can happen if the BO - * was being released but panfrost_bo_import() acquired the - * lock before panfrost_bo_unreference(). In that case, refcnt - * is 0 and we can't use panfrost_bo_reference() directly, we - * have to re-initialize the refcnt(). - * Note that panfrost_bo_unreference() checks - * refcnt value just after acquiring the lock to - * make sure the object is not freed if panfrost_bo_import() - * acquired it in the meantime. - */ - if (p_atomic_read(&bo->refcnt) == 0) - p_atomic_set(&bo->refcnt, 1); - else - panfrost_bo_reference(bo); - assert(bo->cpu); - } - pthread_mutex_unlock(&dev->bo_map_lock); - - return bo; -} - -int -panfrost_bo_export(struct panfrost_bo *bo) -{ - struct drm_prime_handle args = { - .handle = bo->gem_handle, - .flags = DRM_CLOEXEC, - }; - - int ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args); - if (ret == -1) - return -1; - - bo->flags |= PAN_BO_SHARED; - return args.fd; -} - diff --git a/src/panfrost/encoder/pan_bo.h b/src/panfrost/encoder/pan_bo.h deleted file mode 100644 index 360b102de34..00000000000 --- a/src/panfrost/encoder/pan_bo.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * © Copyright 2019 Alyssa Rosenzweig - * © Copyright 2019 Collabora, Ltd. 
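
Putting the pan_bo.c paths above together, a hypothetical caller's lifecycle looks like this (the function is mine; the API calls are the ones defined above):

#include <string.h>
#include "pan_bo.h"

static void
bo_lifecycle_sketch(struct panfrost_device *dev)
{
        /* No flags: goes through the cache-fetch / fresh-alloc / waiting
         * cache-fetch cascade and is mmapped immediately. */
        struct panfrost_bo *bo = panfrost_bo_create(dev, 65536, 0);

        memset(bo->cpu, 0, bo->size);    /* CPU writes through the mapping */

        panfrost_bo_reference(bo);       /* e.g. a batch takes a reference */
        panfrost_bo_unreference(bo);     /* batch retires */
        panfrost_bo_unreference(bo);     /* refcnt hits 0: unmapped and
                                            parked in the cache, not freed */
}
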
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#ifndef __PAN_BO_H__ -#define __PAN_BO_H__ - -#include "util/list.h" -#include "pan_device.h" -#include - -/* Flags for allocated memory */ - -/* This memory region is executable */ -#define PAN_BO_EXECUTE (1 << 0) - -/* This memory region should be lazily allocated and grow-on-page-fault. Must - * be used in conjunction with INVISIBLE */ -#define PAN_BO_GROWABLE (1 << 1) - -/* This memory region should not be mapped to the CPU */ -#define PAN_BO_INVISIBLE (1 << 2) - -/* This region may not be used immediately and will not mmap on allocate - * (semantically distinct from INVISIBLE, which cannot never be mmaped) */ -#define PAN_BO_DELAY_MMAP (1 << 3) - -/* BO is shared across processes (imported or exported) and therefore cannot be - * cached locally */ -#define PAN_BO_SHARED (1 << 4) - -/* GPU access flags */ - -/* BO is either shared (can be accessed by more than one GPU batch) or private - * (reserved by a specific GPU job). */ -#define PAN_BO_ACCESS_PRIVATE (0 << 0) -#define PAN_BO_ACCESS_SHARED (1 << 0) - -/* BO is being read/written by the GPU */ -#define PAN_BO_ACCESS_READ (1 << 1) -#define PAN_BO_ACCESS_WRITE (1 << 2) -#define PAN_BO_ACCESS_RW (PAN_BO_ACCESS_READ | PAN_BO_ACCESS_WRITE) - -/* BO is accessed by the vertex/tiler job. */ -#define PAN_BO_ACCESS_VERTEX_TILER (1 << 3) - -/* BO is accessed by the fragment job. */ -#define PAN_BO_ACCESS_FRAGMENT (1 << 4) - -struct panfrost_bo { - /* Must be first for casting */ - struct list_head bucket_link; - - /* Used to link the BO to the BO cache LRU list. */ - struct list_head lru_link; - - /* Store the time this BO was use last, so the BO cache logic can evict - * stale BOs. - */ - time_t last_used; - - /* Atomic reference count */ - int32_t refcnt; - - struct panfrost_device *dev; - - /* Mapping for the entire object (all levels) */ - uint8_t *cpu; - - /* GPU address for the object */ - mali_ptr gpu; - - /* Size of all entire trees */ - size_t size; - - int gem_handle; - - uint32_t flags; - - /* Combination of PAN_BO_ACCESS_{READ,WRITE} flags encoding pending - * GPU accesses to this BO. Useful to avoid calling the WAIT_BO ioctl - * when the BO is idle. 
- */ - uint32_t gpu_access; -}; - -bool -panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers); -void -panfrost_bo_reference(struct panfrost_bo *bo); -void -panfrost_bo_unreference(struct panfrost_bo *bo); -struct panfrost_bo * -panfrost_bo_create(struct panfrost_device *dev, size_t size, - uint32_t flags); -void -panfrost_bo_mmap(struct panfrost_bo *bo); -struct panfrost_bo * -panfrost_bo_import(struct panfrost_device *dev, int fd); -int -panfrost_bo_export(struct panfrost_bo *bo); -void -panfrost_bo_cache_evict_all(struct panfrost_device *dev); - -#endif /* __PAN_BO_H__ */ diff --git a/src/panfrost/encoder/pan_device.h b/src/panfrost/encoder/pan_device.h deleted file mode 100644 index b84c8e7cdae..00000000000 --- a/src/panfrost/encoder/pan_device.h +++ /dev/null @@ -1,147 +0,0 @@ -/************************************************************************** - * - * Copyright 2018-2019 Alyssa Rosenzweig - * Copyright 2018-2019 Collabora, Ltd. - * Copyright © 2015 Intel Corporation - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#ifndef PAN_DEVICE_H -#define PAN_DEVICE_H - -#include -#include "renderonly/renderonly.h" -#include "util/u_dynarray.h" -#include "util/bitset.h" -#include "util/list.h" -#include "util/sparse_array.h" - -#include - -/* Driver limits */ -#define PAN_MAX_CONST_BUFFERS 16 - -/* Transient slab size. This is a balance between fragmentation against cache - * locality and ease of bookkeeping */ - -#define TRANSIENT_SLAB_PAGES (32) /* 128kb */ -#define TRANSIENT_SLAB_SIZE (4096 * TRANSIENT_SLAB_PAGES) - -/* Maximum number of transient slabs so we don't need dynamic arrays. Most - * interesting Mali boards are 4GB RAM max, so if the entire RAM was filled - * with transient slabs, you could never exceed (4GB / TRANSIENT_SLAB_SIZE) - * allocations anyway. By capping, we can use a fixed-size bitset for tracking - * free slabs, eliminating quite a bit of complexity. We can pack the free - * state of 8 slabs into a single byte, so for 128kb transient slabs the bitset - * occupies a cheap 4kb of memory */ - -#define MAX_TRANSIENT_SLABS (1024*1024 / TRANSIENT_SLAB_PAGES) - -/* How many power-of-two levels in the BO cache do we want? 
2^12 - * minimum chosen as it is the page size that all allocations are - * rounded to */ - -#define MIN_BO_CACHE_BUCKET (12) /* 2^12 = 4KB */ -#define MAX_BO_CACHE_BUCKET (22) /* 2^22 = 4MB */ - -/* Fencepost problem, hence the off-by-one */ -#define NR_BO_CACHE_BUCKETS (MAX_BO_CACHE_BUCKET - MIN_BO_CACHE_BUCKET + 1) - -/* Cache for blit shaders. Defined here so they can be cached with the device */ - -enum pan_blit_type { - PAN_BLIT_FLOAT = 0, - PAN_BLIT_UINT, - PAN_BLIT_INT, - PAN_BLIT_NUM_TYPES, -}; - -#define PAN_BLIT_NUM_TARGETS (12) - -struct pan_blit_shaders { - struct panfrost_bo *bo; - mali_ptr loads[PAN_BLIT_NUM_TARGETS][PAN_BLIT_NUM_TYPES][2]; -}; - -struct panfrost_device { - /* For ralloc */ - void *memctx; - - int fd; - - /* Properties of the GPU in use */ - unsigned gpu_id; - unsigned core_count; - unsigned thread_tls_alloc; - unsigned quirks; - - /* Bitmask of supported compressed texture formats */ - uint32_t compressed_formats; - - /* debug flags, see pan_util.h how to interpret */ - unsigned debug; - - drmVersionPtr kernel_version; - - struct renderonly *ro; - - pthread_mutex_t bo_map_lock; - struct util_sparse_array bo_map; - - struct { - pthread_mutex_t lock; - - /* List containing all cached BOs sorted in LRU (Least - * Recently Used) order. This allows us to quickly evict BOs - * that are more than 1 second old. - */ - struct list_head lru; - - /* The BO cache is a set of buckets with power-of-two sizes - * ranging from 2^12 (4096, the page size) to - * 2^(12 + MAX_BO_CACHE_BUCKETS). - * Each bucket is a linked list of free panfrost_bo objects. */ - - struct list_head buckets[NR_BO_CACHE_BUCKETS]; - } bo_cache; - - struct pan_blit_shaders blit_shaders; -}; - -void -panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev); - -void -panfrost_close_device(struct panfrost_device *dev); - -bool -panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt); - -static inline struct panfrost_bo * -pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle) -{ - return util_sparse_array_get(&dev->bo_map, gem_handle); -} - -#endif diff --git a/src/panfrost/encoder/pan_encoder.h b/src/panfrost/encoder/pan_encoder.h deleted file mode 100644 index 42ec8320acb..00000000000 --- a/src/panfrost/encoder/pan_encoder.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
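
The slab and bucket arithmetic in pan_device.h above can be restated as compile-time checks (the SKETCH_ names are mine):

/* 32 pages of 4096 bytes = 128KB per transient slab. */
#define SKETCH_SLAB_SIZE (4096ull * 32)

/* A 4GB board holds at most 4GB / 128KB = 32768 slabs; one free-bit per
 * slab packs the whole free map into 32768 / 8 = 4096 bytes. */
_Static_assert((4096ull << 20) / SKETCH_SLAB_SIZE == 32768, "slab count");
_Static_assert(32768 / 8 == 4096, "free bitset fits in one page");

/* Fencepost: buckets for 2^12 .. 2^22 inclusive make 22 - 12 + 1 = 11. */
_Static_assert(22 - 12 + 1 == 11, "NR_BO_CACHE_BUCKETS");
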
- * - * Authors (Collabora): - * Alyssa Rosenzweig - */ - -#ifndef __PAN_ENCODER_H -#define __PAN_ENCODER_H - -#include -#include "panfrost-job.h" - -/* Invocation packing */ - -void -panfrost_pack_work_groups_compute( - struct mali_vertex_tiler_prefix *out, - unsigned num_x, - unsigned num_y, - unsigned num_z, - unsigned size_x, - unsigned size_y, - unsigned size_z, - bool quirk_graphics); - -void -panfrost_pack_work_groups_fused( - struct mali_vertex_tiler_prefix *vertex, - struct mali_vertex_tiler_prefix *tiler, - unsigned num_x, - unsigned num_y, - unsigned num_z, - unsigned size_x, - unsigned size_y, - unsigned size_z); - -/* Tiler structure size computation */ - -unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); - -unsigned -panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); - -unsigned -panfrost_choose_hierarchy_mask( - unsigned width, unsigned height, - unsigned vertex_count, bool hierarchy); - -/* Stack sizes */ - -unsigned -panfrost_get_stack_shift(unsigned stack_size); - -unsigned -panfrost_get_total_stack_size( - unsigned stack_shift, - unsigned threads_per_core, - unsigned core_count); - -/* Property queries */ - - -unsigned panfrost_query_gpu_version(int fd); -unsigned panfrost_query_core_count(int fd); -unsigned panfrost_query_thread_tls_alloc(int fd); - -const char * panfrost_model_name(unsigned gpu_id); - -/* Attributes / instancing */ - -unsigned -panfrost_padded_vertex_count(unsigned vertex_count); - -unsigned -panfrost_vertex_instanced( - unsigned padded_count, - unsigned instance_shift, unsigned instance_odd, - unsigned divisor, - union mali_attr *attrs); - -void panfrost_vertex_id(unsigned padded_count, union mali_attr *attr); -void panfrost_instance_id(unsigned padded_count, union mali_attr *attr); - -/* Samplers */ - -enum mali_func -panfrost_flip_compare_func(enum mali_func f); - - - -#endif diff --git a/src/panfrost/encoder/pan_format.c b/src/panfrost/encoder/pan_format.c deleted file mode 100644 index 76006164a36..00000000000 --- a/src/panfrost/encoder/pan_format.c +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Alyssa Rosenzweig - */ - -#include -#include "panfrost-job.h" -#include "pan_texture.h" - -/* Convenience */ - -#define _V PIPE_BIND_VERTEX_BUFFER -#define _T PIPE_BIND_SAMPLER_VIEW -#define _R PIPE_BIND_RENDER_TARGET -#define _Z PIPE_BIND_DEPTH_STENCIL -#define _VT (_V | _T) -#define _VTR (_V | _T | _R) -#define _TZ (_T | _Z) - -struct panfrost_format panfrost_pipe_format_table[PIPE_FORMAT_COUNT] = { - [PIPE_FORMAT_ETC1_RGB8] = { MALI_ETC2_RGB8, _T }, - [PIPE_FORMAT_ETC2_RGB8] = { MALI_ETC2_RGB8, _T }, - [PIPE_FORMAT_ETC2_SRGB8] = { MALI_ETC2_RGB8, _T }, - [PIPE_FORMAT_ETC2_R11_UNORM] = { MALI_ETC2_R11_UNORM, _T }, - [PIPE_FORMAT_ETC2_RGBA8] = { MALI_ETC2_RGBA8, _T }, - [PIPE_FORMAT_ETC2_SRGBA8] = { MALI_ETC2_RGBA8, _T }, - [PIPE_FORMAT_ETC2_RG11_UNORM] = { MALI_ETC2_RG11_UNORM, _T }, - [PIPE_FORMAT_ETC2_R11_SNORM] = { MALI_ETC2_R11_SNORM, _T }, - [PIPE_FORMAT_ETC2_RG11_SNORM] = { MALI_ETC2_RG11_SNORM, _T }, - [PIPE_FORMAT_ETC2_RGB8A1] = { MALI_ETC2_RGB8A1, _T }, - [PIPE_FORMAT_ETC2_SRGB8A1] = { MALI_ETC2_RGB8A1, _T }, - - [PIPE_FORMAT_DXT1_RGB] = { MALI_BC1_UNORM, _T }, - [PIPE_FORMAT_DXT1_RGBA] = { MALI_BC1_UNORM, _T }, - [PIPE_FORMAT_DXT1_SRGB] = { MALI_BC1_UNORM, _T }, - [PIPE_FORMAT_DXT1_SRGBA] = { MALI_BC1_UNORM, _T }, - [PIPE_FORMAT_DXT3_RGBA] = { MALI_BC2_UNORM, _T }, - [PIPE_FORMAT_DXT3_SRGBA] = { MALI_BC2_UNORM, _T }, - [PIPE_FORMAT_DXT5_RGBA] = { MALI_BC3_UNORM, _T }, - [PIPE_FORMAT_DXT5_SRGBA] = { MALI_BC3_UNORM, _T }, - - [PIPE_FORMAT_RGTC1_UNORM] = { MALI_BC4_UNORM, _T }, - [PIPE_FORMAT_RGTC1_SNORM] = { MALI_BC4_SNORM, _T }, - [PIPE_FORMAT_RGTC2_UNORM] = { MALI_BC5_UNORM, _T }, - [PIPE_FORMAT_RGTC2_SNORM] = { MALI_BC5_SNORM, _T }, - - [PIPE_FORMAT_BPTC_RGB_FLOAT] = { MALI_BC6H_SF16, _T }, - [PIPE_FORMAT_BPTC_RGB_UFLOAT] = { MALI_BC6H_UF16, _T }, - [PIPE_FORMAT_BPTC_RGBA_UNORM] = { MALI_BC7_UNORM, _T }, - [PIPE_FORMAT_BPTC_SRGBA] = { MALI_BC7_UNORM, _T }, - - [PIPE_FORMAT_ASTC_4x4] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_5x4] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_5x5] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_6x5] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_6x6] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_8x5] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_8x6] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_8x8] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_10x5] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_10x6] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_10x8] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_10x10] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_12x10] = { MALI_ASTC_2D_HDR, _T }, - [PIPE_FORMAT_ASTC_12x12] = { MALI_ASTC_2D_HDR, _T }, - - [PIPE_FORMAT_ASTC_4x4_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_5x4_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_5x5_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_6x5_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_6x6_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_8x5_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_8x6_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_8x8_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_10x5_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_10x6_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_10x8_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_10x10_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_12x10_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_ASTC_12x12_SRGB] = { MALI_ASTC_2D_LDR, _T }, - [PIPE_FORMAT_B5G6R5_UNORM] = { MALI_RGB565, _VTR }, - 
[PIPE_FORMAT_B5G5R5X1_UNORM] = { MALI_RGB5_X1_UNORM, _VT }, - [PIPE_FORMAT_R5G5B5A1_UNORM] = { MALI_RGB5_A1_UNORM, _VTR }, - - [PIPE_FORMAT_R10G10B10X2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR }, - [PIPE_FORMAT_B10G10R10X2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR }, - [PIPE_FORMAT_R10G10B10A2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR }, - [PIPE_FORMAT_B10G10R10A2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR }, - [PIPE_FORMAT_R10G10B10X2_SNORM] = { MALI_RGB10_A2_SNORM, _VT }, - [PIPE_FORMAT_R10G10B10A2_SNORM] = { MALI_RGB10_A2_SNORM, _VT }, - [PIPE_FORMAT_B10G10R10A2_SNORM] = { MALI_RGB10_A2_SNORM, _VT }, - [PIPE_FORMAT_R10G10B10A2_UINT] = { MALI_RGB10_A2UI, _VTR }, - [PIPE_FORMAT_B10G10R10A2_UINT] = { MALI_RGB10_A2UI, _VTR }, - [PIPE_FORMAT_R10G10B10A2_USCALED] = { MALI_RGB10_A2UI, _VTR }, - [PIPE_FORMAT_B10G10R10A2_USCALED] = { MALI_RGB10_A2UI, _VTR }, - [PIPE_FORMAT_R10G10B10A2_SINT] = { MALI_RGB10_A2I, _VTR}, - [PIPE_FORMAT_B10G10R10A2_SINT] = { MALI_RGB10_A2I, _VTR }, - [PIPE_FORMAT_R10G10B10A2_SSCALED] = { MALI_RGB10_A2I, _VTR }, - [PIPE_FORMAT_B10G10R10A2_SSCALED] = { MALI_RGB10_A2I, _VTR }, - - [PIPE_FORMAT_R8_SSCALED] = { MALI_R8I, _V }, - [PIPE_FORMAT_R8G8_SSCALED] = { MALI_RG8I, _V }, - [PIPE_FORMAT_R8G8B8_SSCALED] = { MALI_RGB8I, _V }, - [PIPE_FORMAT_B8G8R8_SSCALED] = { MALI_RGB8I, _V }, - [PIPE_FORMAT_R8G8B8A8_SSCALED] = { MALI_RGBA8I, _V }, - [PIPE_FORMAT_B8G8R8A8_SSCALED] = { MALI_RGBA8I, _V }, - [PIPE_FORMAT_A8B8G8R8_SSCALED] = { MALI_RGBA8I, _V }, - - [PIPE_FORMAT_R8_USCALED] = { MALI_R8UI, _V }, - [PIPE_FORMAT_R8G8_USCALED] = { MALI_RG8UI, _V }, - [PIPE_FORMAT_R8G8B8_USCALED] = { MALI_RGB8UI, _V }, - [PIPE_FORMAT_B8G8R8_USCALED] = { MALI_RGB8UI, _V }, - [PIPE_FORMAT_R8G8B8A8_USCALED] = { MALI_RGBA8UI, _V }, - [PIPE_FORMAT_B8G8R8A8_USCALED] = { MALI_RGBA8UI, _V }, - [PIPE_FORMAT_A8B8G8R8_USCALED] = { MALI_RGBA8UI, _V }, - - [PIPE_FORMAT_R16_USCALED] = { MALI_R16UI, _V }, - [PIPE_FORMAT_R16G16_USCALED] = { MALI_RG16UI, _V }, - [PIPE_FORMAT_R16G16B16_USCALED] = { MALI_RGB16UI, _V }, - [PIPE_FORMAT_R16G16B16A16_USCALED] = { MALI_RGBA16UI, _V }, - [PIPE_FORMAT_R16_SSCALED] = { MALI_R16I, _V }, - [PIPE_FORMAT_R16G16_SSCALED] = { MALI_RG16I, _V }, - [PIPE_FORMAT_R16G16B16_SSCALED] = { MALI_RGB16I, _V }, - [PIPE_FORMAT_R16G16B16A16_SSCALED] = { MALI_RGBA16I, _V }, - - [PIPE_FORMAT_R32_USCALED] = { MALI_R32UI, _V }, - [PIPE_FORMAT_R32G32_USCALED] = { MALI_RG32UI, _V }, - [PIPE_FORMAT_R32G32B32_USCALED] = { MALI_RGB32UI, _V }, - [PIPE_FORMAT_R32G32B32A32_USCALED] = { MALI_RGBA32UI, _V }, - [PIPE_FORMAT_R32_SSCALED] = { MALI_R32I, _V }, - [PIPE_FORMAT_R32G32_SSCALED] = { MALI_RG32I, _V }, - [PIPE_FORMAT_R32G32B32_SSCALED] = { MALI_RGB32I, _V }, - [PIPE_FORMAT_R32G32B32A32_SSCALED] = { MALI_RGBA32I, _V }, - - [PIPE_FORMAT_R3G3B2_UNORM] = { MALI_RGB332_UNORM, _VT }, - - [PIPE_FORMAT_Z24_UNORM_S8_UINT] = { MALI_Z24X8_UNORM, _TZ }, - [PIPE_FORMAT_Z24X8_UNORM] = { MALI_Z24X8_UNORM, _TZ }, - [PIPE_FORMAT_Z32_FLOAT] = { MALI_R32F, _TZ }, - [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = { MALI_R32F, _TZ }, - [PIPE_FORMAT_X32_S8X24_UINT] = { MALI_R8UI, _T }, - [PIPE_FORMAT_X24S8_UINT] = { MALI_RGBA8UI, _TZ }, - [PIPE_FORMAT_S8_UINT] = { MALI_R8UI, _T }, - - [PIPE_FORMAT_R32_FIXED] = { MALI_R32_FIXED, _V }, - [PIPE_FORMAT_R32G32_FIXED] = { MALI_RG32_FIXED, _V }, - [PIPE_FORMAT_R32G32B32_FIXED] = { MALI_RGB32_FIXED, _V }, - [PIPE_FORMAT_R32G32B32A32_FIXED] = { MALI_RGBA32_FIXED, _V }, - - [PIPE_FORMAT_R11G11B10_FLOAT] = { MALI_R11F_G11F_B10F, _VTR}, - [PIPE_FORMAT_R9G9B9E5_FLOAT] = { MALI_R9F_G9F_B9F_E5F, _VT }, - - 
[PIPE_FORMAT_R8_SNORM] = { MALI_R8_SNORM, _VT }, - [PIPE_FORMAT_R16_SNORM] = { MALI_R16_SNORM, _VT }, - [PIPE_FORMAT_R32_SNORM] = { MALI_R32_SNORM, _VT }, - [PIPE_FORMAT_R8G8_SNORM] = { MALI_RG8_SNORM, _VT }, - [PIPE_FORMAT_R16G16_SNORM] = { MALI_RG16_SNORM, _VT }, - [PIPE_FORMAT_R32G32_SNORM] = { MALI_RG32_SNORM, _VT }, - [PIPE_FORMAT_R8G8B8_SNORM] = { MALI_RGB8_SNORM, _VT }, - [PIPE_FORMAT_R16G16B16_SNORM] = { MALI_RGB16_SNORM, _VT }, - [PIPE_FORMAT_R32G32B32_SNORM] = { MALI_RGB32_SNORM, _VT }, - [PIPE_FORMAT_R8G8B8A8_SNORM] = { MALI_RGBA8_SNORM, _VT }, - [PIPE_FORMAT_R16G16B16A16_SNORM] = { MALI_RGBA16_SNORM, _VT }, - [PIPE_FORMAT_R32G32B32A32_SNORM] = { MALI_RGBA32_SNORM, _VT }, - - [PIPE_FORMAT_A8_SINT] = { MALI_R8I, _VTR }, - [PIPE_FORMAT_I8_SINT] = { MALI_R8I, _VTR }, - [PIPE_FORMAT_L8_SINT] = { MALI_R8I, _VTR }, - [PIPE_FORMAT_L8A8_SINT] = { MALI_RG8I, _VTR }, - [PIPE_FORMAT_A8_UINT] = { MALI_R8UI, _VTR }, - [PIPE_FORMAT_I8_UINT] = { MALI_R8UI, _VTR }, - [PIPE_FORMAT_L8_UINT] = { MALI_R8UI, _VTR }, - [PIPE_FORMAT_L8A8_UINT] = { MALI_RG8UI, _VTR }, - - [PIPE_FORMAT_A16_SINT] = { MALI_R16I, _VTR }, - [PIPE_FORMAT_I16_SINT] = { MALI_R16I, _VTR }, - [PIPE_FORMAT_L16_SINT] = { MALI_R16I, _VTR }, - [PIPE_FORMAT_L16A16_SINT] = { MALI_RG16I, _VTR }, - [PIPE_FORMAT_A16_UINT] = { MALI_R16UI, _VTR }, - [PIPE_FORMAT_I16_UINT] = { MALI_R16UI, _VTR }, - [PIPE_FORMAT_L16_UINT] = { MALI_R16UI, _VTR }, - [PIPE_FORMAT_L16A16_UINT] = { MALI_RG16UI, _VTR }, - - [PIPE_FORMAT_A32_SINT] = { MALI_R32I, _VTR }, - [PIPE_FORMAT_I32_SINT] = { MALI_R32I, _VTR }, - [PIPE_FORMAT_L32_SINT] = { MALI_R32I, _VTR }, - [PIPE_FORMAT_L32A32_SINT] = { MALI_RG32I, _VTR }, - [PIPE_FORMAT_A32_UINT] = { MALI_R32UI, _VTR }, - [PIPE_FORMAT_I32_UINT] = { MALI_R32UI, _VTR }, - [PIPE_FORMAT_L32_UINT] = { MALI_R32UI, _VTR }, - [PIPE_FORMAT_L32A32_UINT] = { MALI_RG32UI, _VTR }, - - [PIPE_FORMAT_B8G8R8_UINT] = { MALI_RGB8UI, _VTR }, - [PIPE_FORMAT_B8G8R8A8_UINT] = { MALI_RGBA8UI, _VTR }, - [PIPE_FORMAT_B8G8R8_SINT] = { MALI_RGB8I, _VTR }, - [PIPE_FORMAT_B8G8R8A8_SINT] = { MALI_RGBA8I, _VTR }, - [PIPE_FORMAT_A8R8G8B8_UINT] = { MALI_RGBA8UI, _VTR }, - [PIPE_FORMAT_A8B8G8R8_UINT] = { MALI_RGBA8UI, _VTR }, - - [PIPE_FORMAT_R8_UINT] = { MALI_R8UI, _VTR }, - [PIPE_FORMAT_R16_UINT] = { MALI_R16UI, _VTR }, - [PIPE_FORMAT_R32_UINT] = { MALI_R32UI, _VTR }, - [PIPE_FORMAT_R8G8_UINT] = { MALI_RG8UI, _VTR }, - [PIPE_FORMAT_R16G16_UINT] = { MALI_RG16UI, _VTR }, - [PIPE_FORMAT_R32G32_UINT] = { MALI_RG32UI, _VTR }, - [PIPE_FORMAT_R8G8B8_UINT] = { MALI_RGB8UI, _VTR }, - [PIPE_FORMAT_R16G16B16_UINT] = { MALI_RGB16UI, _VTR }, - [PIPE_FORMAT_R32G32B32_UINT] = { MALI_RGB32UI, _VTR }, - [PIPE_FORMAT_R8G8B8A8_UINT] = { MALI_RGBA8UI, _VTR }, - [PIPE_FORMAT_R16G16B16A16_UINT] = { MALI_RGBA16UI, _VTR }, - [PIPE_FORMAT_R32G32B32A32_UINT] = { MALI_RGBA32UI, _VTR }, - - [PIPE_FORMAT_R32_FLOAT] = { MALI_R32F, _VTR }, - [PIPE_FORMAT_R32G32_FLOAT] = { MALI_RG32F, _VTR }, - [PIPE_FORMAT_R32G32B32_FLOAT] = { MALI_RGB32F, _VTR }, - [PIPE_FORMAT_R32G32B32A32_FLOAT] = { MALI_RGBA32F, _VTR }, - - [PIPE_FORMAT_R8_UNORM] = { MALI_R8_UNORM, _VTR }, - [PIPE_FORMAT_R16_UNORM] = { MALI_R16_UNORM, _VTR }, - [PIPE_FORMAT_R32_UNORM] = { MALI_R32_UNORM, _VTR }, - [PIPE_FORMAT_R8G8_UNORM] = { MALI_RG8_UNORM, _VTR }, - [PIPE_FORMAT_R16G16_UNORM] = { MALI_RG16_UNORM, _VTR }, - [PIPE_FORMAT_R32G32_UNORM] = { MALI_RG32_UNORM, _VTR }, - [PIPE_FORMAT_R8G8B8_UNORM] = { MALI_RGB8_UNORM, _VTR }, - [PIPE_FORMAT_R16G16B16_UNORM] = { MALI_RGB16_UNORM, _VTR }, - [PIPE_FORMAT_R32G32B32_UNORM] = 
{ MALI_RGB32_UNORM, _VTR }, - [PIPE_FORMAT_R4G4B4A4_UNORM] = { MALI_RGBA4_UNORM, _VTR }, - [PIPE_FORMAT_R16G16B16A16_UNORM] = { MALI_RGBA16_UNORM, _VTR }, - [PIPE_FORMAT_R32G32B32A32_UNORM] = { MALI_RGBA32_UNORM, _VTR }, - - [PIPE_FORMAT_B8G8R8A8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_B8G8R8X8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_A8R8G8B8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_X8R8G8B8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_A8B8G8R8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_X8B8G8R8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_R8G8B8X8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_R8G8B8A8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, - - [PIPE_FORMAT_R8G8B8X8_SNORM] = { MALI_RGBA8_SNORM, _VT }, - [PIPE_FORMAT_R8G8B8X8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_R8G8B8X8_UINT] = { MALI_RGBA8UI, _VTR }, - [PIPE_FORMAT_R8G8B8X8_SINT] = { MALI_RGBA8I, _VTR }, - - [PIPE_FORMAT_L8_UNORM] = { MALI_R8_UNORM, _VTR }, - [PIPE_FORMAT_A8_UNORM] = { MALI_R8_UNORM, _VTR }, - [PIPE_FORMAT_I8_UNORM] = { MALI_R8_UNORM, _VTR }, - [PIPE_FORMAT_L8A8_UNORM] = { MALI_RG8_UNORM, _VTR }, - [PIPE_FORMAT_L16_UNORM] = { MALI_R16_UNORM, _VTR }, - [PIPE_FORMAT_A16_UNORM] = { MALI_R16_UNORM, _VTR }, - [PIPE_FORMAT_I16_UNORM] = { MALI_R16_UNORM, _VTR }, - [PIPE_FORMAT_L16A16_UNORM] = { MALI_RG16_UNORM, _VTR }, - - [PIPE_FORMAT_L8_SNORM] = { MALI_R8_SNORM, _VT }, - [PIPE_FORMAT_A8_SNORM] = { MALI_R8_SNORM, _VT }, - [PIPE_FORMAT_I8_SNORM] = { MALI_R8_SNORM, _VT }, - [PIPE_FORMAT_L8A8_SNORM] = { MALI_RG8_SNORM, _VT }, - [PIPE_FORMAT_L16_SNORM] = { MALI_R16_SNORM, _VT }, - [PIPE_FORMAT_A16_SNORM] = { MALI_R16_SNORM, _VT }, - [PIPE_FORMAT_I16_SNORM] = { MALI_R16_SNORM, _VT }, - [PIPE_FORMAT_L16A16_SNORM] = { MALI_RG16_SNORM, _VT }, - - [PIPE_FORMAT_L16_FLOAT] = { MALI_R16F, _VTR }, - [PIPE_FORMAT_A16_FLOAT] = { MALI_R16F, _VTR }, - [PIPE_FORMAT_I16_FLOAT] = { MALI_RG16F, _VTR }, - [PIPE_FORMAT_L16A16_FLOAT] = { MALI_RG16F, _VTR }, - - [PIPE_FORMAT_L8_SRGB] = { MALI_R8_UNORM, _VTR }, - [PIPE_FORMAT_R8_SRGB] = { MALI_R8_UNORM, _VTR }, - [PIPE_FORMAT_L8A8_SRGB] = { MALI_RG8_UNORM, _VTR }, - [PIPE_FORMAT_R8G8_SRGB] = { MALI_RG8_UNORM, _VTR }, - [PIPE_FORMAT_R8G8B8_SRGB] = { MALI_RGB8_UNORM, _VTR }, - [PIPE_FORMAT_B8G8R8_SRGB] = { MALI_RGB8_UNORM, _VTR }, - [PIPE_FORMAT_R8G8B8A8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_A8B8G8R8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_X8B8G8R8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_B8G8R8A8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_B8G8R8X8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_A8R8G8B8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, - [PIPE_FORMAT_X8R8G8B8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, - - [PIPE_FORMAT_R8_SINT] = { MALI_R8I, _VTR }, - [PIPE_FORMAT_R16_SINT] = { MALI_R16I, _VTR }, - [PIPE_FORMAT_R32_SINT] = { MALI_R32I, _VTR }, - [PIPE_FORMAT_R16_FLOAT] = { MALI_R16F, _VTR }, - [PIPE_FORMAT_R8G8_SINT] = { MALI_RG8I, _VTR }, - [PIPE_FORMAT_R16G16_SINT] = { MALI_RG16I, _VTR }, - [PIPE_FORMAT_R32G32_SINT] = { MALI_RG32I, _VTR }, - [PIPE_FORMAT_R16G16_FLOAT] = { MALI_RG16F, _VTR }, - [PIPE_FORMAT_R8G8B8_SINT] = { MALI_RGB8I, _VTR }, - [PIPE_FORMAT_R16G16B16_SINT] = { MALI_RGB16I, _VTR }, - [PIPE_FORMAT_R32G32B32_SINT] = { MALI_RGB32I, _VTR }, - [PIPE_FORMAT_R16G16B16_FLOAT] = { MALI_RGB16F, _VTR }, - [PIPE_FORMAT_R8G8B8A8_SINT] = { MALI_RGBA8I, _VTR }, - [PIPE_FORMAT_R16G16B16A16_SINT] = { MALI_RGBA16I, _VTR }, - [PIPE_FORMAT_R32G32B32A32_SINT] = { MALI_RGBA32I, _VTR }, - 
[PIPE_FORMAT_R16G16B16A16_FLOAT] = { MALI_RGBA16F, _VTR }, - - [PIPE_FORMAT_R16G16B16X16_UNORM] = { MALI_RGBA16_UNORM, _VTR }, - [PIPE_FORMAT_R16G16B16X16_SNORM] = { MALI_RGBA16_SNORM, _VT }, - [PIPE_FORMAT_R16G16B16X16_FLOAT] = { MALI_RGBA16F, _VTR }, - [PIPE_FORMAT_R16G16B16X16_UINT] = { MALI_RGBA16UI, _VTR }, - [PIPE_FORMAT_R16G16B16X16_SINT] = { MALI_RGBA16I, _VTR }, - - [PIPE_FORMAT_R32G32B32X32_FLOAT] = { MALI_RGBA32F, _VTR }, - [PIPE_FORMAT_R32G32B32X32_UINT] = { MALI_RGBA32UI, _VTR }, - [PIPE_FORMAT_R32G32B32X32_SINT] = { MALI_RGBA32I, _VTR }, -}; - -#undef _VTR -#undef _VT -#undef _V -#undef _T -#undef _R - -/* Is a format encoded like Z24S8 and therefore compatible for render? */ - -bool -panfrost_is_z24s8_variant(enum pipe_format fmt) -{ - switch (fmt) { - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_Z24X8_UNORM: - return true; - default: - return false; - } -} - -/* Translate a PIPE swizzle quad to a 12-bit Mali swizzle code. PIPE - * swizzles line up with Mali swizzles for the XYZW01, but PIPE swizzles have - * an additional "NONE" field that we have to mask out to zero. Additionally, - * PIPE swizzles are sparse but Mali swizzles are packed */ - -unsigned -panfrost_translate_swizzle_4(const unsigned char swizzle[4]) -{ - unsigned out = 0; - - for (unsigned i = 0; i < 4; ++i) { - unsigned translated = (swizzle[i] > PIPE_SWIZZLE_1) ? PIPE_SWIZZLE_0 : swizzle[i]; - out |= (translated << (3*i)); - } - - return out; -} - -void -panfrost_invert_swizzle(const unsigned char *in, unsigned char *out) -{ - /* First, default to all zeroes to prevent uninitialized junk */ - - for (unsigned c = 0; c < 4; ++c) - out[c] = PIPE_SWIZZLE_0; - - /* Now "do" what the swizzle says */ - - for (unsigned c = 0; c < 4; ++c) { - unsigned char i = in[c]; - - /* Who cares? */ - assert(PIPE_SWIZZLE_X == 0); - if (i > PIPE_SWIZZLE_W) - continue; - - /* Invert */ - unsigned idx = i - PIPE_SWIZZLE_X; - out[idx] = PIPE_SWIZZLE_X + c; - } -} - -enum mali_format -panfrost_format_to_bifrost_blend(const struct util_format_description *desc) -{ - enum mali_format format = panfrost_pipe_format_table[desc->format].hw; - assert(format); - - switch (format) { - case MALI_RGBA4_UNORM: - return MALI_RGBA4; - case MALI_RGBA8_UNORM: - case MALI_RGB8_UNORM: - return MALI_RGBA8_2; - case MALI_RGB10_A2_UNORM: - return MALI_RGB10_A2_2; - default: - return format; - } -} diff --git a/src/panfrost/encoder/pan_invocation.c b/src/panfrost/encoder/pan_invocation.c deleted file mode 100644 index d86b16a2643..00000000000 --- a/src/panfrost/encoder/pan_invocation.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
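
The 12-bit swizzle packing is simple enough to sanity-check standalone. A sketch (PIPE_SWIZZLE_* values assumed to be X=0, Y=1, Z=2, W=3, ZERO=4, ONE=5, NONE=6):

    /* Pack four channel selectors at 3 bits each; NONE collapses to ZERO. */
    static unsigned
    pack_swizzle_4(const unsigned char swizzle[4])
    {
            unsigned out = 0;

            for (unsigned i = 0; i < 4; ++i) {
                    unsigned sel = (swizzle[i] > 5) ? 4 : swizzle[i];
                    out |= sel << (3 * i);
            }

            return out;
    }

    /* The identity swizzle XYZW packs as
     * 0 | (1 << 3) | (2 << 6) | (3 << 9) = 0x688 */
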
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors (Collabora): - * Alyssa Rosenzweig - * - */ - -#include -#include "util/u_math.h" -#include "pan_encoder.h" - -/* Compute shaders are invoked with a gl_NumWorkGroups X/Y/Z triplet. Vertex - * shaders, it turns out, are invoked with the same mechanism, with the triplet - * (1, vertex_count, instance_count). - * - * Alongside this triplet is the gl_WorkGroupSize X/Y/Z triplet. - * - * Unfortunately, the packing for these triplets into the - * mali_vertex_tiler_prefix is a little funky, using a dynamic bitfield. The - * routines here exist to pack this. For example, a (4, 5, 6) grid of - * (8, 8, 1)-sized workgroups packs the values (7, 7, 0, 3, 4, 5) at bit - * offsets (0, 3, 6, 6, 8, 11), giving packed = 0x2cff. */ - -void -panfrost_pack_work_groups_compute( - struct mali_vertex_tiler_prefix *out, - unsigned num_x, - unsigned num_y, - unsigned num_z, - unsigned size_x, - unsigned size_y, - unsigned size_z, - bool quirk_graphics) -{ - uint32_t packed = 0; - - /* The values needing packing, in order, and the corresponding shifts. - * Indices into shift are off-by-one to make the logic easier */ - - unsigned shifts[7] = { 0 }; - - unsigned values[6] = { - MALI_POSITIVE(size_x), - MALI_POSITIVE(size_y), - MALI_POSITIVE(size_z), - MALI_POSITIVE(num_x), - MALI_POSITIVE(num_y), - MALI_POSITIVE(num_z), - }; - - for (unsigned i = 0; i < 6; ++i) { - /* OR it in, shifting as required */ - packed |= (values[i] << shifts[i]); - - /* How many bits did we use? */ - unsigned bit_count = util_logbase2_ceil(values[i] + 1); - - /* Set the next shift accordingly */ - shifts[i + 1] = shifts[i] + bit_count; - } - - /* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift - * = 32. This doesn't appear to matter to the hardware, but it's good - * to be bit-identical. */ - - if (quirk_graphics && (num_z <= 1)) - shifts[5] = 32; - - /* Quirk: for graphics, workgroups_x_shift_2 must be at least 2, - * whereas for OpenCL it is simply equal to workgroups_x_shift. For GL - * compute, it is always 2 if no barriers are in use, but is equal to - * workgroups_x_shift if barriers are in use. 
*/ - - unsigned shift_2 = shifts[3]; - - if (quirk_graphics) - shift_2 = MAX2(shift_2, 2); - - /* Pack them in */ - uint32_t packed_shifts = - (shifts[1] << 0) | - (shifts[2] << 5) | - (shifts[3] << 10) | - (shifts[4] << 16) | - (shifts[5] << 22) | - (shift_2 << 28); - - /* Upload the packed bitfields */ - out->invocation_count = packed; - out->invocation_shifts = packed_shifts; - - /* TODO: Compute workgroups_x_shift_3 */ - out->workgroups_x_shift_3 = shift_2; -} - -/* Packs vertex/tiler descriptors simultaneously */ -void -panfrost_pack_work_groups_fused( - struct mali_vertex_tiler_prefix *vertex, - struct mali_vertex_tiler_prefix *tiler, - unsigned num_x, - unsigned num_y, - unsigned num_z, - unsigned size_x, - unsigned size_y, - unsigned size_z) -{ - panfrost_pack_work_groups_compute(vertex, num_x, num_y, num_z, size_x, size_y, size_z, true); - - /* Copy results over */ - tiler->invocation_count = vertex->invocation_count; - tiler->invocation_shifts = vertex->invocation_shifts; - - /* Set special fields for each */ - vertex->workgroups_x_shift_3 = 5; - tiler->workgroups_x_shift_3 = 6; -} - diff --git a/src/panfrost/encoder/pan_pool.c b/src/panfrost/encoder/pan_pool.c deleted file mode 100644 index 1a08be2aacf..00000000000 --- a/src/panfrost/encoder/pan_pool.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * © Copyright 2018 Alyssa Rosenzweig - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include "util/hash_table.h" -#include "pan_bo.h" -#include "pan_pool.h" - -/* TODO: What does this actually have to be? */ -#define ALIGNMENT 128 - -/* Transient command stream pooling: command stream uploads try to simply copy - * into wherever we left off. 
If there isn't space, we allocate a new entry - * into the pool and copy there */ - -struct pan_pool -panfrost_create_pool(void *memctx, struct panfrost_device *dev) -{ - struct pan_pool pool = { - .dev = dev, - .transient_offset = 0, - .transient_bo = NULL - }; - - pool.bos = _mesa_hash_table_create(memctx, _mesa_hash_pointer, - _mesa_key_pointer_equal); - - - return pool; -} - -struct panfrost_transfer -panfrost_pool_alloc(struct pan_pool *pool, size_t sz) -{ - /* Pad the size */ - sz = ALIGN_POT(sz, ALIGNMENT); - - /* Find or create a suitable BO */ - struct panfrost_bo *bo = NULL; - - unsigned offset = 0; - - bool fits_in_current = (pool->transient_offset + sz) < TRANSIENT_SLAB_SIZE; - - if (likely(pool->transient_bo && fits_in_current)) { - /* We can reuse the current BO, so get it */ - bo = pool->transient_bo; - - /* Use the specified offset */ - offset = pool->transient_offset; - pool->transient_offset = offset + sz; - } else { - size_t bo_sz = sz < TRANSIENT_SLAB_SIZE ? - TRANSIENT_SLAB_SIZE : ALIGN_POT(sz, 4096); - - /* We can't reuse the current BO, but we can create a new one. - * We don't know what the BO will be used for, so let's flag it - * RW and attach it to both the fragment and vertex/tiler jobs. - * TODO: if we want fine grained BO assignment we should pass - * flags to this function and keep the read/write, - * fragment/vertex+tiler pools separate. - */ - bo = panfrost_bo_create(pool->dev, bo_sz, 0); - - uintptr_t flags = PAN_BO_ACCESS_PRIVATE | - PAN_BO_ACCESS_RW | - PAN_BO_ACCESS_VERTEX_TILER | - PAN_BO_ACCESS_FRAGMENT; - - _mesa_hash_table_insert(pool->bos, bo, (void *) flags); - - if (sz < TRANSIENT_SLAB_SIZE) { - pool->transient_bo = bo; - pool->transient_offset = offset + sz; - } - } - - struct panfrost_transfer ret = { - .cpu = bo->cpu + offset, - .gpu = bo->gpu + offset, - }; - - return ret; - -} - -mali_ptr -panfrost_pool_upload(struct pan_pool *pool, const void *data, size_t sz) -{ - struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sz); - memcpy(transfer.cpu, data, sz); - return transfer.gpu; -} diff --git a/src/panfrost/encoder/pan_pool.h b/src/panfrost/encoder/pan_pool.h deleted file mode 100644 index 14593eabd43..00000000000 --- a/src/panfrost/encoder/pan_pool.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * © Copyright 2017-2018 Alyssa Rosenzweig - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
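
A usage sketch for the pool interface above (hypothetical surrounding code; assumes an open struct panfrost_device *dev and omits cleanup):

    struct pan_pool pool = panfrost_create_pool(NULL, dev);

    /* Upload a blob of data and get back its GPU address... */
    uint32_t data[4] = { 0 };
    mali_ptr va = panfrost_pool_upload(&pool, data, sizeof(data));

    /* ...or reserve space and fill it in place; xfer.gpu aliases
     * xfer.cpu from the GPU's point of view. */
    struct panfrost_transfer xfer = panfrost_pool_alloc(&pool, 256);
    memset(xfer.cpu, 0, 256);
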
- * - */ - -#ifndef __PAN_POOL_H__ -#define __PAN_POOL_H__ - -#include -#include - -/* Represents a pool of memory that can only grow, used to allocate objects - * with the same lifetime as the pool itself. In OpenGL, a pool is owned by the - * batch for transient structures. In Vulkan, it may be owned by e.g. the - * command pool */ - -struct pan_pool { - /* Parent device for allocation */ - struct panfrost_device *dev; - - /* panfrost_bo -> access_flags owned by the pool */ - struct hash_table *bos; - - /* Current transient BO */ - struct panfrost_bo *transient_bo; - - /* Within the topmost transient BO, how much has been used? */ - unsigned transient_offset; -}; - -struct pan_pool -panfrost_create_pool(void *memctx, struct panfrost_device *dev); - -/* Represents a fat pointer for GPU-mapped memory, returned from the transient - * allocator and not used for much else */ - -struct panfrost_transfer { - uint8_t *cpu; - mali_ptr gpu; -}; - -struct panfrost_transfer -panfrost_pool_alloc(struct pan_pool *pool, size_t sz); - -mali_ptr -panfrost_pool_upload(struct pan_pool *pool, const void *data, size_t sz); - -#endif diff --git a/src/panfrost/encoder/pan_props.c b/src/panfrost/encoder/pan_props.c deleted file mode 100644 index a4ff28506df..00000000000 --- a/src/panfrost/encoder/pan_props.c +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Authors: - * Alyssa Rosenzweig - */ - -#include - -#include "util/u_math.h" -#include "util/macros.h" -#include "util/hash_table.h" -#include "util/u_thread.h" -#include "drm-uapi/panfrost_drm.h" -#include "pan_encoder.h" -#include "pan_device.h" -#include "panfrost-quirks.h" -#include "pan_bo.h" - -/* Abstraction over the raw drm_panfrost_get_param ioctl for fetching - * information about devices */ - -static __u64 -panfrost_query_raw( - int fd, - enum drm_panfrost_param param, - bool required, - unsigned default_value) -{ - struct drm_panfrost_get_param get_param = {0,}; - ASSERTED int ret; - - get_param.param = param; - ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param); - - if (ret) { - assert(!required); - return default_value; - } - - return get_param.value; -} - -unsigned -panfrost_query_gpu_version(int fd) -{ - return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0); -} - -unsigned -panfrost_query_core_count(int fd) -{ - /* On older kernels, worst-case to 16 cores */ - - unsigned mask = panfrost_query_raw(fd, - DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff); - - return util_bitcount(mask); -} - -unsigned -panfrost_query_thread_tls_alloc(int fd) -{ - /* On older kernels, we worst-case to 256 threads, the architectural - * maximum for Midgard. On my current kernel/hardware, I'm seeing this - * readback as 0, so we'll worst-case there too */ - - unsigned tls = panfrost_query_raw(fd, - DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 256); - - if (tls) - return tls; - else - return 256; -} - -static uint32_t -panfrost_query_compressed_formats(int fd) -{ - /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and - * should exist on any Mali configuration. All hardware should report - * these texture formats but the kernel might not be new enough. 
*/ - - uint32_t default_set = - (1 << MALI_ETC2_RGB8) | - (1 << MALI_ETC2_R11_UNORM) | - (1 << MALI_ETC2_RGBA8) | - (1 << MALI_ETC2_RG11_UNORM) | - (1 << MALI_ETC2_R11_SNORM) | - (1 << MALI_ETC2_RG11_SNORM) | - (1 << MALI_ETC2_RGB8A1) | - (1 << MALI_ASTC_3D_LDR) | - (1 << MALI_ASTC_3D_HDR) | - (1 << MALI_ASTC_2D_LDR) | - (1 << MALI_ASTC_2D_HDR); - - return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, - false, default_set); -} - -/* DRM_PANFROST_PARAM_TEXTURE_FEATURES0 will return a bitmask of supported - * compressed formats, so we offer a helper to test if a format is supported */ - -bool -panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt) -{ - if (MALI_EXTRACT_TYPE(fmt) != MALI_FORMAT_COMPRESSED) - return true; - - unsigned idx = fmt & ~MALI_FORMAT_COMPRESSED; - assert(idx < 32); - - return dev->compressed_formats & (1 << idx); -} - -/* Given a GPU ID like 0x860, return a prettified model name */ - -const char * -panfrost_model_name(unsigned gpu_id) -{ - switch (gpu_id) { - case 0x600: return "Mali T600 (Panfrost)"; - case 0x620: return "Mali T620 (Panfrost)"; - case 0x720: return "Mali T720 (Panfrost)"; - case 0x820: return "Mali T820 (Panfrost)"; - case 0x830: return "Mali T830 (Panfrost)"; - case 0x750: return "Mali T760 (Panfrost)"; - case 0x860: return "Mali T860 (Panfrost)"; - case 0x880: return "Mali T880 (Panfrost)"; - case 0x7093: return "Mali G31 (Panfrost)"; - case 0x7212: return "Mali G52 (Panfrost)"; - default: - unreachable("Invalid GPU ID"); - } -} - -void -panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) -{ - dev->fd = fd; - dev->memctx = memctx; - dev->gpu_id = panfrost_query_gpu_version(fd); - dev->core_count = panfrost_query_core_count(fd); - dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd); - dev->kernel_version = drmGetVersion(fd); - dev->quirks = panfrost_get_quirks(dev->gpu_id); - dev->compressed_formats = panfrost_query_compressed_formats(fd); - - util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512); - - pthread_mutex_init(&dev->bo_cache.lock, NULL); - list_inithead(&dev->bo_cache.lru); - - for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) - list_inithead(&dev->bo_cache.buckets[i]); -} - -void -panfrost_close_device(struct panfrost_device *dev) -{ - panfrost_bo_unreference(dev->blit_shaders.bo); - panfrost_bo_cache_evict_all(dev); - pthread_mutex_destroy(&dev->bo_cache.lock); - drmFreeVersion(dev->kernel_version); - util_sparse_array_finish(&dev->bo_map); - -} diff --git a/src/panfrost/encoder/pan_sampler.c b/src/panfrost/encoder/pan_sampler.c deleted file mode 100644 index 63ddd17b816..00000000000 --- a/src/panfrost/encoder/pan_sampler.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
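
A usage sketch of the helper above, as a caller such as the Gallium screen might use it before advertising a compressed format (hypothetical surrounding code):

    /* ETC2 is in the default set, so this passes even on older kernels
     * that cannot report TEXTURE_FEATURES0. */
    if (panfrost_supports_compressed_format(dev, MALI_ETC2_RGBA8)) {
            /* safe to expose PIPE_FORMAT_ETC2_RGBA8 for sampling */
    }
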
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include "pan_encoder.h" - -/* Sampler comparison functions are flipped in OpenGL from the hardware, so we - * need to be able to flip accordingly */ - -enum mali_func -panfrost_flip_compare_func(enum mali_func f) -{ - switch (f) { - case MALI_FUNC_LESS: - return MALI_FUNC_GREATER; - case MALI_FUNC_GREATER: - return MALI_FUNC_LESS; - case MALI_FUNC_LEQUAL: - return MALI_FUNC_GEQUAL; - case MALI_FUNC_GEQUAL: - return MALI_FUNC_LEQUAL; - default: - return f; - } -} diff --git a/src/panfrost/encoder/pan_scoreboard.c b/src/panfrost/encoder/pan_scoreboard.c deleted file mode 100644 index c72c9a37c3a..00000000000 --- a/src/panfrost/encoder/pan_scoreboard.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include -#include "pan_scoreboard.h" -#include "pan_device.h" -#include "panfrost-quirks.h" - -/* - * There are various types of Mali jobs: - * - * - WRITE_VALUE: generic write primitive, used to zero tiler field - * - VERTEX: runs a vertex shader - * - TILER: runs tiling and sets up a fragment shader - * - FRAGMENT: runs fragment shaders and writes out - * - COMPUTE: runs a compute shader - * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost) - * - GEOMETRY: runs a geometry shader (unimplemented) - * - CACHE_FLUSH: unseen in the wild, theoretically cache flush - * - * In between a full batch and a single Mali job is the "job chain", a series - * of Mali jobs together forming a linked list. Within the job chain, each Mali - * job can set (up to) two dependencies on other earlier jobs in the chain. - * This dependency graph forms a scoreboard. The general idea of a scoreboard - * applies: when there is a data dependency of job B on job A, job B sets one - * of its dependency indices to job A, ensuring that job B won't start until - * job A finishes. 
- * - * More specifically, here is a set of rules: - * - * - A write value job must appear if and only if there is at least one tiler - * job, and tiler jobs must depend on it. - * - * - Vertex jobs and tiler jobs are independent. - * - * - A tiler job must have a dependency on its data source. If it's getting - * data from a vertex job, it depends on the vertex job. If it's getting data - * from software, this is null. - * - * - Tiler jobs must depend on the write value job (chained or otherwise). - * - * - Tiler jobs must be strictly ordered. So each tiler job must depend on the - * previous job in the chain. - * - * - Linking jobs via next_job has no bearing on order of execution; rather, it - * just establishes the linked list of jobs, EXCEPT: - * - * - A job's dependencies must appear earlier in the linked list (job chain). - * - * Justification for each rule: - * - * - Write value jobs are used to write a zero into a magic tiling field, which - * enables tiling to work. If tiling occurs, they are needed; if it does not, - * we cannot emit them since then tiling partially occurs and it's bad. - * - * - The hardware has no notion of a "vertex/tiler job" (at least not our - * hardware -- other revs have fused jobs, but --- crap, this just got even - * more complicated). They are independent units that take in data, process - * it, and spit out data. - * - * - Any job must depend on its data source, in fact, or risk a - * read-before-write hazard. Tiler jobs get their data from vertex jobs, ergo - * tiler jobs depend on the corresponding vertex job (if it's there). - * - * - The tiler is not thread-safe; this dependency prevents race conditions - * between two different jobs trying to write to the tiler outputs at the - * same time. - * - * - Internally, jobs are scoreboarded; the next job fields just form a linked - * list to allow the jobs to be read in; the execution order is from - * resolving the dependency fields instead. - * - * - The hardware cannot set a dependency on a job it doesn't know about yet, - * and dependencies are processed in-order of the next job fields. - * - */ - -/* Generates, uploads, and queues a new job. All fields are written in order - * except for next_job accounting (TODO: Should we be clever and defer the - * upload of the header here until the next job to keep the access pattern - * totally linear? Or is that just a micro-optimization at this point?). - * Returns the generated index for dep management. - * - * Inject is used to inject a job at the front, for wallpapering. If you are - * not wallpapering and set this, dragons will eat you. 
*/ - -unsigned -panfrost_new_job( - struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - enum mali_job_type type, - bool barrier, - unsigned local_dep, - void *payload, size_t payload_size, - bool inject) -{ - unsigned global_dep = 0; - - if (type == JOB_TYPE_TILER) { - /* Tiler jobs must be chained, and on Midgard, the first tiler - * job must depend on the write value job, whose index we - * reserve now */ - - if (scoreboard->tiler_dep) - global_dep = scoreboard->tiler_dep; - else if (!(pool->dev->quirks & IS_BIFROST)) { - scoreboard->write_value_index = ++scoreboard->job_index; - global_dep = scoreboard->write_value_index; - } - } - - /* Assign the index */ - unsigned index = ++scoreboard->job_index; - - struct mali_job_descriptor_header job = { - .job_descriptor_size = 1, - .job_type = type, - .job_barrier = barrier, - .job_index = index, - .job_dependency_index_1 = local_dep, - .job_dependency_index_2 = global_dep, - }; - - if (inject) - job.next_job = scoreboard->first_job; - - struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + payload_size); - memcpy(transfer.cpu, &job, sizeof(job)); - memcpy(transfer.cpu + sizeof(job), payload, payload_size); - - if (inject) { - scoreboard->first_job = transfer.gpu; - return index; - } - - /* Form a chain */ - if (type == JOB_TYPE_TILER) - scoreboard->tiler_dep = index; - - if (scoreboard->prev_job) - scoreboard->prev_job->next_job = transfer.gpu; - else - scoreboard->first_job = transfer.gpu; - - scoreboard->prev_job = (struct mali_job_descriptor_header *) transfer.cpu; - return index; -} - -/* Generates a write value job, used to initialize the tiler structures. Note - * this is called right before frame submission. */ - -void -panfrost_scoreboard_initialize_tiler(struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - mali_ptr polygon_list) -{ - /* Check if we even need tiling */ - if (pool->dev->quirks & IS_BIFROST || !scoreboard->tiler_dep) - return; - - /* Okay, we do. Let's generate it. We'll need the job's polygon list - * regardless of size. */ - - struct mali_job_descriptor_header job = { - .job_type = JOB_TYPE_WRITE_VALUE, - .job_index = scoreboard->write_value_index, - .job_descriptor_size = 1, - .next_job = scoreboard->first_job - }; - - struct mali_payload_write_value payload = { - .address = polygon_list, - .value_descriptor = MALI_WRITE_VALUE_ZERO, - }; - - struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + sizeof(payload)); - memcpy(transfer.cpu, &job, sizeof(job)); - memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload)); - - scoreboard->first_job = transfer.gpu; -} diff --git a/src/panfrost/encoder/pan_scoreboard.h b/src/panfrost/encoder/pan_scoreboard.h deleted file mode 100644 index 71667d4b5de..00000000000 --- a/src/panfrost/encoder/pan_scoreboard.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (C) 2019-2020 Collabora Ltd. 
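
A sketch of how a caller builds a job chain under the rules above (hypothetical payloads and sizes; on Midgard the write value dependency is reserved inside panfrost_new_job):

    /* Vertex job first; it has no dependencies of its own. */
    unsigned vertex = panfrost_new_job(&pool, &scoreboard, JOB_TYPE_VERTEX,
                                       false, 0, v_payload, v_size, false);

    /* The tiler job names the vertex job as its data source; chaining
     * after the previous tiler job and the write value job is handled
     * internally. */
    panfrost_new_job(&pool, &scoreboard, JOB_TYPE_TILER,
                     false, vertex, t_payload, t_size, false);

    /* Right before submission, emit the WRITE_VALUE job that the tiler
     * jobs depend on. */
    panfrost_scoreboard_initialize_tiler(&pool, &scoreboard, polygon_list);
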
- * Copyright (C) 2019 Alyssa Rosenzweig - * Copyright (C) 2014-2017 Broadcom - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#ifndef __PAN_SCOREBOARD_H__ -#define __PAN_SCOREBOARD_H__ - -#include "panfrost-job.h" -#include "pan_pool.h" - -struct pan_scoreboard { - /* The first job in the batch */ - mali_ptr first_job; - - /* The number of jobs in the primary batch, essentially */ - unsigned job_index; - - /* A CPU-side pointer to the previous job for next_job linking */ - struct mali_job_descriptor_header *prev_job; - - /* The dependency for tiler jobs (i.e. the index of the last emitted - * tiler job, or zero if none have been emitted) */ - unsigned tiler_dep; - - /* The job index of the WRITE_VALUE job (before it has been created) */ - unsigned write_value_index; -}; - -unsigned -panfrost_new_job( - struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - enum mali_job_type type, - bool barrier, - unsigned local_dep, - void *payload, size_t payload_size, - bool inject); - -void panfrost_scoreboard_initialize_tiler( - struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - mali_ptr polygon_list); - -#endif diff --git a/src/panfrost/encoder/pan_scratch.c b/src/panfrost/encoder/pan_scratch.c deleted file mode 100644 index 478a788b116..00000000000 --- a/src/panfrost/encoder/pan_scratch.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Alyssa Rosenzweig - */ - -#include "util/u_math.h" -#include "pan_encoder.h" - -/* Midgard has a small register file, so shaders with high register pressure - * need to spill from the register file onto the stack. In addition to - * spilling, it is desirable to allocate temporary arrays on the stack (for - * instance because the register file does not support indirect access but the - * stack does). - * - * The stack is located in "Thread Local Storage", sometimes abbreviated TLS in - * the kernel source code. Thread local storage is allocated per-thread, - * per-core, so threads executing concurrently do not interfere with each - * other's stacks. On modern kernels, we may query - * DRM_PANFROST_PARAM_THREAD_TLS_ALLOC for the number of threads per core we - * must allocate for, and DRM_PANFROST_PARAM_SHADER_PRESENT for a bitmask of - * shader cores (so take a popcount of that mask for the number of shader - * cores). On older kernels that do not support querying these values, - * following kbase, we may use the worst-case value of 256 threads for - * THREAD_TLS_ALLOC, and the worst-case value of 16 cores for Midgard per the - * "shader core count" column of the implementations table in - * https://en.wikipedia.org/wiki/Mali_%28GPU%29 [citation needed] - * - * Within a particular thread, stack may be allocated. If it is present, its - * size is a power-of-two, and it is at least 16 bytes. Stack is allocated - * with the shared memory descriptor used for all shaders within a frame (note - * that they don't execute concurrently so it's fine). So, consider the maximum - * stack size used by any shader within a job, and then compute (where npot - * denotes the next power of two): - * - * bytes/thread = npot(max(size, 16)) - * allocated = (# of bytes/thread) * (# of threads/core) * (# of cores) - * - * The size of Thread Local Storage is signaled to the GPU in a dedicated - * log_stack_size field. Since stack sizes are powers of two, it follows that - * stack_size is logarithmic. Consider some sample values: - * - * stack size | log_stack_size - * --------------------------- - * 256 | 4 - * 512 | 5 - * 1024 | 6 - * - * Noting that log2(256) = 8, we have the relation: - * - * stack_size <= 2^(log_stack_size + 4) - * - * Given the constraints about powers-of-two and the minimum of 16, we thus - * derive a formula for log_stack_size in terms of stack size (s), where s is - * positive: - * - * log_stack_size = ceil(log2(max(s, 16))) - 4 - * - * There are other valid characterisations of this formula, of course, but this - * is computationally simple, so good enough for our purposes. If s=0, since - * there is no spilling used whatsoever, we may set log_stack_size to 0 to - * disable the stack. - */ - -/* Computes log_stack_size = ceil(log2(max(s, 16))) - 4 */ - -unsigned -panfrost_get_stack_shift(unsigned stack_size) -{ - if (stack_size) - return util_logbase2_ceil(MAX2(stack_size, 16)) - 4; - else - return 0; -} - -/* Computes the aligned stack size given the shift and thread count. The blob - * reserves an extra page, and since this is hardware-internal, we do too. 
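
A standalone restatement of the formula, handy for checking the sample values in the table above (a sketch, not the driver's helper, which uses util_logbase2_ceil):

    /* log_stack_size = ceil(log2(max(s, 16))) - 4, with s = 0 disabling
     * the stack entirely */
    static unsigned
    stack_shift(unsigned s)
    {
            if (!s)
                    return 0;

            unsigned shift = 0;
            while ((16u << shift) < (s < 16 ? 16u : s))
                    ++shift;

            return shift;
    }

    /* stack_shift(256) == 4, stack_shift(512) == 5, stack_shift(1024) == 6 */
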
*/ - -unsigned -panfrost_get_total_stack_size( - unsigned stack_shift, - unsigned threads_per_core, - unsigned core_count) -{ - unsigned size_per_thread = MAX2(1 << (stack_shift + 4), 32); - unsigned size = size_per_thread * threads_per_core * core_count; - - return size + 4096; -} diff --git a/src/panfrost/encoder/pan_texture.c b/src/panfrost/encoder/pan_texture.c deleted file mode 100644 index da436ea7318..00000000000 --- a/src/panfrost/encoder/pan_texture.c +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright (C) 2008 VMware, Inc. - * Copyright (C) 2014 Broadcom - * Copyright (C) 2018-2019 Alyssa Rosenzweig - * Copyright (C) 2019-2020 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#include "util/macros.h" -#include "util/u_math.h" -#include "pan_texture.h" - -/* Generates a texture descriptor. Ideally, descriptors are immutable after the - * texture is created, so we can keep these hanging around in GPU memory in a - * dedicated BO and not have to worry. In practice there are some minor gotchas - * with this (the driver sometimes will change the format of a texture on the - * fly for compression) but it's fast enough to just regenerate the descriptor - * in those cases, rather than monkeypatching at drawtime. - * - * A texture descriptor consists of a 32-byte mali_texture_descriptor structure - * followed by a variable number of pointers. Due to this variance and - * potentially large size, we actually upload directly rather than returning - * the descriptor. Whether the user does a copy themselves or not is irrelevant - * to us here. - */ - -/* Check if we need to set a custom stride by computing the "expected" - * stride and comparing it to what the user actually wants. 
Only applies - * to linear textures, since tiled/compressed textures have strict - * alignment requirements for their strides as it is */ - -static bool -panfrost_needs_explicit_stride( - struct panfrost_slice *slices, - uint16_t width, - unsigned first_level, unsigned last_level, - unsigned bytes_per_pixel) -{ - for (unsigned l = first_level; l <= last_level; ++l) { - unsigned actual = slices[l].stride; - unsigned expected = u_minify(width, l) * bytes_per_pixel; - - if (actual != expected) - return true; - } - - return false; -} - -/* Adaptive Scalable Texture Compression (ASTC) corresponds to just a few - * texture types in the hardware, but in fact can be parametrized to have - * various widths and heights for the so-called "stretch factor". It turns out - * these parameters are stuffed in the bottom bits of the payload pointers. - * This function computes these magic stuffing constants based on the ASTC - * format in use. The constant in a given dimension is 3 bits, and two are - * stored side-by-side for each active dimension. - */ - -static unsigned -panfrost_astc_stretch(unsigned dim) -{ - assert(dim >= 4 && dim <= 12); - return MIN2(dim, 11) - 4; -} - -/* Texture addresses are tagged with information about compressed formats. - * AFBC uses a bit for whether the colorspace transform is enabled (RGB and - * RGBA only). - * For ASTC, this is a "stretch factor" encoding the block size. */ - -static unsigned -panfrost_compression_tag( - const struct util_format_description *desc, - enum mali_format format, enum mali_texture_layout layout) -{ - if (layout == MALI_TEXTURE_AFBC) - return desc->nr_channels >= 3; - else if (format == MALI_ASTC_2D_LDR || format == MALI_ASTC_2D_HDR) - return (panfrost_astc_stretch(desc->block.height) << 3) | - panfrost_astc_stretch(desc->block.width); - else - return 0; -} - - -/* Cubemaps have 6 faces as "layers" in between each actual layer. We - * need to fix this up. TODO: logic wrong in the asserted out cases ... - * can they happen, perhaps from cubemap arrays? */ - -static void -panfrost_adjust_cube_dimensions( - unsigned *first_face, unsigned *last_face, - unsigned *first_layer, unsigned *last_layer) -{ - *first_face = *first_layer % 6; - *last_face = *last_layer % 6; - *first_layer /= 6; - *last_layer /= 6; - - assert((*first_layer == *last_layer) || (*first_face == 0 && *last_face == 5)); -} - -/* Following the texture descriptor is a number of pointers. How many? */ - -static unsigned -panfrost_texture_num_elements( - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, - bool is_cube, bool manual_stride) -{ - unsigned first_face = 0, last_face = 0; - - if (is_cube) { - panfrost_adjust_cube_dimensions(&first_face, &last_face, - &first_layer, &last_layer); - } - - unsigned levels = 1 + last_level - first_level; - unsigned layers = 1 + last_layer - first_layer; - unsigned faces = 1 + last_face - first_face; - unsigned num_elements = levels * layers * faces * MAX2(nr_samples, 1); - - if (manual_stride) - num_elements *= 2; - - return num_elements; -} - -/* Conservative a priori estimate of the size of the texture payload. In the - * average case, the estimate equals the actual size; in the worst case, it is - * off by 2x (if - * a manual stride is not needed on a linear texture). 
Returned value - * must be greater than or equal to the actual size, so it's safe to use - * as an allocation amount */ - -unsigned -panfrost_estimate_texture_payload_size( - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, - enum mali_texture_type type, enum mali_texture_layout layout) -{ - /* Assume worst case */ - unsigned manual_stride = (layout == MALI_TEXTURE_LINEAR); - - unsigned elements = panfrost_texture_num_elements( - first_level, last_level, - first_layer, last_layer, - nr_samples, - type == MALI_TEX_CUBE, manual_stride); - - return sizeof(mali_ptr) * elements; -} - -/* Bifrost requires a tile stride for tiled textures. This stride is computed - * as (16 * bpp * width) assuming there is at least one tile (width >= 16). - * Otherwise if height <= 16, the blob puts zero. Interactions with AFBC are - * currently unknown. - */ - -static unsigned -panfrost_nonlinear_stride(enum mali_texture_layout layout, - unsigned bytes_per_pixel, - unsigned width, - unsigned height) -{ - if (layout == MALI_TEXTURE_TILED) { - return (height <= 16) ? 0 : (16 * bytes_per_pixel * ALIGN_POT(width, 16)); - } else { - unreachable("TODO: AFBC on Bifrost"); - } -} - -static void -panfrost_emit_texture_payload( - mali_ptr *payload, - const struct util_format_description *desc, - enum mali_format mali_format, - enum mali_texture_type type, - enum mali_texture_layout layout, - unsigned width, unsigned height, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, - unsigned cube_stride, - bool manual_stride, - mali_ptr base, - struct panfrost_slice *slices) -{ - base |= panfrost_compression_tag(desc, mali_format, layout); - - /* Inject the addresses in, interleaving array indices, mip levels, - * cube faces, and strides in that order */ - - unsigned first_face = 0, last_face = 0, face_mult = 1; - - if (type == MALI_TEX_CUBE) { - face_mult = 6; - panfrost_adjust_cube_dimensions(&first_face, &last_face, &first_layer, &last_layer); - } - - nr_samples = MAX2(nr_samples, 1); - - unsigned idx = 0; - - for (unsigned w = first_layer; w <= last_layer; ++w) { - for (unsigned l = first_level; l <= last_level; ++l) { - for (unsigned f = first_face; f <= last_face; ++f) { - for (unsigned s = 0; s < nr_samples; ++s) { - payload[idx++] = base + panfrost_texture_offset( - slices, type == MALI_TEX_3D, - cube_stride, l, w * face_mult + f, s); - - if (manual_stride) { - payload[idx++] = (layout == MALI_TEXTURE_LINEAR) ? 
- slices[l].stride : - panfrost_nonlinear_stride(layout, - MAX2(desc->block.bits / 8, 1), - u_minify(width, l), - u_minify(height, l)); - } - } - } - } - } -} - -#define MALI_SWIZZLE_R001 \ - (MALI_CHANNEL_RED << 0) | \ - (MALI_CHANNEL_ZERO << 3) | \ - (MALI_CHANNEL_ZERO << 6) | \ - (MALI_CHANNEL_ONE << 9) - -#define MALI_SWIZZLE_A001 \ - (MALI_CHANNEL_ALPHA << 0) | \ - (MALI_CHANNEL_ZERO << 3) | \ - (MALI_CHANNEL_ZERO << 6) | \ - (MALI_CHANNEL_ONE << 9) - - -void -panfrost_new_texture( - void *out, - uint16_t width, uint16_t height, - uint16_t depth, uint16_t array_size, - enum pipe_format format, - enum mali_texture_type type, - enum mali_texture_layout layout, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, - unsigned cube_stride, - unsigned swizzle, - mali_ptr base, - struct panfrost_slice *slices) -{ - const struct util_format_description *desc = - util_format_description(format); - - unsigned bytes_per_pixel = util_format_get_blocksize(format); - - enum mali_format mali_format = panfrost_pipe_format_table[desc->format].hw; - assert(mali_format); - - bool manual_stride = (layout == MALI_TEXTURE_LINEAR) - && panfrost_needs_explicit_stride(slices, width, - first_level, last_level, bytes_per_pixel); - - struct mali_texture_descriptor descriptor = { - .width = MALI_POSITIVE(u_minify(width, first_level)), - .height = MALI_POSITIVE(u_minify(height, first_level)), - .depth = MALI_POSITIVE(u_minify(depth, first_level)), - .array_size = MALI_POSITIVE(array_size), - .format = { - .swizzle = (format == PIPE_FORMAT_X24S8_UINT) ? - MALI_SWIZZLE_A001 : - (format == PIPE_FORMAT_S8_UINT) ? - MALI_SWIZZLE_R001 : - panfrost_translate_swizzle_4(desc->swizzle), - .format = mali_format, - .srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB), - .type = type, - .layout = layout, - .manual_stride = manual_stride, - .unknown2 = 1, - }, - .levels = last_level - first_level, - .swizzle = swizzle - }; - - memcpy(out, &descriptor, sizeof(descriptor)); - - mali_ptr *payload = (mali_ptr *) (out + sizeof(struct mali_texture_descriptor)); - panfrost_emit_texture_payload( - payload, - desc, - mali_format, - type, - layout, - width, height, - first_level, last_level, - first_layer, last_layer, - nr_samples, - cube_stride, - manual_stride, - base, - slices); -} - -void -panfrost_new_texture_bifrost( - struct bifrost_texture_descriptor *descriptor, - uint16_t width, uint16_t height, - uint16_t depth, uint16_t array_size, - enum pipe_format format, - enum mali_texture_type type, - enum mali_texture_layout layout, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, - unsigned cube_stride, - unsigned swizzle, - mali_ptr base, - struct panfrost_slice *slices, - struct panfrost_bo *payload) -{ - const struct util_format_description *desc = - util_format_description(format); - - enum mali_format mali_format = panfrost_pipe_format_table[desc->format].hw; - assert(mali_format); - - panfrost_emit_texture_payload( - (mali_ptr *) payload->cpu, - desc, - mali_format, - type, - layout, - width, height, - first_level, last_level, - first_layer, last_layer, - nr_samples, - cube_stride, - true, /* Stride explicit on Bifrost */ - base, - slices); - - descriptor->format_unk = 0x2; - descriptor->type = type; - descriptor->format = mali_format; - descriptor->srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); - descriptor->format_unk3 = 0x0; - descriptor->width = MALI_POSITIVE(u_minify(width, 
first_level)); - descriptor->height = MALI_POSITIVE(u_minify(height, first_level)); - descriptor->swizzle = swizzle; - descriptor->layout = layout; - descriptor->levels = last_level - first_level; - descriptor->unk1 = 0x0; - descriptor->levels_unk = 0; - descriptor->level_2 = last_level - first_level; - descriptor->payload = payload->gpu; - descriptor->array_size = MALI_POSITIVE(array_size); - descriptor->unk4 = 0x0; - descriptor->depth = MALI_POSITIVE(u_minify(depth, first_level)); - descriptor->unk5 = 0x0; -} - -/* Computes sizes for checksumming, which is 8 bytes per 16x16 tile. - * Checksumming is believed to be a CRC variant (CRC64 based on the size?). - * This feature is also known as "transaction elimination". */ - -#define CHECKSUM_TILE_WIDTH 16 -#define CHECKSUM_TILE_HEIGHT 16 -#define CHECKSUM_BYTES_PER_TILE 8 - -unsigned -panfrost_compute_checksum_size( - struct panfrost_slice *slice, - unsigned width, - unsigned height) -{ - unsigned aligned_width = ALIGN_POT(width, CHECKSUM_TILE_WIDTH); - unsigned aligned_height = ALIGN_POT(height, CHECKSUM_TILE_HEIGHT); - - unsigned tile_count_x = aligned_width / CHECKSUM_TILE_WIDTH; - unsigned tile_count_y = aligned_height / CHECKSUM_TILE_HEIGHT; - - slice->checksum_stride = tile_count_x * CHECKSUM_BYTES_PER_TILE; - - return slice->checksum_stride * tile_count_y; -} - -unsigned -panfrost_get_layer_stride(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level) -{ - return is_3d ? slices[level].size0 : cube_stride; -} - -/* Computes the offset into a texture at a particular level/face. Add to - * the base address of a texture to get the address to that level/face */ - -unsigned -panfrost_texture_offset(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level, unsigned face, unsigned sample) -{ - unsigned layer_stride = panfrost_get_layer_stride(slices, is_3d, cube_stride, level); - return slices[level].offset + (face * layer_stride) + (sample * slices[level].size0); -} diff --git a/src/panfrost/encoder/pan_texture.h b/src/panfrost/encoder/pan_texture.h deleted file mode 100644 index c4a07d15ad2..00000000000 --- a/src/panfrost/encoder/pan_texture.h +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (C) 2008 VMware, Inc. - * Copyright (C) 2014 Broadcom - * Copyright (C) 2018-2019 Alyssa Rosenzweig - * Copyright (C) 2019-2020 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ - -#ifndef __PAN_TEXTURE_H -#define __PAN_TEXTURE_H - -#include -#include "util/format/u_format.h" -#include "compiler/shader_enums.h" -#include "panfrost-job.h" -#include "pan_bo.h" - -struct panfrost_slice { - unsigned offset; - unsigned stride; - unsigned size0; - - /* If there is a header preceding each slice, how big is - * that header? Used for AFBC */ - unsigned header_size; - - /* If checksumming is enabled following the slice, what - * is its offset/stride? */ - unsigned checksum_offset; - unsigned checksum_stride; - struct panfrost_bo *checksum_bo; - - /* Has anything been written to this slice? */ - bool initialized; -}; - -struct pan_image { - /* Format and size */ - uint16_t width0, height0, depth0, array_size; - enum pipe_format format; - enum mali_texture_type type; - unsigned first_level, last_level; - unsigned first_layer, last_layer; - unsigned nr_samples; - struct panfrost_bo *bo; - struct panfrost_slice *slices; - unsigned cubemap_stride; - enum mali_texture_layout layout; -}; - -unsigned -panfrost_compute_checksum_size( - struct panfrost_slice *slice, - unsigned width, - unsigned height); - -/* AFBC */ - -bool -panfrost_format_supports_afbc(enum pipe_format format); - -unsigned -panfrost_afbc_header_size(unsigned width, unsigned height); - -/* mali_texture_descriptor */ - -unsigned -panfrost_estimate_texture_payload_size( - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, - enum mali_texture_type type, enum mali_texture_layout layout); - -void -panfrost_new_texture( - void *out, - uint16_t width, uint16_t height, - uint16_t depth, uint16_t array_size, - enum pipe_format format, - enum mali_texture_type type, - enum mali_texture_layout layout, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, - unsigned cube_stride, - unsigned swizzle, - mali_ptr base, - struct panfrost_slice *slices); - -void -panfrost_new_texture_bifrost( - struct bifrost_texture_descriptor *descriptor, - uint16_t width, uint16_t height, - uint16_t depth, uint16_t array_size, - enum pipe_format format, - enum mali_texture_type type, - enum mali_texture_layout layout, - unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer, - unsigned nr_samples, - unsigned cube_stride, - unsigned swizzle, - mali_ptr base, - struct panfrost_slice *slices, - struct panfrost_bo *payload); - - -unsigned -panfrost_get_layer_stride(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level); - -unsigned -panfrost_texture_offset(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level, unsigned face, unsigned sample); - -/* Formats */ - -struct panfrost_format { - enum mali_format hw; - unsigned bind; -}; - -extern struct panfrost_format panfrost_pipe_format_table[PIPE_FORMAT_COUNT]; - -bool -panfrost_is_z24s8_variant(enum pipe_format fmt); - -unsigned -panfrost_translate_swizzle_4(const unsigned char swizzle[4]); - -void -panfrost_invert_swizzle(const unsigned char *in, unsigned char *out); - -static inline unsigned -panfrost_get_default_swizzle(unsigned components) -{ - switch (components) { - case 1: - return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_ZERO << 3) | - (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9); - case 2: - return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | - (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9); - case 3: - return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | - 
(MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ONE << 9); - case 4: - return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | - (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9); - default: - unreachable("Invalid number of components"); - } -} - -static inline unsigned -panfrost_bifrost_swizzle(unsigned components) -{ - /* Set all components to 0 and force w if needed */ - return components < 4 ? 0x10 : 0x00; -} - -enum mali_format -panfrost_format_to_bifrost_blend(const struct util_format_description *desc); - -struct pan_pool; -struct pan_scoreboard; - -void -panfrost_init_blit_shaders(struct panfrost_device *dev); - -void -panfrost_load_midg( - struct pan_pool *pool, - struct pan_scoreboard *scoreboard, - mali_ptr blend_shader, - mali_ptr fbd, - mali_ptr coordinates, unsigned vertex_count, - struct pan_image *image, - unsigned loc); - -#endif diff --git a/src/panfrost/encoder/pan_tiler.c b/src/panfrost/encoder/pan_tiler.c deleted file mode 100644 index fc42724a1e5..00000000000 --- a/src/panfrost/encoder/pan_tiler.c +++ /dev/null @@ -1,373 +0,0 @@ -/* - * Copyright (C) 2019 Collabora, Ltd. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Alyssa Rosenzweig - */ - -#include "util/u_math.h" -#include "util/macros.h" -#include "pan_encoder.h" - -/* Mali GPUs are tiled-mode renderers, rather than immediate-mode. - * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run. - * Then, a fixed-function hardware block (the tiler) consumes the gl_Position - * results. For each triangle specified, it marks each containing tile as - * containing that triangle. This set of "triangles per tile" form the "polygon - * list". Finally, the rasterization unit consumes the polygon list to invoke - * the fragment shader. - * - * In practice, it's a bit more complicated than this. On Midgard chips with an - * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical - * tile size, but Midgard features "hierarchical tiling", where power-of-two - * multiples of the base tile size can be used: hierarchy level 0 (16x16), - * level 1 (32x32), level 2 (64x64), per public information about Midgard's - * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice - * 128x128 is the largest usually used (though higher modes are enabled). 
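- * (To spell out the mapping the driver relies on below: hierarchy level b
- * covers (16 << b) pixels per side, so level 0 is 16x16, level 3 is 128x128,
- * and level 8 is 4096x4096.)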
The - * idea behind hierarchical tiling is to use low tiling levels for small - * triangles and high levels for large triangles, to minimize memory bandwidth - * and repeated fragment shader invocations (the former issue inherent to - * immediate-mode rendering and the latter common in traditional tilers). - * - * The tiler itself works by reading varyings in and writing a polygon list - * out. Unfortunately (for us), both of these buffers are managed in main - * memory; although they ideally will be cached, it is the drivers' - * responsibility to allocate these buffers. Varying buffer allocation is - * handled elsewhere, as it is not tiler specific; the real issue is allocating - * the polygon list. - * - * This is hard, because from the driver's perspective, we have no information - * about what geometry will actually look like on screen; that information is - * only gained from running the vertex shader. (Theoretically, we could run the - * vertex shaders in software as a prepass, or in hardware with transform - * feedback as a prepass, but either idea is ludicrous on so many levels). - * - * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list - * into three distinct pieces. First, the driver statically determines which - * tile hierarchy levels to use (more on that later). At this point, we know the - * framebuffer dimensions and all the possible tilings of the framebuffer, so - * we know exactly how many tiles exist across all hierarchy levels. The first - * piece of the polygon list is the header, which is exactly 8 bytes per tile, - * plus padding and a small 64-byte prologue. (If that doesn't remind you of - * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is - * the polygon list body, which seems to contain 512 bytes per tile, again - * across every level of the hierarchy. These two parts form the polygon list - * buffer. This buffer has a statically determinable size, approximately equal - * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus - * alignment / minimum restrictions / etc. - * - * The third piece is the easy one (for us): the tiler heap. In essence, the - * tiler heap is a gigantic slab that's as big as could possibly be necessary - * in the worst case imaginable. Just... a gigantic allocation that we give a - * start and end pointer to. What's the catch? The tiler heap is lazily - * allocated; that is, a huge amount of memory is _reserved_, but only a tiny - * bit is actually allocated upfront. The GPU just keeps using the - * unallocated-but-reserved portions as it goes along, generating page faults - * if it goes beyond the allocation, and then the kernel is instructed to - * expand the allocation on page fault (known in the vendor kernel as growable - * memory). This is quite a bit of bookkeeping of its own, but that task is - * pushed to kernel space and we can mostly ignore it here, just remembering to - * set the GROWABLE flag so the kernel actually uses this path rather than - * allocating a gigantic amount up front and burning a hole in RAM. - * - * As far as determining which hierarchy levels to use, the simple answer is - * that right now, we don't. In the tiler configuration fields (consistent from - * the earliest Midgard's SFBD through the latest Bifrost traces we have), - * there is a hierarchy_mask field, controlling which levels (tile sizes) are - * enabled. 
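- * (Worked example with made-up numbers: a 256x256 framebuffer with only the
- * 16x16 level enabled has 16 * 16 = 256 tiles, so the header is roughly
- * 0x40 + 256 * 8 = 2112 bytes and the body is 256 * 512 = 131072 bytes,
- * before the alignment / minimum-size adjustments mentioned above.)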
Ideally, the hierarchical tiling dream -- mapping big polygons to
- * big tiles and small polygons to small tiles -- would be realized here as
- * well. As long as there are polygons at all needing tiling, we always have to
- * have big tiles available, in case there are big polygons. But we don't
- * necessarily need small tiles available. Ideally, when there are small
- * polygons, small tiles are enabled (to avoid waste from putting small
- * triangles in the big tiles); when there are not, small tiles are disabled to
- * avoid enabling more levels than necessary, which potentially costs in memory
- * bandwidth / power / tiler performance.
- *
- * Of course, the driver has to figure this out statically. When tile
- * hierarchies are actually established, this is done by the tiler in
- * fixed-function hardware, after the vertex shaders have run and there is
- * sufficient information to figure out the size of triangles. The driver has
- * no such luxury, again barring insane hacks like additionally running the
- * vertex shaders in software or in hardware via transform feedback. Thus, for
- * the driver, we need a heuristic approach.
- *
- * There are lots of heuristics you could imagine for statically guessing
- * triangle size, but one approach shines as particularly simple-stupid:
- * assume all on-screen triangles are equal size and spread equidistantly
- * throughout the screen. Let's be clear, this is NOT A VALID ASSUMPTION. But
- * if we roll with it, then we see:
- *
- *      Triangle Area   = (Screen Area / # of triangles)
- *                      = (Width * Height) / (# of triangles)
- *
- * Or if you prefer, we can also make a third CRAZY assumption that we only draw
- * right triangles with edges parallel/perpendicular to the sides of the screen
- * with no overdraw, forming a triangle grid across the screen:
- *
- * |--w--|
- *  _____   |
- * | /| /|  |
- * |/_|/_|  h
- * | /| /|  |
- * |/_|/_|  |
- *
- * Then you can use some middle school geometry and algebra to work out the
- * triangle dimensions. I started working on this, but realised I didn't need
- * it to make my point, but couldn't bear to erase that ASCII art. Anyway.
- *
- * POINT IS, by considering the ratio of screen area and triangle count, we can
- * estimate the triangle size. For a small size, use small bins; for a large
- * size, use large bins. Intuitively, this metric makes sense: when there are
- * few triangles on a large screen, you're probably compositing a UI and
- * therefore the triangles are large; when there are a lot of triangles on a
- * small screen, you're probably rendering a 3D mesh and therefore the
- * triangles are tiny. (Or better said -- there will be tiny triangles, even if
- * there are also large triangles. There have to be unless you expect crazy
- * overdraw. Generally, it's better to allow more small bin sizes than
- * necessary than not to allow enough.)
- *
- * From this heuristic (or whatever), we determine the minimum allowable tile
- * size, and we use that to decide the hierarchy masking, selecting from the
- * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice).
- *
- * Once we have that mask and the framebuffer dimensions, we can compute the
- * size of the statically-sized polygon list structures, allocate them, and go!
- *
- * -----
- *
- * On T720, T820, and T830, there is no support for hierarchical tiling.
- * Instead, the hardware allows the driver to select the tile size dynamically
- * on a per-framebuffer basis, including allowing rectangular/non-square tiles.
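- * (To put illustrative numbers on this -- not from the original comment:
- * under the selection rules spelled out next, a 1920x1080 framebuffer needs a
- * tile at least 1920 / 63 = ~30.5 pixels wide, so a 32-wide power-of-two tile
- * is the smallest that keeps the count under 64 tiles across.)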
- * Rules for tile size selection are as follows:
- *
- *  - Dimensions must be powers-of-two.
- *  - The smallest tile is 16x16.
- *  - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix)
- *  - There must be no more than 64 tiles in either dimension.
- *
- * Within these constraints, the driver is free to pick a tile size according
- * to some heuristic, similar to GPUs with an advanced tiling unit.
- *
- * To pick a size without any heuristics, we may satisfy the constraints by
- * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size
- * constraint, consider:
- *
- *    # of tiles < 64
- *    ceil (fb / tile) < 64
- *    (fb / tile) <= (64 - 1)
- *    tile >= fb / (64 - 1)
- *
- * Hence we clamp up to align_pot(fb / (64 - 1)), which satisfies the bound
- * because fb / (64 - 1) <= next_power_of_two(fb / (64 - 1)).
- *
- * Extending this to use a real selection heuristic is left for future work.
- *
- * Once the tile size (w, h) is chosen, we compute the hierarchy "mask":
- *
- *    hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16)
- *
- * Of course with no hierarchical tiling, this is not a mask; it's just a field
- * specifying the tile size. But I digress.
- *
- * We also compute the polygon list sizes (with framebuffer size W, H) as:
- *
- *    full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h)
- *    offset    = 8 * ceil(W / w) * ceil(H / h)
- *
- * It further appears necessary to round down offset to the nearest 0x200.
- * Possibly we would also round down full_size to the nearest 0x200, but
- * full_size / 0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer, so
- * there's nothing to do.
- */
-
-/* Hierarchical tiling spans from 16x16 to 4096x4096 tiles */
-
-#define MIN_TILE_SIZE 16
-#define MAX_TILE_SIZE 4096
-
-/* Constants as shifts for easier power-of-two iteration */
-
-#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
-#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)
-
-/* The hierarchy has a 64-byte prologue */
-#define PROLOGUE_SIZE 0x40
-
-/* For each tile (across all hierarchy levels), there are 8 bytes of header */
-#define HEADER_BYTES_PER_TILE 0x8
-
-/* Likewise, each tile per level has 512 bytes of body */
-#define FULL_BYTES_PER_TILE 0x200
-
-/* If the width-x-height framebuffer is divided into tile_size-x-tile_size
- * tiles, how many tiles are there? Rounding up in each direction. For the
- * special case of tile_size=16, this aligns with the usual Midgard count.
- * tile_size must be a power-of-two. This is not really repeated code from
- * AFBC/checksumming, because those care about the stride (not just the
- * overall count) and only at a fixed tile size (not any of a number of
- * power-of-twos). */
-
-static unsigned
-pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height)
-{
-        unsigned aligned_width = ALIGN_POT(width, tile_width);
-        unsigned aligned_height = ALIGN_POT(height, tile_height);
-
-        unsigned tile_count_x = aligned_width / tile_width;
-        unsigned tile_count_y = aligned_height / tile_height;
-
-        return tile_count_x * tile_count_y;
-}
-
-/* Given the hierarchy mask of enabled levels, computes the size of the
- * polygon list header (or body). We iterate the tile sizes (16x16 through
- * 2048x2048). For each enabled tile size, we figure out how many tiles there
- * are at that hierarchy level and therefore how many bytes that level takes,
- * leaving us with a byte count for each level. We then just sum up the byte
- * counts across the levels to find a byte count for all levels.
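- * (Worked example, with illustrative numbers: at 1920x1080 with only the
- * 16x16 level enabled, pan_tile_count() yields 120 * 68 = 8160 tiles, so the
- * header is 0x40 + 8160 * 8 = 65344 bytes, aligned up to 0x200 -> 65536.)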
*/ - -static unsigned -panfrost_hierarchy_size( - unsigned width, - unsigned height, - unsigned mask, - unsigned bytes_per_tile) -{ - unsigned size = PROLOGUE_SIZE; - - /* Iterate hierarchy levels */ - - for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) { - /* Check if this level is enabled */ - if (!(mask & (1 << b))) - continue; - - /* Shift from a level to a tile size */ - unsigned tile_size = (1 << b) * MIN_TILE_SIZE; - - unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size); - unsigned level_count = bytes_per_tile * tile_count; - - size += level_count; - } - - /* This size will be used as an offset, so ensure it's aligned */ - return ALIGN_POT(size, 0x200); -} - -/* Implement the formula: - * - * 0x200 + bytes_per_tile * ceil(W / w) * ceil(H / h) - * - * rounding down the answer to the nearest 0x200. This is used to compute both - * header and body sizes for GPUs without hierarchical tiling. Essentially, - * computing a single hierarchy level, since there isn't any hierarchy! - */ - -static unsigned -panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile) -{ - /* First, extract the tile dimensions */ - - unsigned tw = (1 << (dim & 0b111)) * 8; - unsigned th = (1 << ((dim & (0b111 << 6)) >> 6)) * 8; - - /* tile_count is ceil(W/w) * ceil(H/h) */ - unsigned raw = pan_tile_count(width, height, tw, th) * bytes_per_tile; - - /* Round down and add offset */ - return 0x200 + ((raw / 0x200) * 0x200); -} - -/* Given a hierarchy mask and a framebuffer size, compute the header size */ - -unsigned -panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) -{ - if (hierarchy) - return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE); - else - return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE); -} - -/* The combined header/body is sized similarly (but it is significantly - * larger), except that it can be empty when the tiler disabled, rather than - * getting clamped to a minimum size. - */ - -unsigned -panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) -{ - if (hierarchy) - return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE); - else - return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE); -} - -/* On GPUs without hierarchical tiling, we choose a tile size directly and - * stuff it into the field otherwise known as hierarchy mask (not a mask). */ - -static unsigned -panfrost_choose_tile_size( - unsigned width, unsigned height, unsigned vertex_count) -{ - /* Figure out the ideal tile size. Eventually a heuristic should be - * used for this */ - - unsigned best_w = 16; - unsigned best_h = 16; - - /* Clamp so there are less than 64 tiles in each direction */ - - best_w = MAX2(best_w, util_next_power_of_two(width / 63)); - best_h = MAX2(best_h, util_next_power_of_two(height / 63)); - - /* We have our ideal tile size, so encode */ - - unsigned exp_w = util_logbase2(best_w / 16); - unsigned exp_h = util_logbase2(best_h / 16); - - return exp_w | (exp_h << 6); -} - -/* In the future, a heuristic to choose a tiler hierarchy mask would go here. - * At the moment, we just default to 0xFF, which enables all possible hierarchy - * levels. 
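- * (With MIN_TILE_SIZE of 16, mask 0xFF sets bits 0 through 7, i.e. every tile
- * size from 16x16 through 2048x2048.)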
Overall this yields good performance but presumably incurs a cost in - * memory bandwidth / power consumption / etc, at least on smaller scenes that - * don't really need all the smaller levels enabled */ - -unsigned -panfrost_choose_hierarchy_mask( - unsigned width, unsigned height, - unsigned vertex_count, bool hierarchy) -{ - /* If there is no geometry, we don't bother enabling anything */ - - if (!vertex_count) - return 0x00; - - if (!hierarchy) - return panfrost_choose_tile_size(width, height, vertex_count); - - /* Otherwise, default everything on. TODO: Proper tests */ - - return 0xFF; -} diff --git a/src/panfrost/encoder/pan_util.h b/src/panfrost/encoder/pan_util.h deleted file mode 100644 index 5f40fc93633..00000000000 --- a/src/panfrost/encoder/pan_util.h +++ /dev/null @@ -1,41 +0,0 @@ -/************************************************************************** - * - * Copyright 2019 Collabora, Ltd. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#ifndef PAN_UTIL_H -#define PAN_UTIL_H - -#define PAN_DBG_MSGS 0x0001 -#define PAN_DBG_TRACE 0x0002 -#define PAN_DBG_DEQP 0x0004 -#define PAN_DBG_AFBC 0x0008 -#define PAN_DBG_SYNC 0x0010 -#define PAN_DBG_PRECOMPILE 0x0020 -#define PAN_DBG_NOFP16 0x0040 -#define PAN_DBG_BIFROST 0x0080 -#define PAN_DBG_GL3 0x0100 - -#endif /* PAN_UTIL_H */ diff --git a/src/panfrost/lib/meson.build b/src/panfrost/lib/meson.build new file mode 100644 index 00000000000..a6b39baf4e6 --- /dev/null +++ b/src/panfrost/lib/meson.build @@ -0,0 +1,48 @@ +# Copyright © 2018 Rob Clark +# Copyright © 2019 Collabora + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +libpanfrost_lib_files = files( + 'pan_encoder.h', + + 'pan_afbc.c', + 'pan_attributes.c', + 'pan_bo.c', + 'pan_blit.c', + 'pan_format.c', + 'pan_invocation.c', + 'pan_sampler.c', + 'pan_tiler.c', + 'pan_texture.c', + 'pan_scoreboard.c', + 'pan_scratch.c', + 'pan_pool.c', + 'pan_props.c', +) + +libpanfrost_lib = static_library( + 'panfrost_lib', + [libpanfrost_lib_files], + include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw], + c_args : [no_override_init_args], + gnu_symbol_visibility : 'hidden', + dependencies: [dep_libdrm, idep_nir], + build_by_default : false, +) diff --git a/src/panfrost/lib/pan_afbc.c b/src/panfrost/lib/pan_afbc.c new file mode 100644 index 00000000000..f1f62baffc9 --- /dev/null +++ b/src/panfrost/lib/pan_afbc.c @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Alyssa Rosenzweig + */ + +#include "pan_texture.h" + +/* Arm FrameBuffer Compression (AFBC) is a lossless compression scheme natively + * implemented in Mali GPUs (as well as many display controllers paired with + * Mali GPUs, etc). Where possible, Panfrost prefers to use AFBC for both + * rendering and texturing. In most cases, this is a performance-win due to a + * dramatic reduction in memory bandwidth and cache locality compared to a + * linear resources. + * + * AFBC divides the framebuffer into 16x16 tiles (other sizes possible, TODO: + * do we need to support this?). So, the width and height each must be aligned + * up to 16 pixels. This is inherently good for performance; note that for a 4 + * byte-per-pixel format like RGBA8888, that means that rows are 16*4=64 byte + * aligned, which is the cache-line size. + * + * For each AFBC-compressed resource, there is a single contiguous + * (CPU/GPU-shared) buffer. This buffer itself is divided into two parts: + * header and body, placed immediately after each other. + * + * The AFBC header contains 16 bytes of metadata per tile. + * + * The AFBC body is the same size as the original linear resource (padded to + * the nearest tile). 
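+ * (Illustrative arithmetic: a 100x50 RGBA8 surface pads up to 112x64, so its
+ * body would be 112 * 64 * 4 = 28672 bytes.)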
Although the body comes immediately after the header, it + * must also be cache-line aligned, so there can sometimes be a bit of padding + * between the header and body. + * + * As an example, a 64x64 RGBA framebuffer contains 64/16 = 4 tiles horizontally and + * 4 tiles vertically. There are 4*4=16 tiles in total, each containing 16 + * bytes of metadata, so there is a 16*16=256 byte header. 64x64 is already + * tile aligned, so the body is 64*64 * 4 bytes per pixel = 16384 bytes of + * body. + * + * From userspace, Panfrost needs to be able to calculate these sizes. It + * explicitly does not and can not know the format of the data contained within + * this header and body. The GPU has native support for AFBC encode/decode. For + * an internal FBO or a framebuffer used for scanout with an AFBC-compatible + * winsys/display-controller, the buffer is maintained AFBC throughout flight, + * and the driver never needs to know the internal data. For edge cases where + * the driver really does need to read/write from the AFBC resource, we + * generate a linear staging buffer and use the GPU to blit AFBC<--->linear. + * TODO: Implement me. */ + +#define AFBC_TILE_WIDTH 16 +#define AFBC_TILE_HEIGHT 16 +#define AFBC_HEADER_BYTES_PER_TILE 16 +#define AFBC_CACHE_ALIGN 64 + +/* Is it possible to AFBC compress a particular format? Common formats (and + * YUV) are compressible. Some obscure formats are not and fallback on linear, + * at a performance hit. Also, if you need to disable AFBC entirely in the + * driver for debug/profiling, just always return false here. */ + +bool +panfrost_format_supports_afbc(enum pipe_format format) +{ + const struct util_format_description *desc = + util_format_description(format); + + /* sRGB cannot be AFBC, but it can be tiled. TODO: Verify. The blob + * does not do AFBC for SRGB8_ALPHA8, but it's not clear why it + * shouldn't be able to. */ + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + return false; + + if (util_format_is_rgba8_variant(desc)) + return true; + + /* Only Z24S8 variants are compressible as Z/S */ + + if (panfrost_is_z24s8_variant(format)) + return true; + + /* Lookup special formats */ + switch (format) { + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_B5G6R5_UNORM: + return true; + default: + return false; + } +} + +unsigned +panfrost_afbc_header_size(unsigned width, unsigned height) +{ + /* Align to tile */ + unsigned aligned_width = ALIGN_POT(width, AFBC_TILE_WIDTH); + unsigned aligned_height = ALIGN_POT(height, AFBC_TILE_HEIGHT); + + /* Compute size in tiles, rather than pixels */ + unsigned tile_count_x = aligned_width / AFBC_TILE_WIDTH; + unsigned tile_count_y = aligned_height / AFBC_TILE_HEIGHT; + unsigned tile_count = tile_count_x * tile_count_y; + + /* Multiply to find the header size */ + unsigned header_bytes = tile_count * AFBC_HEADER_BYTES_PER_TILE; + + /* Align and go */ + return ALIGN_POT(header_bytes, AFBC_CACHE_ALIGN); + +} diff --git a/src/panfrost/lib/pan_attributes.c b/src/panfrost/lib/pan_attributes.c new file mode 100644 index 00000000000..d0d79486185 --- /dev/null +++ b/src/panfrost/lib/pan_attributes.c @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "util/u_math.h" +#include "panfrost-job.h" +#include "pan_encoder.h" + +/* This file handles attribute descriptors (mali_attr_meta). The + * bulk of the complexity is from instancing. See mali_job for + * notes on how this works. But basically, for small vertex + * counts, we have a lookup table, and for large vertex counts, + * we look at the high bits as a heuristic. This has to match + * exactly how the hardware calculates this (which is why the + * algorithm is so weird) or else instancing will break. */ + +/* Given an odd number (of the form 2k + 1), compute k */ +#define ODD(odd) ((odd - 1) >> 1) + +static unsigned +panfrost_small_padded_vertex_count(unsigned idx) +{ + if (idx == 11 || idx == 13 || idx == 15 || idx == 19) + return idx + 1; + else + return idx; +} + +static unsigned +panfrost_large_padded_vertex_count(uint32_t vertex_count) +{ + /* First, we have to find the highest set one */ + unsigned highest = 32 - __builtin_clz(vertex_count); + + /* Using that, we mask out the highest 4-bits */ + unsigned n = highest - 4; + unsigned nibble = (vertex_count >> n) & 0xF; + + /* Great, we have the nibble. Now we can just try possibilities. Note + * that we don't care about the bottom most bit in most cases, and we + * know the top bit must be 1 */ + + unsigned middle_two = (nibble >> 1) & 0x3; + + switch (middle_two) { + case 0b00: + if (!(nibble & 1)) + return (1 << n) * 9; + else + return (1 << (n + 1)) * 5; + case 0b01: + return (1 << (n + 2)) * 3; + case 0b10: + return (1 << (n + 1)) * 7; + case 0b11: + return (1 << (n + 4)); + default: + return 0; /* unreachable */ + } +} + +unsigned +panfrost_padded_vertex_count(unsigned vertex_count) +{ + if (vertex_count < 20) + return panfrost_small_padded_vertex_count(vertex_count); + else + return panfrost_large_padded_vertex_count(vertex_count); +} + +/* The much, much more irritating case -- instancing is enabled. See + * panfrost_job.h for notes on how this works */ + +static unsigned +panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags) +{ + /* We have a NPOT divisor. 
Here's the fun one (multipling by + * the inverse and shifting) */ + + /* floor(log2(d)) */ + unsigned shift = util_logbase2(hw_divisor); + + /* m = ceil(2^(32 + shift) / d) */ + uint64_t shift_hi = 32 + shift; + uint64_t t = 1ll << shift_hi; + double t_f = t; + double hw_divisor_d = hw_divisor; + double m_f = ceil(t_f / hw_divisor_d); + unsigned m = m_f; + + /* Default case */ + uint32_t magic_divisor = m; + + /* e = 2^(shift + 32) % d */ + uint64_t e = t % hw_divisor; + + /* Apply round-down algorithm? e <= 2^shift?. XXX: The blob + * seems to use a different condition */ + if (e <= (1ll << shift)) { + magic_divisor = m - 1; + *extra_flags = 1; + } + + /* Top flag implicitly set */ + assert(magic_divisor & (1u << 31)); + magic_divisor &= ~(1u << 31); + *o_shift = shift; + + return magic_divisor; +} + +unsigned +panfrost_vertex_instanced( + unsigned padded_count, + unsigned instance_shift, unsigned instance_odd, + unsigned divisor, + union mali_attr *attrs) +{ + /* Depending if there is an instance divisor or not, packing varies. + * When there is a divisor, the hardware-level divisor is actually the + * product of the instance divisor and the padded count */ + + unsigned hw_divisor = padded_count * divisor; + + if (divisor == 0) { + /* Per-vertex attributes use the MODULO mode. First, compute + * the modulus */ + + attrs->elements |= MALI_ATTR_MODULO; + attrs->shift = instance_shift; + attrs->extra_flags = instance_odd; + + return 1; + } else if (util_is_power_of_two_or_zero(hw_divisor)) { + /* If there is a divisor but the hardware divisor works out to + * a power of two (not terribly exceptional), we can use an + * easy path (just shifting) */ + + attrs->elements |= MALI_ATTR_POT_DIVIDE; + attrs->shift = __builtin_ctz(hw_divisor); + + return 1; + } else { + unsigned shift = 0, extra_flags = 0; + + attrs[1].magic_divisor = + panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags); + + /* Upload to two different slots */ + + attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE; + attrs[0].shift = shift; + attrs[0].extra_flags = extra_flags; + + attrs[1].unk = 0x20; + attrs[1].zero = 0; + attrs[1].divisor = divisor; + + return 2; + } +} + +/* Records for gl_VertexID and gl_InstanceID use a slightly special encoding, + * but the idea is the same */ + +void +panfrost_vertex_id( + unsigned padded_count, + union mali_attr *attr) +{ + /* We factor the padded count as shift/odd and that's it */ + + attr->elements = MALI_ATTR_VERTEXID; + attr->shift = __builtin_ctz(padded_count); + attr->extra_flags = padded_count >> (attr->shift + 1); + attr->stride = attr->size = 0; +} + +void +panfrost_instance_id( + unsigned padded_count, + union mali_attr *attr) +{ + attr->elements = MALI_ATTR_INSTANCEID; + attr->stride = 0; + attr->extra_flags = 0; + attr->size = 0; + + /* POT records have just a shift directly with an off-by-one for + * unclear reasons. NPOT records have a magic divisor smushed into the + * stride field (which is unused for these special records) */ + + if (util_is_power_of_two_or_zero(padded_count)) { + attr->shift = __builtin_ctz(padded_count) - 1; + } else { + unsigned shift = 0, flags = 0; + + attr->stride = panfrost_compute_magic_divisor(padded_count, &shift, &flags); + attr->shift = shift; + attr->extra_flags = flags; + } +} + diff --git a/src/panfrost/lib/pan_blit.c b/src/panfrost/lib/pan_blit.c new file mode 100644 index 00000000000..ece664bb5b9 --- /dev/null +++ b/src/panfrost/lib/pan_blit.c @@ -0,0 +1,370 @@ +/* + * Copyright (C) 2020 Collabora, Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Alyssa Rosenzweig + */ + +#include +#include +#include "pan_encoder.h" +#include "pan_pool.h" +#include "pan_scoreboard.h" +#include "pan_texture.h" +#include "panfrost-quirks.h" +#include "../midgard/midgard_compile.h" +#include "compiler/nir/nir_builder.h" +#include "util/u_math.h" + +/* On Midgard, the native blit infrastructure (via MFBD preloads) is broken or + * missing in many cases. We instead use software paths as fallbacks to + * implement blits, which are done as TILER jobs. No vertex shader is + * necessary since we can supply screen-space coordinates directly. + * + * This is primarily designed as a fallback for preloads but could be extended + * for other clears/blits if needed in the future. */ + +static void +panfrost_build_blit_shader(panfrost_program *program, unsigned gpu_id, gl_frag_result loc, nir_alu_type T, bool ms) +{ + bool is_colour = loc >= FRAG_RESULT_DATA0; + + nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_FRAGMENT, &midgard_nir_options, NULL); + nir_function *fn = nir_function_create(shader, "main"); + nir_function_impl *impl = nir_function_impl_create(fn); + + nir_variable *c_src = nir_variable_create(shader, nir_var_shader_in, glsl_vector_type(GLSL_TYPE_FLOAT, 2), "coord"); + nir_variable *c_out = nir_variable_create(shader, nir_var_shader_out, glsl_vector_type( + GLSL_TYPE_FLOAT, is_colour ? 4 : 1), "out"); + + c_src->data.location = VARYING_SLOT_TEX0; + c_out->data.location = loc; + + nir_builder _b; + nir_builder *b = &_b; + nir_builder_init(b, impl); + b->cursor = nir_before_block(nir_start_block(impl)); + + nir_ssa_def *coord = nir_load_var(b, c_src); + + nir_tex_instr *tex = nir_tex_instr_create(shader, ms ? 
3 : 1); + + tex->dest_type = T; + + if (ms) { + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(nir_f2i32(b, coord)); + tex->coord_components = 2; + + tex->src[1].src_type = nir_tex_src_ms_index; + tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b)); + + tex->src[2].src_type = nir_tex_src_lod; + tex->src[2].src = nir_src_for_ssa(nir_imm_int(b, 0)); + tex->sampler_dim = GLSL_SAMPLER_DIM_MS; + tex->op = nir_texop_txf_ms; + } else { + tex->op = nir_texop_tex; + + tex->src[0].src_type = nir_tex_src_coord; + tex->src[0].src = nir_src_for_ssa(coord); + tex->coord_components = 2; + + tex->sampler_dim = GLSL_SAMPLER_DIM_2D; + } + + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); + nir_builder_instr_insert(b, &tex->instr); + + if (is_colour) + nir_store_var(b, c_out, &tex->dest.ssa, 0xFF); + else + nir_store_var(b, c_out, nir_channel(b, &tex->dest.ssa, 0), 0xFF); + + midgard_compile_shader_nir(shader, program, false, 0, gpu_id, false, true); +} + +/* Compile and upload all possible blit shaders ahead-of-time to reduce draw + * time overhead. There's only ~30 of them at the moment, so this is fine */ + +void +panfrost_init_blit_shaders(struct panfrost_device *dev) +{ + static const struct { + gl_frag_result loc; + unsigned types; + } shader_descs[] = { + { FRAG_RESULT_DEPTH, 1 << PAN_BLIT_FLOAT }, + { FRAG_RESULT_STENCIL, 1 << PAN_BLIT_UINT }, + { FRAG_RESULT_DATA0, ~0 }, + { FRAG_RESULT_DATA1, ~0 }, + { FRAG_RESULT_DATA2, ~0 }, + { FRAG_RESULT_DATA3, ~0 }, + { FRAG_RESULT_DATA4, ~0 }, + { FRAG_RESULT_DATA5, ~0 }, + { FRAG_RESULT_DATA6, ~0 }, + { FRAG_RESULT_DATA7, ~0 } + }; + + nir_alu_type nir_types[PAN_BLIT_NUM_TYPES] = { + nir_type_float, + nir_type_uint, + nir_type_int + }; + + /* Total size = # of shaders * bytes per shader. There are + * shaders for each RT (so up to DATA7 -- overestimate is + * okay) and up to NUM_TYPES variants of each, * 2 for multisampling + * variants. These shaders are simple enough that they should be less + * than 8 quadwords each (again, overestimate is fine). */ + + unsigned offset = 0; + unsigned total_size = (FRAG_RESULT_DATA7 * PAN_BLIT_NUM_TYPES) + * (8 * 16) * 2; + + dev->blit_shaders.bo = panfrost_bo_create(dev, total_size, PAN_BO_EXECUTE); + + /* Don't bother generating multisampling variants if we don't actually + * support multisampling */ + bool has_ms = !(dev->quirks & MIDGARD_SFBD); + + for (unsigned ms = 0; ms <= has_ms; ++ms) { + for (unsigned i = 0; i < ARRAY_SIZE(shader_descs); ++i) { + unsigned loc = shader_descs[i].loc; + + for (enum pan_blit_type T = 0; T < PAN_BLIT_NUM_TYPES; ++T) { + if (!(shader_descs[i].types & (1 << T))) + continue; + + panfrost_program program; + panfrost_build_blit_shader(&program, dev->gpu_id, loc, + nir_types[T], ms); + + assert(offset + program.compiled.size < total_size); + memcpy(dev->blit_shaders.bo->cpu + offset, program.compiled.data, program.compiled.size); + + dev->blit_shaders.loads[loc][T][ms] = (dev->blit_shaders.bo->gpu + offset) | program.first_tag; + offset += ALIGN_POT(program.compiled.size, 64); + util_dynarray_fini(&program.compiled); + } + } + } +} + +/* Add a shader-based load on Midgard (draw-time for GL). 
Shaders are + * precached */ + +void +panfrost_load_midg( + struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + mali_ptr blend_shader, + mali_ptr fbd, + mali_ptr coordinates, unsigned vertex_count, + struct pan_image *image, + unsigned loc) +{ + unsigned width = u_minify(image->width0, image->first_level); + unsigned height = u_minify(image->height0, image->first_level); + + struct mali_viewport viewport = { + .clip_minx = -INFINITY, + .clip_miny = -INFINITY, + .clip_maxx = INFINITY, + .clip_maxy = INFINITY, + .clip_minz = 0.0, + .clip_maxz = 1.0, + + .viewport0 = { 0, 0 }, + .viewport1 = { MALI_POSITIVE(width), MALI_POSITIVE(height) } + }; + + union mali_attr varying = { + .elements = coordinates | MALI_ATTR_LINEAR, + .stride = 4 * sizeof(float), + .size = 4 * sizeof(float) * vertex_count, + }; + + struct mali_attr_meta varying_meta = { + .index = 0, + .unknown1 = 2, + .swizzle = (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3), + .format = MALI_RGBA32F + }; + + struct mali_stencil_test stencil = { + .mask = 0xFF, + .func = MALI_FUNC_ALWAYS, + .sfail = MALI_STENCIL_REPLACE, + .dpfail = MALI_STENCIL_REPLACE, + .dppass = MALI_STENCIL_REPLACE, + }; + + union midgard_blend replace = { + .equation = { + .rgb_mode = 0x122, + .alpha_mode = 0x122, + .color_mask = MALI_MASK_R | MALI_MASK_G | MALI_MASK_B | MALI_MASK_A, + } + }; + + if (blend_shader) + replace.shader = blend_shader; + + /* Determine the sampler type needed. Stencil is always sampled as + * UINT. Pure (U)INT is always (U)INT. Everything else is FLOAT. */ + + enum pan_blit_type T = + (loc == FRAG_RESULT_STENCIL) ? PAN_BLIT_UINT : + (util_format_is_pure_uint(image->format)) ? PAN_BLIT_UINT : + (util_format_is_pure_sint(image->format)) ? PAN_BLIT_INT : + PAN_BLIT_FLOAT; + + bool ms = image->nr_samples > 1; + + struct mali_shader_meta shader_meta = { + .shader = pool->dev->blit_shaders.loads[loc][T][ms], + .sampler_count = 1, + .texture_count = 1, + .varying_count = 1, + .midgard1 = { + .flags_lo = 0x20, + .work_count = 4, + }, + .coverage_mask = 0xF, + .unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10, + .unknown2_4 = 0x4e0, + .stencil_mask_front = ~0, + .stencil_mask_back = ~0, + .stencil_front = stencil, + .stencil_back = stencil, + .blend = { + .shader = blend_shader + } + }; + + if (ms) + shader_meta.unknown2_3 |= MALI_HAS_MSAA | MALI_PER_SAMPLE; + else + shader_meta.unknown2_4 |= MALI_NO_MSAA; + + assert(shader_meta.shader); + + if (pool->dev->quirks & MIDGARD_SFBD) { + shader_meta.unknown2_4 |= (0x10 | MALI_NO_DITHER); + shader_meta.blend = replace; + + if (loc < FRAG_RESULT_DATA0) + shader_meta.blend.equation.color_mask = 0x0; + } + + if (loc == FRAG_RESULT_DEPTH) { + shader_meta.midgard1.flags_lo |= MALI_WRITES_Z; + shader_meta.unknown2_3 |= MALI_DEPTH_WRITEMASK; + } else if (loc == FRAG_RESULT_STENCIL) { + shader_meta.midgard1.flags_hi |= MALI_WRITES_S; + shader_meta.unknown2_4 |= MALI_STENCIL_TEST; + } else { + shader_meta.midgard1.flags_lo |= MALI_EARLY_Z; + } + + /* Create the texture descriptor. 
We partially compute the base address + * ourselves to account for layer, such that the texture descriptor + * itself is for a 2D texture with array size 1 even for 3D/array + * textures, removing the need to separately key the blit shaders for + * 2D and 3D variants */ + + struct panfrost_transfer texture_t = panfrost_pool_alloc(pool, sizeof(struct mali_texture_descriptor) + sizeof(mali_ptr) * 2 * MAX2(image->nr_samples, 1)); + + panfrost_new_texture(texture_t.cpu, + image->width0, image->height0, + MAX2(image->nr_samples, 1), 1, + image->format, MALI_TEX_2D, + image->layout, + image->first_level, image->last_level, + 0, 0, + image->nr_samples, + 0, + (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9), + image->bo->gpu + image->first_layer * + panfrost_get_layer_stride(image->slices, + image->type == MALI_TEX_3D, + image->cubemap_stride, image->first_level), + image->slices); + + struct mali_sampler_descriptor sampler = { + .filter_mode = MALI_SAMP_MAG_NEAREST | MALI_SAMP_MIN_NEAREST, + .wrap_s = MALI_WRAP_CLAMP_TO_EDGE, + .wrap_t = MALI_WRAP_CLAMP_TO_EDGE, + .wrap_r = MALI_WRAP_CLAMP_TO_EDGE, + }; + + struct panfrost_transfer shader_meta_t = panfrost_pool_alloc(pool, sizeof(shader_meta) + 8 * sizeof(struct midgard_blend_rt)); + memcpy(shader_meta_t.cpu, &shader_meta, sizeof(shader_meta)); + + for (unsigned i = 0; i < 8; ++i) { + void *dest = shader_meta_t.cpu + sizeof(shader_meta) + sizeof(struct midgard_blend_rt) * i; + + if (loc == (FRAG_RESULT_DATA0 + i)) { + struct midgard_blend_rt blend_rt = { + .flags = 0x200 | MALI_BLEND_NO_DITHER, + .blend = replace, + }; + + if (util_format_is_srgb(image->format)) + blend_rt.flags |= MALI_BLEND_SRGB; + + if (blend_shader) { + blend_rt.flags |= MALI_BLEND_MRT_SHADER; + blend_rt.blend.shader = blend_shader; + } + + memcpy(dest, &blend_rt, sizeof(struct midgard_blend_rt)); + } else { + memset(dest, 0x0, sizeof(struct midgard_blend_rt)); + } + } + + struct midgard_payload_vertex_tiler payload = { + .prefix = { + .draw_mode = MALI_TRIANGLES, + .unknown_draw = 0x3000, + .index_count = MALI_POSITIVE(vertex_count) + }, + .postfix = { + .gl_enables = 0x7, + .position_varying = coordinates, + .textures = panfrost_pool_upload(pool, &texture_t.gpu, sizeof(texture_t.gpu)), + .sampler_descriptor = panfrost_pool_upload(pool, &sampler, sizeof(sampler)), + .shader = shader_meta_t.gpu, + .varyings = panfrost_pool_upload(pool, &varying, sizeof(varying)), + .varying_meta = panfrost_pool_upload(pool, &varying_meta, sizeof(varying_meta)), + .viewport = panfrost_pool_upload(pool, &viewport, sizeof(viewport)), + .shared_memory = fbd + } + }; + + panfrost_pack_work_groups_compute(&payload.prefix, 1, vertex_count, 1, 1, 1, 1, true); + payload.prefix.workgroups_x_shift_3 = 6; + + panfrost_new_job(pool, scoreboard, JOB_TYPE_TILER, false, 0, &payload, sizeof(payload), true); +} diff --git a/src/panfrost/lib/pan_bo.c b/src/panfrost/lib/pan_bo.c new file mode 100644 index 00000000000..03a83c4a755 --- /dev/null +++ b/src/panfrost/lib/pan_bo.c @@ -0,0 +1,514 @@ +/* + * Copyright 2019 Collabora, Ltd. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors (Collabora):
+ *   Alyssa Rosenzweig
+ */
+#include
+#include
+#include
+#include
+#include
+#include "drm-uapi/panfrost_drm.h"
+
+#include "pan_bo.h"
+#include "pan_util.h"
+#include "wrap.h"
+
+#include "os/os_mman.h"
+
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+
+/* This file implements a userspace BO cache. Allocating and freeing
+ * GPU-visible buffers is very expensive, and even the extra kernel roundtrips
+ * add more work than we would like at this point. So caching BOs in userspace
+ * solves both of these problems and does not require kernel updates.
+ *
+ * Cached BOs are sorted into a bucket based on rounding their size down to the
+ * nearest power-of-two. Each bucket contains a linked list of free panfrost_bo
+ * objects. Putting a BO into the cache is accomplished by adding it to the
+ * corresponding bucket. Getting a BO from the cache consists of finding the
+ * appropriate bucket and searching it for a suitable free BO. A cache eviction
+ * is a kernel-level free of a BO and its removal from the bucket. We special
+ * case evicting all BOs from the cache, since that's what's helpful in
+ * practice and avoids extra logic around the linked list.
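+ *
+ * As a worked illustration (ours, not from the original patch): with the
+ * 2^12 minimum bucket size, a 300 KiB BO has util_logbase2(300 * 1024) = 18,
+ * so it lands in the 256 KiB bucket, while any BO of 4 MiB or more is
+ * clamped into the topmost 2^22 bucket.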
+ */ + +static struct panfrost_bo * +panfrost_bo_alloc(struct panfrost_device *dev, size_t size, + uint32_t flags) +{ + struct drm_panfrost_create_bo create_bo = { .size = size }; + struct panfrost_bo *bo; + int ret; + + if (dev->kernel_version->version_major > 1 || + dev->kernel_version->version_minor >= 1) { + if (flags & PAN_BO_GROWABLE) + create_bo.flags |= PANFROST_BO_HEAP; + if (!(flags & PAN_BO_EXECUTE)) + create_bo.flags |= PANFROST_BO_NOEXEC; + } + + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo); + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n"); + return NULL; + } + + bo = pan_lookup_bo(dev, create_bo.handle); + assert(!memcmp(bo, &((struct panfrost_bo){}), sizeof(*bo))); + + bo->size = create_bo.size; + bo->gpu = create_bo.offset; + bo->gem_handle = create_bo.handle; + bo->flags = flags; + bo->dev = dev; + return bo; +} + +static void +panfrost_bo_free(struct panfrost_bo *bo) +{ + struct drm_gem_close gem_close = { .handle = bo->gem_handle }; + int ret; + + ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close); + if (ret) { + fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n"); + assert(0); + } + + /* BO will be freed with the sparse array, but zero to indicate free */ + memset(bo, 0, sizeof(*bo)); +} + +/* Returns true if the BO is ready, false otherwise. + * access_type is encoding the type of access one wants to ensure is done. + * Waiting is always done for writers, but if wait_readers is set then readers + * are also waited for. + */ +bool +panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers) +{ + struct drm_panfrost_wait_bo req = { + .handle = bo->gem_handle, + .timeout_ns = timeout_ns, + }; + int ret; + + /* If the BO has been exported or imported we can't rely on the cached + * state, we need to call the WAIT_BO ioctl. + */ + if (!(bo->flags & PAN_BO_SHARED)) { + /* If ->gpu_access is 0, the BO is idle, no need to wait. */ + if (!bo->gpu_access) + return true; + + /* If the caller only wants to wait for writers and no + * writes are pending, we don't have to wait. + */ + if (!wait_readers && !(bo->gpu_access & PAN_BO_ACCESS_WRITE)) + return true; + } + + /* The ioctl returns >= 0 value when the BO we are waiting for is ready + * -1 otherwise. + */ + ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req); + if (ret != -1) { + /* Set gpu_access to 0 so that the next call to bo_wait() + * doesn't have to call the WAIT_BO ioctl. + */ + bo->gpu_access = 0; + return true; + } + + /* If errno is not ETIMEDOUT or EBUSY that means the handle we passed + * is invalid, which shouldn't happen here. 
+ */ + assert(errno == ETIMEDOUT || errno == EBUSY); + return false; +} + +/* Helper to calculate the bucket index of a BO */ + +static unsigned +pan_bucket_index(unsigned size) +{ + /* Round down to POT to compute a bucket index */ + + unsigned bucket_index = util_logbase2(size); + + /* Clamp the bucket index; all huge allocations will be + * sorted into the largest bucket */ + + bucket_index = MIN2(bucket_index, MAX_BO_CACHE_BUCKET); + + /* The minimum bucket size must equal the minimum allocation + * size; the maximum we clamped */ + + assert(bucket_index >= MIN_BO_CACHE_BUCKET); + assert(bucket_index <= MAX_BO_CACHE_BUCKET); + + /* Reindex from 0 */ + return (bucket_index - MIN_BO_CACHE_BUCKET); +} + +static struct list_head * +pan_bucket(struct panfrost_device *dev, unsigned size) +{ + return &dev->bo_cache.buckets[pan_bucket_index(size)]; +} + +/* Tries to fetch a BO of sufficient size with the appropriate flags from the + * BO cache. If it succeeds, it returns that BO and removes the BO from the + * cache. If it fails, it returns NULL signaling the caller to allocate a new + * BO. */ + +static struct panfrost_bo * +panfrost_bo_cache_fetch(struct panfrost_device *dev, + size_t size, uint32_t flags, bool dontwait) +{ + pthread_mutex_lock(&dev->bo_cache.lock); + struct list_head *bucket = pan_bucket(dev, size); + struct panfrost_bo *bo = NULL; + + /* Iterate the bucket looking for something suitable */ + list_for_each_entry_safe(struct panfrost_bo, entry, bucket, + bucket_link) { + if (entry->size < size || entry->flags != flags) + continue; + + if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX, + PAN_BO_ACCESS_RW)) + continue; + + struct drm_panfrost_madvise madv = { + .handle = entry->gem_handle, + .madv = PANFROST_MADV_WILLNEED, + }; + int ret; + + /* This one works, splice it out of the cache */ + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + if (!ret && !madv.retained) { + panfrost_bo_free(entry); + continue; + } + /* Let's go! */ + bo = entry; + break; + } + pthread_mutex_unlock(&dev->bo_cache.lock); + + return bo; +} + +static void +panfrost_bo_cache_evict_stale_bos(struct panfrost_device *dev) +{ + struct timespec time; + + clock_gettime(CLOCK_MONOTONIC, &time); + list_for_each_entry_safe(struct panfrost_bo, entry, + &dev->bo_cache.lru, lru_link) { + /* We want all entries that have been used more than 1 sec + * ago to be dropped, others can be kept. + * Note the <= 2 check and not <= 1. It's here to account for + * the fact that we're only testing ->tv_sec, not ->tv_nsec. + * That means we might keep entries that are between 1 and 2 + * seconds old, but we don't really care, as long as unused BOs + * are dropped at some point. + */ + if (time.tv_sec - entry->last_used <= 2) + break; + + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + panfrost_bo_free(entry); + } +} + +/* Tries to add a BO to the cache. 
Returns if it was + * successful */ + +static bool +panfrost_bo_cache_put(struct panfrost_bo *bo) +{ + struct panfrost_device *dev = bo->dev; + + if (bo->flags & PAN_BO_SHARED) + return false; + + pthread_mutex_lock(&dev->bo_cache.lock); + struct list_head *bucket = pan_bucket(dev, MAX2(bo->size, 4096)); + struct drm_panfrost_madvise madv; + struct timespec time; + + madv.handle = bo->gem_handle; + madv.madv = PANFROST_MADV_DONTNEED; + madv.retained = 0; + + drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv); + + /* Add us to the bucket */ + list_addtail(&bo->bucket_link, bucket); + + /* Add us to the LRU list and update the last_used field. */ + list_addtail(&bo->lru_link, &dev->bo_cache.lru); + clock_gettime(CLOCK_MONOTONIC, &time); + bo->last_used = time.tv_sec; + + /* Let's do some cleanup in the BO cache while we hold the + * lock. + */ + panfrost_bo_cache_evict_stale_bos(dev); + pthread_mutex_unlock(&dev->bo_cache.lock); + + return true; +} + +/* Evicts all BOs from the cache. Called during context + * destroy or during low-memory situations (to free up + * memory that may be unused by us just sitting in our + * cache, but still reserved from the perspective of the + * OS) */ + +void +panfrost_bo_cache_evict_all( + struct panfrost_device *dev) +{ + pthread_mutex_lock(&dev->bo_cache.lock); + for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) { + struct list_head *bucket = &dev->bo_cache.buckets[i]; + + list_for_each_entry_safe(struct panfrost_bo, entry, bucket, + bucket_link) { + list_del(&entry->bucket_link); + list_del(&entry->lru_link); + panfrost_bo_free(entry); + } + } + pthread_mutex_unlock(&dev->bo_cache.lock); +} + +void +panfrost_bo_mmap(struct panfrost_bo *bo) +{ + struct drm_panfrost_mmap_bo mmap_bo = { .handle = bo->gem_handle }; + int ret; + + if (bo->cpu) + return; + + ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo); + if (ret) { + fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n"); + assert(0); + } + + bo->cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED, + bo->dev->fd, mmap_bo.offset); + if (bo->cpu == MAP_FAILED) { + fprintf(stderr, "mmap failed: %p %m\n", bo->cpu); + assert(0); + } +} + +static void +panfrost_bo_munmap(struct panfrost_bo *bo) +{ + if (!bo->cpu) + return; + + if (os_munmap((void *) (uintptr_t)bo->cpu, bo->size)) { + perror("munmap"); + abort(); + } + + bo->cpu = NULL; +} + +struct panfrost_bo * +panfrost_bo_create(struct panfrost_device *dev, size_t size, + uint32_t flags) +{ + struct panfrost_bo *bo; + + /* Kernel will fail (confusingly) with EPERM otherwise */ + assert(size > 0); + + /* To maximize BO cache usage, don't allocate tiny BOs */ + size = MAX2(size, 4096); + + /* GROWABLE BOs cannot be mmapped */ + if (flags & PAN_BO_GROWABLE) + assert(flags & PAN_BO_INVISIBLE); + + /* Before creating a BO, we first want to check the cache but without + * waiting for BO readiness (BOs in the cache can still be referenced + * by jobs that are not finished yet). + * If the cached allocation fails we fall back on fresh BO allocation, + * and if that fails too, we try one more time to allocate from the + * cache, but this time we accept to wait. + */ + bo = panfrost_bo_cache_fetch(dev, size, flags, true); + if (!bo) + bo = panfrost_bo_alloc(dev, size, flags); + if (!bo) + bo = panfrost_bo_cache_fetch(dev, size, flags, false); + + if (!bo) + fprintf(stderr, "BO creation failed\n"); + + assert(bo); + + /* Only mmap now if we know we need to. 
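+         * (Illustration, ours: a buffer created with PAN_BO_DELAY_MMAP skips
+         * this map and is only mapped later, on first use, by a caller
+         * invoking panfrost_bo_mmap() directly.)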
For CPU-invisible buffers, we + * never map since we don't care about their contents; they're purely + * for GPU-internal use. But we do trace them anyway. */ + + if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP))) + panfrost_bo_mmap(bo); + + p_atomic_set(&bo->refcnt, 1); + + if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) { + if (flags & PAN_BO_INVISIBLE) + pandecode_inject_mmap(bo->gpu, NULL, bo->size, NULL); + else if (!(flags & PAN_BO_DELAY_MMAP)) + pandecode_inject_mmap(bo->gpu, bo->cpu, bo->size, NULL); + } + + return bo; +} + +void +panfrost_bo_reference(struct panfrost_bo *bo) +{ + if (bo) { + ASSERTED int count = p_atomic_inc_return(&bo->refcnt); + assert(count != 1); + } +} + +void +panfrost_bo_unreference(struct panfrost_bo *bo) +{ + if (!bo) + return; + + /* Don't return to cache if there are still references */ + if (p_atomic_dec_return(&bo->refcnt)) + return; + + struct panfrost_device *dev = bo->dev; + + pthread_mutex_lock(&dev->bo_map_lock); + + /* Someone might have imported this BO while we were waiting for the + * lock, let's make sure it's still not referenced before freeing it. + */ + if (p_atomic_read(&bo->refcnt) == 0) { + /* When the reference count goes to zero, we need to cleanup */ + panfrost_bo_munmap(bo); + + /* Rather than freeing the BO now, we'll cache the BO for later + * allocations if we're allowed to. + */ + if (!panfrost_bo_cache_put(bo)) + panfrost_bo_free(bo); + + } + pthread_mutex_unlock(&dev->bo_map_lock); +} + +struct panfrost_bo * +panfrost_bo_import(struct panfrost_device *dev, int fd) +{ + struct panfrost_bo *bo; + struct drm_panfrost_get_bo_offset get_bo_offset = {0,}; + ASSERTED int ret; + unsigned gem_handle; + + ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle); + assert(!ret); + + pthread_mutex_lock(&dev->bo_map_lock); + bo = pan_lookup_bo(dev, gem_handle); + + if (!bo->dev) { + get_bo_offset.handle = gem_handle; + ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset); + assert(!ret); + + bo->dev = dev; + bo->gpu = (mali_ptr) get_bo_offset.offset; + bo->size = lseek(fd, 0, SEEK_END); + bo->flags = PAN_BO_SHARED; + bo->gem_handle = gem_handle; + assert(bo->size > 0); + p_atomic_set(&bo->refcnt, 1); + // TODO map and unmap on demand? + panfrost_bo_mmap(bo); + } else { + /* bo->refcnt == 0 can happen if the BO + * was being released but panfrost_bo_import() acquired the + * lock before panfrost_bo_unreference(). In that case, refcnt + * is 0 and we can't use panfrost_bo_reference() directly, we + * have to re-initialize the refcnt(). + * Note that panfrost_bo_unreference() checks + * refcnt value just after acquiring the lock to + * make sure the object is not freed if panfrost_bo_import() + * acquired it in the meantime. + */ + if (p_atomic_read(&bo->refcnt) == 0) + p_atomic_set(&bo->refcnt, 1); + else + panfrost_bo_reference(bo); + assert(bo->cpu); + } + pthread_mutex_unlock(&dev->bo_map_lock); + + return bo; +} + +int +panfrost_bo_export(struct panfrost_bo *bo) +{ + struct drm_prime_handle args = { + .handle = bo->gem_handle, + .flags = DRM_CLOEXEC, + }; + + int ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args); + if (ret == -1) + return -1; + + bo->flags |= PAN_BO_SHARED; + return args.fd; +} + diff --git a/src/panfrost/lib/pan_bo.h b/src/panfrost/lib/pan_bo.h new file mode 100644 index 00000000000..360b102de34 --- /dev/null +++ b/src/panfrost/lib/pan_bo.h @@ -0,0 +1,126 @@ +/* + * © Copyright 2019 Alyssa Rosenzweig + * © Copyright 2019 Collabora, Ltd. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __PAN_BO_H__ +#define __PAN_BO_H__ + +#include "util/list.h" +#include "pan_device.h" +#include + +/* Flags for allocated memory */ + +/* This memory region is executable */ +#define PAN_BO_EXECUTE (1 << 0) + +/* This memory region should be lazily allocated and grow-on-page-fault. Must + * be used in conjunction with INVISIBLE */ +#define PAN_BO_GROWABLE (1 << 1) + +/* This memory region should not be mapped to the CPU */ +#define PAN_BO_INVISIBLE (1 << 2) + +/* This region may not be used immediately and will not mmap on allocate + * (semantically distinct from INVISIBLE, which cannot never be mmaped) */ +#define PAN_BO_DELAY_MMAP (1 << 3) + +/* BO is shared across processes (imported or exported) and therefore cannot be + * cached locally */ +#define PAN_BO_SHARED (1 << 4) + +/* GPU access flags */ + +/* BO is either shared (can be accessed by more than one GPU batch) or private + * (reserved by a specific GPU job). */ +#define PAN_BO_ACCESS_PRIVATE (0 << 0) +#define PAN_BO_ACCESS_SHARED (1 << 0) + +/* BO is being read/written by the GPU */ +#define PAN_BO_ACCESS_READ (1 << 1) +#define PAN_BO_ACCESS_WRITE (1 << 2) +#define PAN_BO_ACCESS_RW (PAN_BO_ACCESS_READ | PAN_BO_ACCESS_WRITE) + +/* BO is accessed by the vertex/tiler job. */ +#define PAN_BO_ACCESS_VERTEX_TILER (1 << 3) + +/* BO is accessed by the fragment job. */ +#define PAN_BO_ACCESS_FRAGMENT (1 << 4) + +struct panfrost_bo { + /* Must be first for casting */ + struct list_head bucket_link; + + /* Used to link the BO to the BO cache LRU list. */ + struct list_head lru_link; + + /* Store the time this BO was use last, so the BO cache logic can evict + * stale BOs. + */ + time_t last_used; + + /* Atomic reference count */ + int32_t refcnt; + + struct panfrost_device *dev; + + /* Mapping for the entire object (all levels) */ + uint8_t *cpu; + + /* GPU address for the object */ + mali_ptr gpu; + + /* Size of all entire trees */ + size_t size; + + int gem_handle; + + uint32_t flags; + + /* Combination of PAN_BO_ACCESS_{READ,WRITE} flags encoding pending + * GPU accesses to this BO. Useful to avoid calling the WAIT_BO ioctl + * when the BO is idle. 
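+         *
+         * (Annotation ours, not part of the original patch: we'd expect the
+         * submit path to OR the batch's PAN_BO_ACCESS_* bits in here, and
+         * panfrost_bo_wait() resets it to 0 once the WAIT_BO ioctl reports
+         * the BO idle.)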
+ */ + uint32_t gpu_access; +}; + +bool +panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers); +void +panfrost_bo_reference(struct panfrost_bo *bo); +void +panfrost_bo_unreference(struct panfrost_bo *bo); +struct panfrost_bo * +panfrost_bo_create(struct panfrost_device *dev, size_t size, + uint32_t flags); +void +panfrost_bo_mmap(struct panfrost_bo *bo); +struct panfrost_bo * +panfrost_bo_import(struct panfrost_device *dev, int fd); +int +panfrost_bo_export(struct panfrost_bo *bo); +void +panfrost_bo_cache_evict_all(struct panfrost_device *dev); + +#endif /* __PAN_BO_H__ */ diff --git a/src/panfrost/lib/pan_device.h b/src/panfrost/lib/pan_device.h new file mode 100644 index 00000000000..b84c8e7cdae --- /dev/null +++ b/src/panfrost/lib/pan_device.h @@ -0,0 +1,147 @@ +/************************************************************************** + * + * Copyright 2018-2019 Alyssa Rosenzweig + * Copyright 2018-2019 Collabora, Ltd. + * Copyright © 2015 Intel Corporation + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef PAN_DEVICE_H +#define PAN_DEVICE_H + +#include +#include "renderonly/renderonly.h" +#include "util/u_dynarray.h" +#include "util/bitset.h" +#include "util/list.h" +#include "util/sparse_array.h" + +#include + +/* Driver limits */ +#define PAN_MAX_CONST_BUFFERS 16 + +/* Transient slab size. This is a balance between fragmentation against cache + * locality and ease of bookkeeping */ + +#define TRANSIENT_SLAB_PAGES (32) /* 128kb */ +#define TRANSIENT_SLAB_SIZE (4096 * TRANSIENT_SLAB_PAGES) + +/* Maximum number of transient slabs so we don't need dynamic arrays. Most + * interesting Mali boards are 4GB RAM max, so if the entire RAM was filled + * with transient slabs, you could never exceed (4GB / TRANSIENT_SLAB_SIZE) + * allocations anyway. By capping, we can use a fixed-size bitset for tracking + * free slabs, eliminating quite a bit of complexity. We can pack the free + * state of 8 slabs into a single byte, so for 128kb transient slabs the bitset + * occupies a cheap 4kb of memory */ + +#define MAX_TRANSIENT_SLABS (1024*1024 / TRANSIENT_SLAB_PAGES) + +/* How many power-of-two levels in the BO cache do we want? 
2^12 + * minimum chosen as it is the page size that all allocations are + * rounded to */ + +#define MIN_BO_CACHE_BUCKET (12) /* 2^12 = 4KB */ +#define MAX_BO_CACHE_BUCKET (22) /* 2^22 = 4MB */ + +/* Fencepost problem, hence the off-by-one */ +#define NR_BO_CACHE_BUCKETS (MAX_BO_CACHE_BUCKET - MIN_BO_CACHE_BUCKET + 1) + +/* Cache for blit shaders. Defined here so they can be cached with the device */ + +enum pan_blit_type { + PAN_BLIT_FLOAT = 0, + PAN_BLIT_UINT, + PAN_BLIT_INT, + PAN_BLIT_NUM_TYPES, +}; + +#define PAN_BLIT_NUM_TARGETS (12) + +struct pan_blit_shaders { + struct panfrost_bo *bo; + mali_ptr loads[PAN_BLIT_NUM_TARGETS][PAN_BLIT_NUM_TYPES][2]; +}; + +struct panfrost_device { + /* For ralloc */ + void *memctx; + + int fd; + + /* Properties of the GPU in use */ + unsigned gpu_id; + unsigned core_count; + unsigned thread_tls_alloc; + unsigned quirks; + + /* Bitmask of supported compressed texture formats */ + uint32_t compressed_formats; + + /* debug flags, see pan_util.h how to interpret */ + unsigned debug; + + drmVersionPtr kernel_version; + + struct renderonly *ro; + + pthread_mutex_t bo_map_lock; + struct util_sparse_array bo_map; + + struct { + pthread_mutex_t lock; + + /* List containing all cached BOs sorted in LRU (Least + * Recently Used) order. This allows us to quickly evict BOs + * that are more than 1 second old. + */ + struct list_head lru; + + /* The BO cache is a set of buckets with power-of-two sizes + * ranging from 2^12 (4096, the page size) to + * 2^(12 + MAX_BO_CACHE_BUCKETS). + * Each bucket is a linked list of free panfrost_bo objects. */ + + struct list_head buckets[NR_BO_CACHE_BUCKETS]; + } bo_cache; + + struct pan_blit_shaders blit_shaders; +}; + +void +panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev); + +void +panfrost_close_device(struct panfrost_device *dev); + +bool +panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt); + +static inline struct panfrost_bo * +pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle) +{ + return util_sparse_array_get(&dev->bo_map, gem_handle); +} + +#endif diff --git a/src/panfrost/lib/pan_encoder.h b/src/panfrost/lib/pan_encoder.h new file mode 100644 index 00000000000..42ec8320acb --- /dev/null +++ b/src/panfrost/lib/pan_encoder.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors (Collabora): + * Alyssa Rosenzweig + */ + +#ifndef __PAN_ENCODER_H +#define __PAN_ENCODER_H + +#include +#include "panfrost-job.h" + +/* Invocation packing */ + +void +panfrost_pack_work_groups_compute( + struct mali_vertex_tiler_prefix *out, + unsigned num_x, + unsigned num_y, + unsigned num_z, + unsigned size_x, + unsigned size_y, + unsigned size_z, + bool quirk_graphics); + +void +panfrost_pack_work_groups_fused( + struct mali_vertex_tiler_prefix *vertex, + struct mali_vertex_tiler_prefix *tiler, + unsigned num_x, + unsigned num_y, + unsigned num_z, + unsigned size_x, + unsigned size_y, + unsigned size_z); + +/* Tiler structure size computation */ + +unsigned +panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); + +unsigned +panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy); + +unsigned +panfrost_choose_hierarchy_mask( + unsigned width, unsigned height, + unsigned vertex_count, bool hierarchy); + +/* Stack sizes */ + +unsigned +panfrost_get_stack_shift(unsigned stack_size); + +unsigned +panfrost_get_total_stack_size( + unsigned stack_shift, + unsigned threads_per_core, + unsigned core_count); + +/* Property queries */ + + +unsigned panfrost_query_gpu_version(int fd); +unsigned panfrost_query_core_count(int fd); +unsigned panfrost_query_thread_tls_alloc(int fd); + +const char * panfrost_model_name(unsigned gpu_id); + +/* Attributes / instancing */ + +unsigned +panfrost_padded_vertex_count(unsigned vertex_count); + +unsigned +panfrost_vertex_instanced( + unsigned padded_count, + unsigned instance_shift, unsigned instance_odd, + unsigned divisor, + union mali_attr *attrs); + +void panfrost_vertex_id(unsigned padded_count, union mali_attr *attr); +void panfrost_instance_id(unsigned padded_count, union mali_attr *attr); + +/* Samplers */ + +enum mali_func +panfrost_flip_compare_func(enum mali_func f); + + + +#endif diff --git a/src/panfrost/lib/pan_format.c b/src/panfrost/lib/pan_format.c new file mode 100644 index 00000000000..76006164a36 --- /dev/null +++ b/src/panfrost/lib/pan_format.c @@ -0,0 +1,410 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: + * Alyssa Rosenzweig + */ + +#include +#include "panfrost-job.h" +#include "pan_texture.h" + +/* Convenience */ + +#define _V PIPE_BIND_VERTEX_BUFFER +#define _T PIPE_BIND_SAMPLER_VIEW +#define _R PIPE_BIND_RENDER_TARGET +#define _Z PIPE_BIND_DEPTH_STENCIL +#define _VT (_V | _T) +#define _VTR (_V | _T | _R) +#define _TZ (_T | _Z) + +struct panfrost_format panfrost_pipe_format_table[PIPE_FORMAT_COUNT] = { + [PIPE_FORMAT_ETC1_RGB8] = { MALI_ETC2_RGB8, _T }, + [PIPE_FORMAT_ETC2_RGB8] = { MALI_ETC2_RGB8, _T }, + [PIPE_FORMAT_ETC2_SRGB8] = { MALI_ETC2_RGB8, _T }, + [PIPE_FORMAT_ETC2_R11_UNORM] = { MALI_ETC2_R11_UNORM, _T }, + [PIPE_FORMAT_ETC2_RGBA8] = { MALI_ETC2_RGBA8, _T }, + [PIPE_FORMAT_ETC2_SRGBA8] = { MALI_ETC2_RGBA8, _T }, + [PIPE_FORMAT_ETC2_RG11_UNORM] = { MALI_ETC2_RG11_UNORM, _T }, + [PIPE_FORMAT_ETC2_R11_SNORM] = { MALI_ETC2_R11_SNORM, _T }, + [PIPE_FORMAT_ETC2_RG11_SNORM] = { MALI_ETC2_RG11_SNORM, _T }, + [PIPE_FORMAT_ETC2_RGB8A1] = { MALI_ETC2_RGB8A1, _T }, + [PIPE_FORMAT_ETC2_SRGB8A1] = { MALI_ETC2_RGB8A1, _T }, + + [PIPE_FORMAT_DXT1_RGB] = { MALI_BC1_UNORM, _T }, + [PIPE_FORMAT_DXT1_RGBA] = { MALI_BC1_UNORM, _T }, + [PIPE_FORMAT_DXT1_SRGB] = { MALI_BC1_UNORM, _T }, + [PIPE_FORMAT_DXT1_SRGBA] = { MALI_BC1_UNORM, _T }, + [PIPE_FORMAT_DXT3_RGBA] = { MALI_BC2_UNORM, _T }, + [PIPE_FORMAT_DXT3_SRGBA] = { MALI_BC2_UNORM, _T }, + [PIPE_FORMAT_DXT5_RGBA] = { MALI_BC3_UNORM, _T }, + [PIPE_FORMAT_DXT5_SRGBA] = { MALI_BC3_UNORM, _T }, + + [PIPE_FORMAT_RGTC1_UNORM] = { MALI_BC4_UNORM, _T }, + [PIPE_FORMAT_RGTC1_SNORM] = { MALI_BC4_SNORM, _T }, + [PIPE_FORMAT_RGTC2_UNORM] = { MALI_BC5_UNORM, _T }, + [PIPE_FORMAT_RGTC2_SNORM] = { MALI_BC5_SNORM, _T }, + + [PIPE_FORMAT_BPTC_RGB_FLOAT] = { MALI_BC6H_SF16, _T }, + [PIPE_FORMAT_BPTC_RGB_UFLOAT] = { MALI_BC6H_UF16, _T }, + [PIPE_FORMAT_BPTC_RGBA_UNORM] = { MALI_BC7_UNORM, _T }, + [PIPE_FORMAT_BPTC_SRGBA] = { MALI_BC7_UNORM, _T }, + + [PIPE_FORMAT_ASTC_4x4] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_5x4] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_5x5] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_6x5] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_6x6] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_8x5] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_8x6] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_8x8] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_10x5] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_10x6] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_10x8] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_10x10] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_12x10] = { MALI_ASTC_2D_HDR, _T }, + [PIPE_FORMAT_ASTC_12x12] = { MALI_ASTC_2D_HDR, _T }, + + [PIPE_FORMAT_ASTC_4x4_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_5x4_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_5x5_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_6x5_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_6x6_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_8x5_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_8x6_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_8x8_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_10x5_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_10x6_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_10x8_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_10x10_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_12x10_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_ASTC_12x12_SRGB] = { MALI_ASTC_2D_LDR, _T }, + [PIPE_FORMAT_B5G6R5_UNORM] = { MALI_RGB565, _VTR }, + 
[PIPE_FORMAT_B5G5R5X1_UNORM] = { MALI_RGB5_X1_UNORM, _VT }, + [PIPE_FORMAT_R5G5B5A1_UNORM] = { MALI_RGB5_A1_UNORM, _VTR }, + + [PIPE_FORMAT_R10G10B10X2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR }, + [PIPE_FORMAT_B10G10R10X2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR }, + [PIPE_FORMAT_R10G10B10A2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR }, + [PIPE_FORMAT_B10G10R10A2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR }, + [PIPE_FORMAT_R10G10B10X2_SNORM] = { MALI_RGB10_A2_SNORM, _VT }, + [PIPE_FORMAT_R10G10B10A2_SNORM] = { MALI_RGB10_A2_SNORM, _VT }, + [PIPE_FORMAT_B10G10R10A2_SNORM] = { MALI_RGB10_A2_SNORM, _VT }, + [PIPE_FORMAT_R10G10B10A2_UINT] = { MALI_RGB10_A2UI, _VTR }, + [PIPE_FORMAT_B10G10R10A2_UINT] = { MALI_RGB10_A2UI, _VTR }, + [PIPE_FORMAT_R10G10B10A2_USCALED] = { MALI_RGB10_A2UI, _VTR }, + [PIPE_FORMAT_B10G10R10A2_USCALED] = { MALI_RGB10_A2UI, _VTR }, + [PIPE_FORMAT_R10G10B10A2_SINT] = { MALI_RGB10_A2I, _VTR}, + [PIPE_FORMAT_B10G10R10A2_SINT] = { MALI_RGB10_A2I, _VTR }, + [PIPE_FORMAT_R10G10B10A2_SSCALED] = { MALI_RGB10_A2I, _VTR }, + [PIPE_FORMAT_B10G10R10A2_SSCALED] = { MALI_RGB10_A2I, _VTR }, + + [PIPE_FORMAT_R8_SSCALED] = { MALI_R8I, _V }, + [PIPE_FORMAT_R8G8_SSCALED] = { MALI_RG8I, _V }, + [PIPE_FORMAT_R8G8B8_SSCALED] = { MALI_RGB8I, _V }, + [PIPE_FORMAT_B8G8R8_SSCALED] = { MALI_RGB8I, _V }, + [PIPE_FORMAT_R8G8B8A8_SSCALED] = { MALI_RGBA8I, _V }, + [PIPE_FORMAT_B8G8R8A8_SSCALED] = { MALI_RGBA8I, _V }, + [PIPE_FORMAT_A8B8G8R8_SSCALED] = { MALI_RGBA8I, _V }, + + [PIPE_FORMAT_R8_USCALED] = { MALI_R8UI, _V }, + [PIPE_FORMAT_R8G8_USCALED] = { MALI_RG8UI, _V }, + [PIPE_FORMAT_R8G8B8_USCALED] = { MALI_RGB8UI, _V }, + [PIPE_FORMAT_B8G8R8_USCALED] = { MALI_RGB8UI, _V }, + [PIPE_FORMAT_R8G8B8A8_USCALED] = { MALI_RGBA8UI, _V }, + [PIPE_FORMAT_B8G8R8A8_USCALED] = { MALI_RGBA8UI, _V }, + [PIPE_FORMAT_A8B8G8R8_USCALED] = { MALI_RGBA8UI, _V }, + + [PIPE_FORMAT_R16_USCALED] = { MALI_R16UI, _V }, + [PIPE_FORMAT_R16G16_USCALED] = { MALI_RG16UI, _V }, + [PIPE_FORMAT_R16G16B16_USCALED] = { MALI_RGB16UI, _V }, + [PIPE_FORMAT_R16G16B16A16_USCALED] = { MALI_RGBA16UI, _V }, + [PIPE_FORMAT_R16_SSCALED] = { MALI_R16I, _V }, + [PIPE_FORMAT_R16G16_SSCALED] = { MALI_RG16I, _V }, + [PIPE_FORMAT_R16G16B16_SSCALED] = { MALI_RGB16I, _V }, + [PIPE_FORMAT_R16G16B16A16_SSCALED] = { MALI_RGBA16I, _V }, + + [PIPE_FORMAT_R32_USCALED] = { MALI_R32UI, _V }, + [PIPE_FORMAT_R32G32_USCALED] = { MALI_RG32UI, _V }, + [PIPE_FORMAT_R32G32B32_USCALED] = { MALI_RGB32UI, _V }, + [PIPE_FORMAT_R32G32B32A32_USCALED] = { MALI_RGBA32UI, _V }, + [PIPE_FORMAT_R32_SSCALED] = { MALI_R32I, _V }, + [PIPE_FORMAT_R32G32_SSCALED] = { MALI_RG32I, _V }, + [PIPE_FORMAT_R32G32B32_SSCALED] = { MALI_RGB32I, _V }, + [PIPE_FORMAT_R32G32B32A32_SSCALED] = { MALI_RGBA32I, _V }, + + [PIPE_FORMAT_R3G3B2_UNORM] = { MALI_RGB332_UNORM, _VT }, + + [PIPE_FORMAT_Z24_UNORM_S8_UINT] = { MALI_Z24X8_UNORM, _TZ }, + [PIPE_FORMAT_Z24X8_UNORM] = { MALI_Z24X8_UNORM, _TZ }, + [PIPE_FORMAT_Z32_FLOAT] = { MALI_R32F, _TZ }, + [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = { MALI_R32F, _TZ }, + [PIPE_FORMAT_X32_S8X24_UINT] = { MALI_R8UI, _T }, + [PIPE_FORMAT_X24S8_UINT] = { MALI_RGBA8UI, _TZ }, + [PIPE_FORMAT_S8_UINT] = { MALI_R8UI, _T }, + + [PIPE_FORMAT_R32_FIXED] = { MALI_R32_FIXED, _V }, + [PIPE_FORMAT_R32G32_FIXED] = { MALI_RG32_FIXED, _V }, + [PIPE_FORMAT_R32G32B32_FIXED] = { MALI_RGB32_FIXED, _V }, + [PIPE_FORMAT_R32G32B32A32_FIXED] = { MALI_RGBA32_FIXED, _V }, + + [PIPE_FORMAT_R11G11B10_FLOAT] = { MALI_R11F_G11F_B10F, _VTR}, + [PIPE_FORMAT_R9G9B9E5_FLOAT] = { MALI_R9F_G9F_B9F_E5F, _VT }, + + 
[PIPE_FORMAT_R8_SNORM] = { MALI_R8_SNORM, _VT }, + [PIPE_FORMAT_R16_SNORM] = { MALI_R16_SNORM, _VT }, + [PIPE_FORMAT_R32_SNORM] = { MALI_R32_SNORM, _VT }, + [PIPE_FORMAT_R8G8_SNORM] = { MALI_RG8_SNORM, _VT }, + [PIPE_FORMAT_R16G16_SNORM] = { MALI_RG16_SNORM, _VT }, + [PIPE_FORMAT_R32G32_SNORM] = { MALI_RG32_SNORM, _VT }, + [PIPE_FORMAT_R8G8B8_SNORM] = { MALI_RGB8_SNORM, _VT }, + [PIPE_FORMAT_R16G16B16_SNORM] = { MALI_RGB16_SNORM, _VT }, + [PIPE_FORMAT_R32G32B32_SNORM] = { MALI_RGB32_SNORM, _VT }, + [PIPE_FORMAT_R8G8B8A8_SNORM] = { MALI_RGBA8_SNORM, _VT }, + [PIPE_FORMAT_R16G16B16A16_SNORM] = { MALI_RGBA16_SNORM, _VT }, + [PIPE_FORMAT_R32G32B32A32_SNORM] = { MALI_RGBA32_SNORM, _VT }, + + [PIPE_FORMAT_A8_SINT] = { MALI_R8I, _VTR }, + [PIPE_FORMAT_I8_SINT] = { MALI_R8I, _VTR }, + [PIPE_FORMAT_L8_SINT] = { MALI_R8I, _VTR }, + [PIPE_FORMAT_L8A8_SINT] = { MALI_RG8I, _VTR }, + [PIPE_FORMAT_A8_UINT] = { MALI_R8UI, _VTR }, + [PIPE_FORMAT_I8_UINT] = { MALI_R8UI, _VTR }, + [PIPE_FORMAT_L8_UINT] = { MALI_R8UI, _VTR }, + [PIPE_FORMAT_L8A8_UINT] = { MALI_RG8UI, _VTR }, + + [PIPE_FORMAT_A16_SINT] = { MALI_R16I, _VTR }, + [PIPE_FORMAT_I16_SINT] = { MALI_R16I, _VTR }, + [PIPE_FORMAT_L16_SINT] = { MALI_R16I, _VTR }, + [PIPE_FORMAT_L16A16_SINT] = { MALI_RG16I, _VTR }, + [PIPE_FORMAT_A16_UINT] = { MALI_R16UI, _VTR }, + [PIPE_FORMAT_I16_UINT] = { MALI_R16UI, _VTR }, + [PIPE_FORMAT_L16_UINT] = { MALI_R16UI, _VTR }, + [PIPE_FORMAT_L16A16_UINT] = { MALI_RG16UI, _VTR }, + + [PIPE_FORMAT_A32_SINT] = { MALI_R32I, _VTR }, + [PIPE_FORMAT_I32_SINT] = { MALI_R32I, _VTR }, + [PIPE_FORMAT_L32_SINT] = { MALI_R32I, _VTR }, + [PIPE_FORMAT_L32A32_SINT] = { MALI_RG32I, _VTR }, + [PIPE_FORMAT_A32_UINT] = { MALI_R32UI, _VTR }, + [PIPE_FORMAT_I32_UINT] = { MALI_R32UI, _VTR }, + [PIPE_FORMAT_L32_UINT] = { MALI_R32UI, _VTR }, + [PIPE_FORMAT_L32A32_UINT] = { MALI_RG32UI, _VTR }, + + [PIPE_FORMAT_B8G8R8_UINT] = { MALI_RGB8UI, _VTR }, + [PIPE_FORMAT_B8G8R8A8_UINT] = { MALI_RGBA8UI, _VTR }, + [PIPE_FORMAT_B8G8R8_SINT] = { MALI_RGB8I, _VTR }, + [PIPE_FORMAT_B8G8R8A8_SINT] = { MALI_RGBA8I, _VTR }, + [PIPE_FORMAT_A8R8G8B8_UINT] = { MALI_RGBA8UI, _VTR }, + [PIPE_FORMAT_A8B8G8R8_UINT] = { MALI_RGBA8UI, _VTR }, + + [PIPE_FORMAT_R8_UINT] = { MALI_R8UI, _VTR }, + [PIPE_FORMAT_R16_UINT] = { MALI_R16UI, _VTR }, + [PIPE_FORMAT_R32_UINT] = { MALI_R32UI, _VTR }, + [PIPE_FORMAT_R8G8_UINT] = { MALI_RG8UI, _VTR }, + [PIPE_FORMAT_R16G16_UINT] = { MALI_RG16UI, _VTR }, + [PIPE_FORMAT_R32G32_UINT] = { MALI_RG32UI, _VTR }, + [PIPE_FORMAT_R8G8B8_UINT] = { MALI_RGB8UI, _VTR }, + [PIPE_FORMAT_R16G16B16_UINT] = { MALI_RGB16UI, _VTR }, + [PIPE_FORMAT_R32G32B32_UINT] = { MALI_RGB32UI, _VTR }, + [PIPE_FORMAT_R8G8B8A8_UINT] = { MALI_RGBA8UI, _VTR }, + [PIPE_FORMAT_R16G16B16A16_UINT] = { MALI_RGBA16UI, _VTR }, + [PIPE_FORMAT_R32G32B32A32_UINT] = { MALI_RGBA32UI, _VTR }, + + [PIPE_FORMAT_R32_FLOAT] = { MALI_R32F, _VTR }, + [PIPE_FORMAT_R32G32_FLOAT] = { MALI_RG32F, _VTR }, + [PIPE_FORMAT_R32G32B32_FLOAT] = { MALI_RGB32F, _VTR }, + [PIPE_FORMAT_R32G32B32A32_FLOAT] = { MALI_RGBA32F, _VTR }, + + [PIPE_FORMAT_R8_UNORM] = { MALI_R8_UNORM, _VTR }, + [PIPE_FORMAT_R16_UNORM] = { MALI_R16_UNORM, _VTR }, + [PIPE_FORMAT_R32_UNORM] = { MALI_R32_UNORM, _VTR }, + [PIPE_FORMAT_R8G8_UNORM] = { MALI_RG8_UNORM, _VTR }, + [PIPE_FORMAT_R16G16_UNORM] = { MALI_RG16_UNORM, _VTR }, + [PIPE_FORMAT_R32G32_UNORM] = { MALI_RG32_UNORM, _VTR }, + [PIPE_FORMAT_R8G8B8_UNORM] = { MALI_RGB8_UNORM, _VTR }, + [PIPE_FORMAT_R16G16B16_UNORM] = { MALI_RGB16_UNORM, _VTR }, + [PIPE_FORMAT_R32G32B32_UNORM] = 
{ MALI_RGB32_UNORM, _VTR }, + [PIPE_FORMAT_R4G4B4A4_UNORM] = { MALI_RGBA4_UNORM, _VTR }, + [PIPE_FORMAT_R16G16B16A16_UNORM] = { MALI_RGBA16_UNORM, _VTR }, + [PIPE_FORMAT_R32G32B32A32_UNORM] = { MALI_RGBA32_UNORM, _VTR }, + + [PIPE_FORMAT_B8G8R8A8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_B8G8R8X8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_A8R8G8B8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_X8R8G8B8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_A8B8G8R8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_X8B8G8R8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_R8G8B8X8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_R8G8B8A8_UNORM] = { MALI_RGBA8_UNORM, _VTR }, + + [PIPE_FORMAT_R8G8B8X8_SNORM] = { MALI_RGBA8_SNORM, _VT }, + [PIPE_FORMAT_R8G8B8X8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_R8G8B8X8_UINT] = { MALI_RGBA8UI, _VTR }, + [PIPE_FORMAT_R8G8B8X8_SINT] = { MALI_RGBA8I, _VTR }, + + [PIPE_FORMAT_L8_UNORM] = { MALI_R8_UNORM, _VTR }, + [PIPE_FORMAT_A8_UNORM] = { MALI_R8_UNORM, _VTR }, + [PIPE_FORMAT_I8_UNORM] = { MALI_R8_UNORM, _VTR }, + [PIPE_FORMAT_L8A8_UNORM] = { MALI_RG8_UNORM, _VTR }, + [PIPE_FORMAT_L16_UNORM] = { MALI_R16_UNORM, _VTR }, + [PIPE_FORMAT_A16_UNORM] = { MALI_R16_UNORM, _VTR }, + [PIPE_FORMAT_I16_UNORM] = { MALI_R16_UNORM, _VTR }, + [PIPE_FORMAT_L16A16_UNORM] = { MALI_RG16_UNORM, _VTR }, + + [PIPE_FORMAT_L8_SNORM] = { MALI_R8_SNORM, _VT }, + [PIPE_FORMAT_A8_SNORM] = { MALI_R8_SNORM, _VT }, + [PIPE_FORMAT_I8_SNORM] = { MALI_R8_SNORM, _VT }, + [PIPE_FORMAT_L8A8_SNORM] = { MALI_RG8_SNORM, _VT }, + [PIPE_FORMAT_L16_SNORM] = { MALI_R16_SNORM, _VT }, + [PIPE_FORMAT_A16_SNORM] = { MALI_R16_SNORM, _VT }, + [PIPE_FORMAT_I16_SNORM] = { MALI_R16_SNORM, _VT }, + [PIPE_FORMAT_L16A16_SNORM] = { MALI_RG16_SNORM, _VT }, + + [PIPE_FORMAT_L16_FLOAT] = { MALI_R16F, _VTR }, + [PIPE_FORMAT_A16_FLOAT] = { MALI_R16F, _VTR }, + [PIPE_FORMAT_I16_FLOAT] = { MALI_RG16F, _VTR }, + [PIPE_FORMAT_L16A16_FLOAT] = { MALI_RG16F, _VTR }, + + [PIPE_FORMAT_L8_SRGB] = { MALI_R8_UNORM, _VTR }, + [PIPE_FORMAT_R8_SRGB] = { MALI_R8_UNORM, _VTR }, + [PIPE_FORMAT_L8A8_SRGB] = { MALI_RG8_UNORM, _VTR }, + [PIPE_FORMAT_R8G8_SRGB] = { MALI_RG8_UNORM, _VTR }, + [PIPE_FORMAT_R8G8B8_SRGB] = { MALI_RGB8_UNORM, _VTR }, + [PIPE_FORMAT_B8G8R8_SRGB] = { MALI_RGB8_UNORM, _VTR }, + [PIPE_FORMAT_R8G8B8A8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_A8B8G8R8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_X8B8G8R8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_B8G8R8A8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_B8G8R8X8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_A8R8G8B8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, + [PIPE_FORMAT_X8R8G8B8_SRGB] = { MALI_RGBA8_UNORM, _VTR }, + + [PIPE_FORMAT_R8_SINT] = { MALI_R8I, _VTR }, + [PIPE_FORMAT_R16_SINT] = { MALI_R16I, _VTR }, + [PIPE_FORMAT_R32_SINT] = { MALI_R32I, _VTR }, + [PIPE_FORMAT_R16_FLOAT] = { MALI_R16F, _VTR }, + [PIPE_FORMAT_R8G8_SINT] = { MALI_RG8I, _VTR }, + [PIPE_FORMAT_R16G16_SINT] = { MALI_RG16I, _VTR }, + [PIPE_FORMAT_R32G32_SINT] = { MALI_RG32I, _VTR }, + [PIPE_FORMAT_R16G16_FLOAT] = { MALI_RG16F, _VTR }, + [PIPE_FORMAT_R8G8B8_SINT] = { MALI_RGB8I, _VTR }, + [PIPE_FORMAT_R16G16B16_SINT] = { MALI_RGB16I, _VTR }, + [PIPE_FORMAT_R32G32B32_SINT] = { MALI_RGB32I, _VTR }, + [PIPE_FORMAT_R16G16B16_FLOAT] = { MALI_RGB16F, _VTR }, + [PIPE_FORMAT_R8G8B8A8_SINT] = { MALI_RGBA8I, _VTR }, + [PIPE_FORMAT_R16G16B16A16_SINT] = { MALI_RGBA16I, _VTR }, + [PIPE_FORMAT_R32G32B32A32_SINT] = { MALI_RGBA32I, _VTR }, + 
[PIPE_FORMAT_R16G16B16A16_FLOAT] = { MALI_RGBA16F, _VTR }, + + [PIPE_FORMAT_R16G16B16X16_UNORM] = { MALI_RGBA16_UNORM, _VTR }, + [PIPE_FORMAT_R16G16B16X16_SNORM] = { MALI_RGBA16_SNORM, _VT }, + [PIPE_FORMAT_R16G16B16X16_FLOAT] = { MALI_RGBA16F, _VTR }, + [PIPE_FORMAT_R16G16B16X16_UINT] = { MALI_RGBA16UI, _VTR }, + [PIPE_FORMAT_R16G16B16X16_SINT] = { MALI_RGBA16I, _VTR }, + + [PIPE_FORMAT_R32G32B32X32_FLOAT] = { MALI_RGBA32F, _VTR }, + [PIPE_FORMAT_R32G32B32X32_UINT] = { MALI_RGBA32UI, _VTR }, + [PIPE_FORMAT_R32G32B32X32_SINT] = { MALI_RGBA32I, _VTR }, +}; + +#undef _VTR +#undef _VT +#undef _V +#undef _T +#undef _R + +/* Is a format encoded like Z24S8 and therefore compatible for render? */ + +bool +panfrost_is_z24s8_variant(enum pipe_format fmt) +{ + switch (fmt) { + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24X8_UNORM: + return true; + default: + return false; + } +} + +/* Translate a PIPE swizzle quad to a 12-bit Mali swizzle code. PIPE + * swizzles line up with Mali swizzles for the XYZW01, but PIPE swizzles have + * an additional "NONE" field that we have to mask out to zero. Additionally, + * PIPE swizzles are sparse but Mali swizzles are packed */ + +unsigned +panfrost_translate_swizzle_4(const unsigned char swizzle[4]) +{ + unsigned out = 0; + + for (unsigned i = 0; i < 4; ++i) { + unsigned translated = (swizzle[i] > PIPE_SWIZZLE_1) ? PIPE_SWIZZLE_0 : swizzle[i]; + out |= (translated << (3*i)); + } + + return out; +} + +void +panfrost_invert_swizzle(const unsigned char *in, unsigned char *out) +{ + /* First, default to all zeroes to prevent uninitialized junk */ + + for (unsigned c = 0; c < 4; ++c) + out[c] = PIPE_SWIZZLE_0; + + /* Now "do" what the swizzle says */ + + for (unsigned c = 0; c < 4; ++c) { + unsigned char i = in[c]; + + /* Who cares? */ + assert(PIPE_SWIZZLE_X == 0); + if (i > PIPE_SWIZZLE_W) + continue; + + /* Invert */ + unsigned idx = i - PIPE_SWIZZLE_X; + out[idx] = PIPE_SWIZZLE_X + c; + } +} + +enum mali_format +panfrost_format_to_bifrost_blend(const struct util_format_description *desc) +{ + enum mali_format format = panfrost_pipe_format_table[desc->format].hw; + assert(format); + + switch (format) { + case MALI_RGBA4_UNORM: + return MALI_RGBA4; + case MALI_RGBA8_UNORM: + case MALI_RGB8_UNORM: + return MALI_RGBA8_2; + case MALI_RGB10_A2_UNORM: + return MALI_RGB10_A2_2; + default: + return format; + } +} diff --git a/src/panfrost/lib/pan_invocation.c b/src/panfrost/lib/pan_invocation.c new file mode 100644 index 00000000000..d86b16a2643 --- /dev/null +++ b/src/panfrost/lib/pan_invocation.c @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors (Collabora):
+ *   Alyssa Rosenzweig
+ *
+ */
+
+#include
+#include "util/u_math.h"
+#include "pan_encoder.h"
+
+/* Compute shaders are invoked with a gl_NumWorkGroups X/Y/Z triplet. Vertex
+ * shaders, it turns out, are invoked with the same mechanism, with the triplet
+ * (1, vertex_count, instance_count).
+ *
+ * Alongside this triplet is the gl_WorkGroupSize X/Y/Z triplet.
+ *
+ * Unfortunately, the packing of these triplets into the
+ * mali_vertex_tiler_prefix is a little funky, using a dynamic bitfield. The
+ * routines here exist to pack this */
+
+void
+panfrost_pack_work_groups_compute(
+        struct mali_vertex_tiler_prefix *out,
+        unsigned num_x,
+        unsigned num_y,
+        unsigned num_z,
+        unsigned size_x,
+        unsigned size_y,
+        unsigned size_z,
+        bool quirk_graphics)
+{
+        uint32_t packed = 0;
+
+        /* The values needing packing, in order, and the corresponding shifts.
+         * Indices into shift are off-by-one to make the logic easier */
+
+        unsigned shifts[7] = { 0 };
+
+        unsigned values[6] = {
+                MALI_POSITIVE(size_x),
+                MALI_POSITIVE(size_y),
+                MALI_POSITIVE(size_z),
+                MALI_POSITIVE(num_x),
+                MALI_POSITIVE(num_y),
+                MALI_POSITIVE(num_z),
+        };
+
+        for (unsigned i = 0; i < 6; ++i) {
+                /* OR it in, shifting as required */
+                packed |= (values[i] << shifts[i]);
+
+                /* How many bits did we use? */
+                unsigned bit_count = util_logbase2_ceil(values[i] + 1);
+
+                /* Set the next shift accordingly */
+                shifts[i + 1] = shifts[i] + bit_count;
+        }
+
+        /* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift
+         * = 32. This doesn't appear to matter to the hardware, but it's good
+         * to be bit-identical. */
+
+        if (quirk_graphics && (num_z <= 1))
+                shifts[5] = 32;
+
+        /* Quirk: for graphics, workgroups_x_shift_2 must be at least 2,
+         * whereas for OpenCL it is simply equal to workgroups_x_shift. For GL
+         * compute, it is always 2 if no barriers are in use, but is equal to
+         * workgroups_x_shift if barriers are in use.
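+         *
+         * Worked example of the whole routine (ours, not from the original
+         * comment): a compute dispatch (quirk_graphics = false) of
+         * num = (8, 8, 1) groups with size = (4, 4, 1) stores the
+         * MALI_POSITIVE values (3, 3, 0, 7, 7, 0); their widths are
+         * (2, 2, 0, 3, 3, 0) bits, so the running shifts come out as
+         * (0, 2, 4, 4, 7, 10) and invocation_count packs to 0x3ff.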
+         */
+
+        unsigned shift_2 = shifts[3];
+
+        if (quirk_graphics)
+                shift_2 = MAX2(shift_2, 2);
+
+        /* Pack them in */
+        uint32_t packed_shifts =
+                (shifts[1] << 0) |
+                (shifts[2] << 5) |
+                (shifts[3] << 10) |
+                (shifts[4] << 16) |
+                (shifts[5] << 22) |
+                (shift_2 << 28);
+
+        /* Upload the packed bitfields */
+        out->invocation_count = packed;
+        out->invocation_shifts = packed_shifts;
+
+        /* TODO: Compute workgroups_x_shift_3 */
+        out->workgroups_x_shift_3 = shift_2;
+}
+
+/* Packs vertex/tiler descriptors simultaneously */
+void
+panfrost_pack_work_groups_fused(
+        struct mali_vertex_tiler_prefix *vertex,
+        struct mali_vertex_tiler_prefix *tiler,
+        unsigned num_x,
+        unsigned num_y,
+        unsigned num_z,
+        unsigned size_x,
+        unsigned size_y,
+        unsigned size_z)
+{
+        panfrost_pack_work_groups_compute(vertex, num_x, num_y, num_z, size_x, size_y, size_z, true);
+
+        /* Copy results over */
+        tiler->invocation_count = vertex->invocation_count;
+        tiler->invocation_shifts = vertex->invocation_shifts;
+
+        /* Set special fields for each */
+        vertex->workgroups_x_shift_3 = 5;
+        tiler->workgroups_x_shift_3 = 6;
+}
+
diff --git a/src/panfrost/lib/pan_pool.c b/src/panfrost/lib/pan_pool.c
new file mode 100644
index 00000000000..1a08be2aacf
--- /dev/null
+++ b/src/panfrost/lib/pan_pool.c
@@ -0,0 +1,114 @@
+/*
+ * © Copyright 2018 Alyssa Rosenzweig
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "util/hash_table.h"
+#include "pan_bo.h"
+#include "pan_pool.h"
+
+/* TODO: What does this actually have to be? */
+#define ALIGNMENT 128
+
+/* Transient command stream pooling: command stream uploads try to simply copy
+ * into wherever we left off.
If there isn't space, we allocate a new entry + * into the pool and copy there */ + +struct pan_pool +panfrost_create_pool(void *memctx, struct panfrost_device *dev) +{ + struct pan_pool pool = { + .dev = dev, + .transient_offset = 0, + .transient_bo = NULL + }; + + pool.bos = _mesa_hash_table_create(memctx, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + + return pool; +} + +struct panfrost_transfer +panfrost_pool_alloc(struct pan_pool *pool, size_t sz) +{ + /* Pad the size */ + sz = ALIGN_POT(sz, ALIGNMENT); + + /* Find or create a suitable BO */ + struct panfrost_bo *bo = NULL; + + unsigned offset = 0; + + bool fits_in_current = (pool->transient_offset + sz) < TRANSIENT_SLAB_SIZE; + + if (likely(pool->transient_bo && fits_in_current)) { + /* We can reuse the current BO, so get it */ + bo = pool->transient_bo; + + /* Use the specified offset */ + offset = pool->transient_offset; + pool->transient_offset = offset + sz; + } else { + size_t bo_sz = sz < TRANSIENT_SLAB_SIZE ? + TRANSIENT_SLAB_SIZE : ALIGN_POT(sz, 4096); + + /* We can't reuse the current BO, but we can create a new one. + * We don't know what the BO will be used for, so let's flag it + * RW and attach it to both the fragment and vertex/tiler jobs. + * TODO: if we want fine grained BO assignment we should pass + * flags to this function and keep the read/write, + * fragment/vertex+tiler pools separate. + */ + bo = panfrost_bo_create(pool->dev, bo_sz, 0); + + uintptr_t flags = PAN_BO_ACCESS_PRIVATE | + PAN_BO_ACCESS_RW | + PAN_BO_ACCESS_VERTEX_TILER | + PAN_BO_ACCESS_FRAGMENT; + + _mesa_hash_table_insert(pool->bos, bo, (void *) flags); + + if (sz < TRANSIENT_SLAB_SIZE) { + pool->transient_bo = bo; + pool->transient_offset = offset + sz; + } + } + + struct panfrost_transfer ret = { + .cpu = bo->cpu + offset, + .gpu = bo->gpu + offset, + }; + + return ret; + +} + +mali_ptr +panfrost_pool_upload(struct pan_pool *pool, const void *data, size_t sz) +{ + struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sz); + memcpy(transfer.cpu, data, sz); + return transfer.gpu; +} diff --git a/src/panfrost/lib/pan_pool.h b/src/panfrost/lib/pan_pool.h new file mode 100644 index 00000000000..14593eabd43 --- /dev/null +++ b/src/panfrost/lib/pan_pool.h @@ -0,0 +1,67 @@ +/* + * © Copyright 2017-2018 Alyssa Rosenzweig + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#ifndef __PAN_POOL_H__ +#define __PAN_POOL_H__ + +#include +#include + +/* Represents a pool of memory that can only grow, used to allocate objects + * with the same lifetime as the pool itself. In OpenGL, a pool is owned by the + * batch for transient structures. In Vulkan, it may be owned by e.g. the + * command pool */ + +struct pan_pool { + /* Parent device for allocation */ + struct panfrost_device *dev; + + /* panfrost_bo -> access_flags owned by the pool */ + struct hash_table *bos; + + /* Current transient BO */ + struct panfrost_bo *transient_bo; + + /* Within the topmost transient BO, how much has been used? */ + unsigned transient_offset; +}; + +struct pan_pool +panfrost_create_pool(void *memctx, struct panfrost_device *dev); + +/* Represents a fat pointer for GPU-mapped memory, returned from the transient + * allocator and not used for much else */ + +struct panfrost_transfer { + uint8_t *cpu; + mali_ptr gpu; +}; + +struct panfrost_transfer +panfrost_pool_alloc(struct pan_pool *pool, size_t sz); + +mali_ptr +panfrost_pool_upload(struct pan_pool *pool, const void *data, size_t sz); + +#endif diff --git a/src/panfrost/lib/pan_props.c b/src/panfrost/lib/pan_props.c new file mode 100644 index 00000000000..a4ff28506df --- /dev/null +++ b/src/panfrost/lib/pan_props.c @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ * Authors:
+ *   Alyssa Rosenzweig
+ */
+
+#include <xf86drm.h>
+
+#include "util/u_math.h"
+#include "util/macros.h"
+#include "util/hash_table.h"
+#include "util/u_thread.h"
+#include "drm-uapi/panfrost_drm.h"
+#include "pan_encoder.h"
+#include "pan_device.h"
+#include "panfrost-quirks.h"
+#include "pan_bo.h"
+
+/* Abstraction over the raw drm_panfrost_get_param ioctl for fetching
+ * information about devices */
+
+static __u64
+panfrost_query_raw(
+                int fd,
+                enum drm_panfrost_param param,
+                bool required,
+                unsigned default_value)
+{
+        struct drm_panfrost_get_param get_param = {0,};
+        ASSERTED int ret;
+
+        get_param.param = param;
+        ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param);
+
+        if (ret) {
+                assert(!required);
+                return default_value;
+        }
+
+        return get_param.value;
+}
+
+unsigned
+panfrost_query_gpu_version(int fd)
+{
+        return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0);
+}
+
+unsigned
+panfrost_query_core_count(int fd)
+{
+        /* On older kernels, worst-case to 16 cores */
+
+        unsigned mask = panfrost_query_raw(fd,
+                        DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff);
+
+        return util_bitcount(mask);
+}
+
+unsigned
+panfrost_query_thread_tls_alloc(int fd)
+{
+        /* On older kernels, we worst-case to 256 threads, the architectural
+         * maximum for Midgard. On my current kernel/hardware, I'm seeing this
+         * readback as 0, so we'll worst-case there too */
+
+        unsigned tls = panfrost_query_raw(fd,
+                        DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 256);
+
+        if (tls)
+                return tls;
+        else
+                return 256;
+}
+
+static uint32_t
+panfrost_query_compressed_formats(int fd)
+{
+        /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and
+         * should exist on any Mali configuration. All hardware should report
+         * these texture formats, but the kernel might not be new enough.
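+         *
+         * Note how the mask is consumed (see
+         * panfrost_supports_compressed_format below): each compressed
+         * mali_format maps to a bit position via
+         * (fmt & ~MALI_FORMAT_COMPRESSED), so, for example, ETC2 RGB8
+         * support is indicated by bit
+         * (MALI_ETC2_RGB8 & ~MALI_FORMAT_COMPRESSED) of the returned mask.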
*/ + + uint32_t default_set = + (1 << MALI_ETC2_RGB8) | + (1 << MALI_ETC2_R11_UNORM) | + (1 << MALI_ETC2_RGBA8) | + (1 << MALI_ETC2_RG11_UNORM) | + (1 << MALI_ETC2_R11_SNORM) | + (1 << MALI_ETC2_RG11_SNORM) | + (1 << MALI_ETC2_RGB8A1) | + (1 << MALI_ASTC_3D_LDR) | + (1 << MALI_ASTC_3D_HDR) | + (1 << MALI_ASTC_2D_LDR) | + (1 << MALI_ASTC_2D_HDR); + + return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0, + false, default_set); +} + +/* DRM_PANFROST_PARAM_TEXTURE_FEATURES0 will return a bitmask of supported + * compressed formats, so we offer a helper to test if a format is supported */ + +bool +panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt) +{ + if (MALI_EXTRACT_TYPE(fmt) != MALI_FORMAT_COMPRESSED) + return true; + + unsigned idx = fmt & ~MALI_FORMAT_COMPRESSED; + assert(idx < 32); + + return dev->compressed_formats & (1 << idx); +} + +/* Given a GPU ID like 0x860, return a prettified model name */ + +const char * +panfrost_model_name(unsigned gpu_id) +{ + switch (gpu_id) { + case 0x600: return "Mali T600 (Panfrost)"; + case 0x620: return "Mali T620 (Panfrost)"; + case 0x720: return "Mali T720 (Panfrost)"; + case 0x820: return "Mali T820 (Panfrost)"; + case 0x830: return "Mali T830 (Panfrost)"; + case 0x750: return "Mali T760 (Panfrost)"; + case 0x860: return "Mali T860 (Panfrost)"; + case 0x880: return "Mali T880 (Panfrost)"; + case 0x7093: return "Mali G31 (Panfrost)"; + case 0x7212: return "Mali G52 (Panfrost)"; + default: + unreachable("Invalid GPU ID"); + } +} + +void +panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev) +{ + dev->fd = fd; + dev->memctx = memctx; + dev->gpu_id = panfrost_query_gpu_version(fd); + dev->core_count = panfrost_query_core_count(fd); + dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd); + dev->kernel_version = drmGetVersion(fd); + dev->quirks = panfrost_get_quirks(dev->gpu_id); + dev->compressed_formats = panfrost_query_compressed_formats(fd); + + util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512); + + pthread_mutex_init(&dev->bo_cache.lock, NULL); + list_inithead(&dev->bo_cache.lru); + + for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) + list_inithead(&dev->bo_cache.buckets[i]); +} + +void +panfrost_close_device(struct panfrost_device *dev) +{ + panfrost_bo_unreference(dev->blit_shaders.bo); + panfrost_bo_cache_evict_all(dev); + pthread_mutex_destroy(&dev->bo_cache.lock); + drmFreeVersion(dev->kernel_version); + util_sparse_array_finish(&dev->bo_map); + +} diff --git a/src/panfrost/lib/pan_sampler.c b/src/panfrost/lib/pan_sampler.c new file mode 100644 index 00000000000..63ddd17b816 --- /dev/null +++ b/src/panfrost/lib/pan_sampler.c @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "pan_encoder.h" + +/* Sampler comparison functions are flipped in OpenGL from the hardware, so we + * need to be able to flip accordingly */ + +enum mali_func +panfrost_flip_compare_func(enum mali_func f) +{ + switch (f) { + case MALI_FUNC_LESS: + return MALI_FUNC_GREATER; + case MALI_FUNC_GREATER: + return MALI_FUNC_LESS; + case MALI_FUNC_LEQUAL: + return MALI_FUNC_GEQUAL; + case MALI_FUNC_GEQUAL: + return MALI_FUNC_LEQUAL; + default: + return f; + } +} diff --git a/src/panfrost/lib/pan_scoreboard.c b/src/panfrost/lib/pan_scoreboard.c new file mode 100644 index 00000000000..c72c9a37c3a --- /dev/null +++ b/src/panfrost/lib/pan_scoreboard.c @@ -0,0 +1,202 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include "pan_scoreboard.h" +#include "pan_device.h" +#include "panfrost-quirks.h" + +/* + * There are various types of Mali jobs: + * + * - WRITE_VALUE: generic write primitive, used to zero tiler field + * - VERTEX: runs a vertex shader + * - TILER: runs tiling and sets up a fragment shader + * - FRAGMENT: runs fragment shaders and writes out + * - COMPUTE: runs a compute shader + * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost) + * - GEOMETRY: runs a geometry shader (unimplemented) + * - CACHE_FLUSH: unseen in the wild, theoretically cache flush + * + * In between a full batch and a single Mali job is the "job chain", a series + * of Mali jobs together forming a linked list. Within the job chain, each Mali + * job can set (up to) two dependencies on other earlier jobs in the chain. + * This dependency graph forms a scoreboard. The general idea of a scoreboard + * applies: when there is a data dependency of job B on job A, job B sets one + * of its dependency indices to job A, ensuring that job B won't start until + * job A finishes. 
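+ *
+ * For example (hypothetical indices): if a chain's vertex job is assigned
+ * index 2 and the corresponding tiler job index 3, the tiler job would set
+ * one of its dependency indices to 2, so that tiling cannot begin until the
+ * vertex shaders have written their outputs.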
+ *
+ * More specifically, here are a set of rules:
+ *
+ * - A write value job must appear if and only if there is at least one tiler
+ *   job, and tiler jobs must depend on it.
+ *
+ * - Vertex jobs and tiler jobs are independent.
+ *
+ * - A tiler job must have a dependency on its data source. If it's getting
+ *   data from a vertex job, it depends on the vertex job. If it's getting
+ *   data from software, this is null.
+ *
+ * - Tiler jobs must depend on the write value job (chained or otherwise).
+ *
+ * - Tiler jobs must be strictly ordered. So each tiler job must depend on the
+ *   previous job in the chain.
+ *
+ * - Linking jobs via next_job has no bearing on the order of execution; it
+ *   just establishes the linked list of jobs, EXCEPT:
+ *
+ * - A job's dependencies must appear earlier in the linked list (job chain).
+ *
+ * Justification for each rule:
+ *
+ * - Write value jobs are used to write a zero into a magic tiling field, which
+ *   enables tiling to work. If tiling occurs, they are needed; if it does not,
+ *   we cannot emit them, since then tiling would partially occur and that's
+ *   bad.
+ *
+ * - The hardware has no notion of a "vertex/tiler job" (at least not our
+ *   hardware -- other revs have fused jobs, but --- crap, this just got even
+ *   more complicated). They are independent units that take in data, process
+ *   it, and spit out data.
+ *
+ * - Any job must depend on its data source, in fact, or risk a
+ *   read-before-write hazard. Tiler jobs get their data from vertex jobs, ergo
+ *   tiler jobs depend on the corresponding vertex job (if it's there).
+ *
+ * - The tiler is not thread-safe; this dependency prevents race conditions
+ *   between two different jobs trying to write to the tiler outputs at the
+ *   same time.
+ *
+ * - Internally, jobs are scoreboarded; the next job fields just form a linked
+ *   list to allow the jobs to be read in; the execution order comes from
+ *   resolving the dependency fields instead.
+ *
+ * - The hardware cannot set a dependency on a job it doesn't know about yet,
+ *   and dependencies are processed in order of the next job fields.
+ *
+ */
+
+/* Generates, uploads, and queues a new job. All fields are written in order
+ * except for next_job accounting (TODO: Should we be clever and defer the
+ * upload of the header here until the next job, to keep the access pattern
+ * totally linear? Or is that just a micro-optimization at this point?).
+ * Returns the generated index for dependency management.
+ *
+ * Inject is used to inject a job at the front, for wallpapering. If you are
+ * not wallpapering and set this, dragons will eat you.
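+ *
+ * A minimal usage sketch (the payloads vp and tp are illustrative):
+ *
+ *      unsigned vtx = panfrost_new_job(pool, scoreboard, JOB_TYPE_VERTEX,
+ *                                      false, 0, &vp, sizeof(vp), false);
+ *
+ *      panfrost_new_job(pool, scoreboard, JOB_TYPE_TILER,
+ *                       false, vtx, &tp, sizeof(tp), false);
+ *
+ * Here the tiler job names the vertex job as its local dependency; the
+ * write value / tiler ordering dependencies are filled in automatically
+ * below.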
*/ + +unsigned +panfrost_new_job( + struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + enum mali_job_type type, + bool barrier, + unsigned local_dep, + void *payload, size_t payload_size, + bool inject) +{ + unsigned global_dep = 0; + + if (type == JOB_TYPE_TILER) { + /* Tiler jobs must be chained, and on Midgard, the first tiler + * job must depend on the write value job, whose index we + * reserve now */ + + if (scoreboard->tiler_dep) + global_dep = scoreboard->tiler_dep; + else if (!(pool->dev->quirks & IS_BIFROST)) { + scoreboard->write_value_index = ++scoreboard->job_index; + global_dep = scoreboard->write_value_index; + } + } + + /* Assign the index */ + unsigned index = ++scoreboard->job_index; + + struct mali_job_descriptor_header job = { + .job_descriptor_size = 1, + .job_type = type, + .job_barrier = barrier, + .job_index = index, + .job_dependency_index_1 = local_dep, + .job_dependency_index_2 = global_dep, + }; + + if (inject) + job.next_job = scoreboard->first_job; + + struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + payload_size); + memcpy(transfer.cpu, &job, sizeof(job)); + memcpy(transfer.cpu + sizeof(job), payload, payload_size); + + if (inject) { + scoreboard->first_job = transfer.gpu; + return index; + } + + /* Form a chain */ + if (type == JOB_TYPE_TILER) + scoreboard->tiler_dep = index; + + if (scoreboard->prev_job) + scoreboard->prev_job->next_job = transfer.gpu; + else + scoreboard->first_job = transfer.gpu; + + scoreboard->prev_job = (struct mali_job_descriptor_header *) transfer.cpu; + return index; +} + +/* Generates a write value job, used to initialize the tiler structures. Note + * this is called right before frame submission. */ + +void +panfrost_scoreboard_initialize_tiler(struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + mali_ptr polygon_list) +{ + /* Check if we even need tiling */ + if (pool->dev->quirks & IS_BIFROST || !scoreboard->tiler_dep) + return; + + /* Okay, we do. Let's generate it. We'll need the job's polygon list + * regardless of size. */ + + struct mali_job_descriptor_header job = { + .job_type = JOB_TYPE_WRITE_VALUE, + .job_index = scoreboard->write_value_index, + .job_descriptor_size = 1, + .next_job = scoreboard->first_job + }; + + struct mali_payload_write_value payload = { + .address = polygon_list, + .value_descriptor = MALI_WRITE_VALUE_ZERO, + }; + + struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + sizeof(payload)); + memcpy(transfer.cpu, &job, sizeof(job)); + memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload)); + + scoreboard->first_job = transfer.gpu; +} diff --git a/src/panfrost/lib/pan_scoreboard.h b/src/panfrost/lib/pan_scoreboard.h new file mode 100644 index 00000000000..71667d4b5de --- /dev/null +++ b/src/panfrost/lib/pan_scoreboard.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2019-2020 Collabora Ltd. 
+ * Copyright (C) 2019 Alyssa Rosenzweig + * Copyright (C) 2014-2017 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __PAN_SCOREBOARD_H__ +#define __PAN_SCOREBOARD_H__ + +#include "panfrost-job.h" +#include "pan_pool.h" + +struct pan_scoreboard { + /* The first job in the batch */ + mali_ptr first_job; + + /* The number of jobs in the primary batch, essentially */ + unsigned job_index; + + /* A CPU-side pointer to the previous job for next_job linking */ + struct mali_job_descriptor_header *prev_job; + + /* The dependency for tiler jobs (i.e. the index of the last emitted + * tiler job, or zero if none have been emitted) */ + unsigned tiler_dep; + + /* The job index of the WRITE_VALUE job (before it has been created) */ + unsigned write_value_index; +}; + +unsigned +panfrost_new_job( + struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + enum mali_job_type type, + bool barrier, + unsigned local_dep, + void *payload, size_t payload_size, + bool inject); + +void panfrost_scoreboard_initialize_tiler( + struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + mali_ptr polygon_list); + +#endif diff --git a/src/panfrost/lib/pan_scratch.c b/src/panfrost/lib/pan_scratch.c new file mode 100644 index 00000000000..478a788b116 --- /dev/null +++ b/src/panfrost/lib/pan_scratch.c @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *   Alyssa Rosenzweig
+ */
+
+#include "util/u_math.h"
+#include "pan_encoder.h"
+
+/* Midgard has a small register file, so shaders with high register pressure
+ * need to spill from the register file onto the stack. In addition to
+ * spilling, it is desirable to allocate temporary arrays on the stack (for
+ * instance because the register file does not support indirect access but the
+ * stack does).
+ *
+ * The stack is located in "Thread Local Storage", sometimes abbreviated TLS in
+ * the kernel source code. Thread local storage is allocated per-thread,
+ * per-core, so threads executing concurrently do not interfere with each
+ * other's stacks. On modern kernels, we may query
+ * DRM_PANFROST_PARAM_THREAD_TLS_ALLOC for the number of threads per core we
+ * must allocate for, and DRM_PANFROST_PARAM_SHADER_PRESENT for a bitmask of
+ * shader cores (so take a popcount of that mask for the number of shader
+ * cores). On older kernels that do not support querying these values,
+ * following kbase, we may use the worst-case value of 256 threads for
+ * THREAD_TLS_ALLOC, and the worst-case value of 16 cores for Midgard per the
+ * "shader core count" column of the implementations table in
+ * https://en.wikipedia.org/wiki/Mali_(GPU) [citation needed]
+ *
+ * Within a particular thread, stack may be allocated. If it is present, its
+ * size is a power-of-two, and it is at least 16 bytes. Stack is allocated
+ * with the shared memory descriptor used for all shaders within a frame (note
+ * that they don't execute concurrently so it's fine). So, consider the maximum
+ * stack size used by any shader within a job, and then compute (where npot
+ * denotes the next power of two):
+ *
+ *      bytes/thread = npot(max(size, 16))
+ *      allocated = (# of bytes/thread) * (# of threads/core) * (# of cores)
+ *
+ * The size of Thread Local Storage is signaled to the GPU in a dedicated
+ * log_stack_size field. Since stack sizes are powers of two, this field is
+ * logarithmic in the stack size. Consider some sample values:
+ *
+ *      stack size | log_stack_size
+ *      ---------------------------
+ *             256 | 4
+ *             512 | 5
+ *            1024 | 6
+ *
+ * Noting that log2(256) = 8, we have the relation:
+ *
+ *      stack_size <= 2^(log_stack_size + 4)
+ *
+ * Given the constraints about powers-of-two and the minimum of 16, we thus
+ * derive a formula for log_stack_size in terms of stack size (s), where s is
+ * positive:
+ *
+ *      log_stack_size = ceil(log2(max(s, 16))) - 4
+ *
+ * There are other valid characterisations of this formula, of course, but this
+ * is computationally simple, so good enough for our purposes. If s=0, since
+ * there is no spilling used whatsoever, we may set log_stack_size to 0 to
+ * disable the stack.
+ */
+
+/* Computes log_stack_size = ceil(log2(max(s, 16))) - 4 */
+
+unsigned
+panfrost_get_stack_shift(unsigned stack_size)
+{
+        if (stack_size)
+                return util_logbase2_ceil(MAX2(stack_size, 16)) - 4;
+        else
+                return 0;
+}
+
+/* Computes the aligned stack size given the shift and thread count. The blob
+ * reserves an extra page, and since this is hardware-internal, we do too.
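+ *
+ * As a worked example with illustrative numbers: a shader spilling 512
+ * bytes on a 4-core GPU allowing 256 threads/core gives
+ * stack_shift = ceil(log2(512)) - 4 = 5, so each thread gets
+ * 1 << (5 + 4) = 512 bytes, for a total of
+ * (512 * 256 * 4) + 4096 = 528384 bytes including the extra page.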
*/ + +unsigned +panfrost_get_total_stack_size( + unsigned stack_shift, + unsigned threads_per_core, + unsigned core_count) +{ + unsigned size_per_thread = MAX2(1 << (stack_shift + 4), 32); + unsigned size = size_per_thread * threads_per_core * core_count; + + return size + 4096; +} diff --git a/src/panfrost/lib/pan_texture.c b/src/panfrost/lib/pan_texture.c new file mode 100644 index 00000000000..da436ea7318 --- /dev/null +++ b/src/panfrost/lib/pan_texture.c @@ -0,0 +1,426 @@ +/* + * Copyright (C) 2008 VMware, Inc. + * Copyright (C) 2014 Broadcom + * Copyright (C) 2018-2019 Alyssa Rosenzweig + * Copyright (C) 2019-2020 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include "util/macros.h" +#include "util/u_math.h" +#include "pan_texture.h" + +/* Generates a texture descriptor. Ideally, descriptors are immutable after the + * texture is created, so we can keep these hanging around in GPU memory in a + * dedicated BO and not have to worry. In practice there are some minor gotchas + * with this (the driver sometimes will change the format of a texture on the + * fly for compression) but it's fast enough to just regenerate the descriptor + * in those cases, rather than monkeypatching at drawtime. + * + * A texture descriptor consists of a 32-byte mali_texture_descriptor structure + * followed by a variable number of pointers. Due to this variance and + * potentially large size, we actually upload directly rather than returning + * the descriptor. Whether the user does a copy themselves or not is irrelevant + * to us here. + */ + +/* Check if we need to set a custom stride by computing the "expected" + * stride and comparing it to what the user actually wants. Only applies + * to linear textures, since tiled/compressed textures have strict + * alignment requirements for their strides as it is */ + +static bool +panfrost_needs_explicit_stride( + struct panfrost_slice *slices, + uint16_t width, + unsigned first_level, unsigned last_level, + unsigned bytes_per_pixel) +{ + for (unsigned l = first_level; l <= last_level; ++l) { + unsigned actual = slices[l].stride; + unsigned expected = u_minify(width, l) * bytes_per_pixel; + + if (actual != expected) + return true; + } + + return false; +} + +/* A Scalable Texture Compression (ASTC) corresponds to just a few texture type + * in the hardware, but in fact can be parametrized to have various widths and + * heights for the so-called "stretch factor". 
It turns out these parameters + * are stuffed in the bottom bits of the payload pointers. This functions + * computes these magic stuffing constants based on the ASTC format in use. The + * constant in a given dimension is 3-bits, and two are stored side-by-side for + * each active dimension. + */ + +static unsigned +panfrost_astc_stretch(unsigned dim) +{ + assert(dim >= 4 && dim <= 12); + return MIN2(dim, 11) - 4; +} + +/* Texture addresses are tagged with information about compressed formats. + * AFBC uses a bit for whether the colorspace transform is enabled (RGB and + * RGBA only). + * For ASTC, this is a "stretch factor" encoding the block size. */ + +static unsigned +panfrost_compression_tag( + const struct util_format_description *desc, + enum mali_format format, enum mali_texture_layout layout) +{ + if (layout == MALI_TEXTURE_AFBC) + return desc->nr_channels >= 3; + else if (format == MALI_ASTC_2D_LDR || format == MALI_ASTC_2D_HDR) + return (panfrost_astc_stretch(desc->block.height) << 3) | + panfrost_astc_stretch(desc->block.width); + else + return 0; +} + + +/* Cubemaps have 6 faces as "layers" in between each actual layer. We + * need to fix this up. TODO: logic wrong in the asserted out cases ... + * can they happen, perhaps from cubemap arrays? */ + +static void +panfrost_adjust_cube_dimensions( + unsigned *first_face, unsigned *last_face, + unsigned *first_layer, unsigned *last_layer) +{ + *first_face = *first_layer % 6; + *last_face = *last_layer % 6; + *first_layer /= 6; + *last_layer /= 6; + + assert((*first_layer == *last_layer) || (*first_face == 0 && *last_face == 5)); +} + +/* Following the texture descriptor is a number of pointers. How many? */ + +static unsigned +panfrost_texture_num_elements( + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, + bool is_cube, bool manual_stride) +{ + unsigned first_face = 0, last_face = 0; + + if (is_cube) { + panfrost_adjust_cube_dimensions(&first_face, &last_face, + &first_layer, &last_layer); + } + + unsigned levels = 1 + last_level - first_level; + unsigned layers = 1 + last_layer - first_layer; + unsigned faces = 1 + last_face - first_face; + unsigned num_elements = levels * layers * faces * MAX2(nr_samples, 1); + + if (manual_stride) + num_elements *= 2; + + return num_elements; +} + +/* Conservative estimate of the size of the texture payload a priori. + * Average case, size equal to the actual size. Worst case, off by 2x (if + * a manual stride is not needed on a linear texture). Returned value + * must be greater than or equal to the actual size, so it's safe to use + * as an allocation amount */ + +unsigned +panfrost_estimate_texture_payload_size( + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, + enum mali_texture_type type, enum mali_texture_layout layout) +{ + /* Assume worst case */ + unsigned manual_stride = (layout == MALI_TEXTURE_LINEAR); + + unsigned elements = panfrost_texture_num_elements( + first_level, last_level, + first_layer, last_layer, + nr_samples, + type == MALI_TEX_CUBE, manual_stride); + + return sizeof(mali_ptr) * elements; +} + +/* Bifrost requires a tile stride for tiled textures. This stride is computed + * as (16 * bpp * width) assuming there is at least one tile (width >= 16). + * Otherwise if height <= 16, the blob puts zero. Interactions with AFBC are + * currently unknown. 
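+ *
+ * As a worked example with illustrative values: a 64x64 tiled texture at 4
+ * bytes per pixel gets a tile stride of 16 * 4 * 64 = 4096 bytes, while any
+ * tiled texture with height <= 16 gets a stride of 0, matching the blob.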
+ */ + +static unsigned +panfrost_nonlinear_stride(enum mali_texture_layout layout, + unsigned bytes_per_pixel, + unsigned width, + unsigned height) +{ + if (layout == MALI_TEXTURE_TILED) { + return (height <= 16) ? 0 : (16 * bytes_per_pixel * ALIGN_POT(width, 16)); + } else { + unreachable("TODO: AFBC on Bifrost"); + } +} + +static void +panfrost_emit_texture_payload( + mali_ptr *payload, + const struct util_format_description *desc, + enum mali_format mali_format, + enum mali_texture_type type, + enum mali_texture_layout layout, + unsigned width, unsigned height, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, + unsigned cube_stride, + bool manual_stride, + mali_ptr base, + struct panfrost_slice *slices) +{ + base |= panfrost_compression_tag(desc, mali_format, layout); + + /* Inject the addresses in, interleaving array indices, mip levels, + * cube faces, and strides in that order */ + + unsigned first_face = 0, last_face = 0, face_mult = 1; + + if (type == MALI_TEX_CUBE) { + face_mult = 6; + panfrost_adjust_cube_dimensions(&first_face, &last_face, &first_layer, &last_layer); + } + + nr_samples = MAX2(nr_samples, 1); + + unsigned idx = 0; + + for (unsigned w = first_layer; w <= last_layer; ++w) { + for (unsigned l = first_level; l <= last_level; ++l) { + for (unsigned f = first_face; f <= last_face; ++f) { + for (unsigned s = 0; s < nr_samples; ++s) { + payload[idx++] = base + panfrost_texture_offset( + slices, type == MALI_TEX_3D, + cube_stride, l, w * face_mult + f, s); + + if (manual_stride) { + payload[idx++] = (layout == MALI_TEXTURE_LINEAR) ? + slices[l].stride : + panfrost_nonlinear_stride(layout, + MAX2(desc->block.bits / 8, 1), + u_minify(width, l), + u_minify(height, l)); + } + } + } + } + } +} + +#define MALI_SWIZZLE_R001 \ + (MALI_CHANNEL_RED << 0) | \ + (MALI_CHANNEL_ZERO << 3) | \ + (MALI_CHANNEL_ZERO << 6) | \ + (MALI_CHANNEL_ONE << 9) + +#define MALI_SWIZZLE_A001 \ + (MALI_CHANNEL_ALPHA << 0) | \ + (MALI_CHANNEL_ZERO << 3) | \ + (MALI_CHANNEL_ZERO << 6) | \ + (MALI_CHANNEL_ONE << 9) + + +void +panfrost_new_texture( + void *out, + uint16_t width, uint16_t height, + uint16_t depth, uint16_t array_size, + enum pipe_format format, + enum mali_texture_type type, + enum mali_texture_layout layout, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, + unsigned cube_stride, + unsigned swizzle, + mali_ptr base, + struct panfrost_slice *slices) +{ + const struct util_format_description *desc = + util_format_description(format); + + unsigned bytes_per_pixel = util_format_get_blocksize(format); + + enum mali_format mali_format = panfrost_pipe_format_table[desc->format].hw; + assert(mali_format); + + bool manual_stride = (layout == MALI_TEXTURE_LINEAR) + && panfrost_needs_explicit_stride(slices, width, + first_level, last_level, bytes_per_pixel); + + struct mali_texture_descriptor descriptor = { + .width = MALI_POSITIVE(u_minify(width, first_level)), + .height = MALI_POSITIVE(u_minify(height, first_level)), + .depth = MALI_POSITIVE(u_minify(depth, first_level)), + .array_size = MALI_POSITIVE(array_size), + .format = { + .swizzle = (format == PIPE_FORMAT_X24S8_UINT) ? + MALI_SWIZZLE_A001 : + (format == PIPE_FORMAT_S8_UINT) ? 
+ MALI_SWIZZLE_R001 : + panfrost_translate_swizzle_4(desc->swizzle), + .format = mali_format, + .srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB), + .type = type, + .layout = layout, + .manual_stride = manual_stride, + .unknown2 = 1, + }, + .levels = last_level - first_level, + .swizzle = swizzle + }; + + memcpy(out, &descriptor, sizeof(descriptor)); + + mali_ptr *payload = (mali_ptr *) (out + sizeof(struct mali_texture_descriptor)); + panfrost_emit_texture_payload( + payload, + desc, + mali_format, + type, + layout, + width, height, + first_level, last_level, + first_layer, last_layer, + nr_samples, + cube_stride, + manual_stride, + base, + slices); +} + +void +panfrost_new_texture_bifrost( + struct bifrost_texture_descriptor *descriptor, + uint16_t width, uint16_t height, + uint16_t depth, uint16_t array_size, + enum pipe_format format, + enum mali_texture_type type, + enum mali_texture_layout layout, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, + unsigned cube_stride, + unsigned swizzle, + mali_ptr base, + struct panfrost_slice *slices, + struct panfrost_bo *payload) +{ + const struct util_format_description *desc = + util_format_description(format); + + enum mali_format mali_format = panfrost_pipe_format_table[desc->format].hw; + assert(mali_format); + + panfrost_emit_texture_payload( + (mali_ptr *) payload->cpu, + desc, + mali_format, + type, + layout, + width, height, + first_level, last_level, + first_layer, last_layer, + nr_samples, + cube_stride, + true, /* Stride explicit on Bifrost */ + base, + slices); + + descriptor->format_unk = 0x2; + descriptor->type = type; + descriptor->format = mali_format; + descriptor->srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + descriptor->format_unk3 = 0x0; + descriptor->width = MALI_POSITIVE(u_minify(width, first_level)); + descriptor->height = MALI_POSITIVE(u_minify(height, first_level)); + descriptor->swizzle = swizzle; + descriptor->layout = layout; + descriptor->levels = last_level - first_level; + descriptor->unk1 = 0x0; + descriptor->levels_unk = 0; + descriptor->level_2 = last_level - first_level; + descriptor->payload = payload->gpu; + descriptor->array_size = MALI_POSITIVE(array_size); + descriptor->unk4 = 0x0; + descriptor->depth = MALI_POSITIVE(u_minify(depth, first_level)); + descriptor->unk5 = 0x0; +} + +/* Computes sizes for checksumming, which is 8 bytes per 16x16 tile. + * Checksumming is believed to be a CRC variant (CRC64 based on the size?). + * This feature is also known as "transaction elimination". */ + +#define CHECKSUM_TILE_WIDTH 16 +#define CHECKSUM_TILE_HEIGHT 16 +#define CHECKSUM_BYTES_PER_TILE 8 + +unsigned +panfrost_compute_checksum_size( + struct panfrost_slice *slice, + unsigned width, + unsigned height) +{ + unsigned aligned_width = ALIGN_POT(width, CHECKSUM_TILE_WIDTH); + unsigned aligned_height = ALIGN_POT(height, CHECKSUM_TILE_HEIGHT); + + unsigned tile_count_x = aligned_width / CHECKSUM_TILE_WIDTH; + unsigned tile_count_y = aligned_height / CHECKSUM_TILE_HEIGHT; + + slice->checksum_stride = tile_count_x * CHECKSUM_BYTES_PER_TILE; + + return slice->checksum_stride * tile_count_y; +} + +unsigned +panfrost_get_layer_stride(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level) +{ + return is_3d ? slices[level].size0 : cube_stride; +} + +/* Computes the offset into a texture at a particular level/face. 
Add to + * the base address of a texture to get the address to that level/face */ + +unsigned +panfrost_texture_offset(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level, unsigned face, unsigned sample) +{ + unsigned layer_stride = panfrost_get_layer_stride(slices, is_3d, cube_stride, level); + return slices[level].offset + (face * layer_stride) + (sample * slices[level].size0); +} diff --git a/src/panfrost/lib/pan_texture.h b/src/panfrost/lib/pan_texture.h new file mode 100644 index 00000000000..c4a07d15ad2 --- /dev/null +++ b/src/panfrost/lib/pan_texture.h @@ -0,0 +1,198 @@ +/* + * Copyright (C) 2008 VMware, Inc. + * Copyright (C) 2014 Broadcom + * Copyright (C) 2018-2019 Alyssa Rosenzweig + * Copyright (C) 2019-2020 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __PAN_TEXTURE_H +#define __PAN_TEXTURE_H + +#include +#include "util/format/u_format.h" +#include "compiler/shader_enums.h" +#include "panfrost-job.h" +#include "pan_bo.h" + +struct panfrost_slice { + unsigned offset; + unsigned stride; + unsigned size0; + + /* If there is a header preceding each slice, how big is + * that header? Used for AFBC */ + unsigned header_size; + + /* If checksumming is enabled following the slice, what + * is its offset/stride? */ + unsigned checksum_offset; + unsigned checksum_stride; + struct panfrost_bo *checksum_bo; + + /* Has anything been written to this slice? 
*/ + bool initialized; +}; + +struct pan_image { + /* Format and size */ + uint16_t width0, height0, depth0, array_size; + enum pipe_format format; + enum mali_texture_type type; + unsigned first_level, last_level; + unsigned first_layer, last_layer; + unsigned nr_samples; + struct panfrost_bo *bo; + struct panfrost_slice *slices; + unsigned cubemap_stride; + enum mali_texture_layout layout; +}; + +unsigned +panfrost_compute_checksum_size( + struct panfrost_slice *slice, + unsigned width, + unsigned height); + +/* AFBC */ + +bool +panfrost_format_supports_afbc(enum pipe_format format); + +unsigned +panfrost_afbc_header_size(unsigned width, unsigned height); + +/* mali_texture_descriptor */ + +unsigned +panfrost_estimate_texture_payload_size( + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, + enum mali_texture_type type, enum mali_texture_layout layout); + +void +panfrost_new_texture( + void *out, + uint16_t width, uint16_t height, + uint16_t depth, uint16_t array_size, + enum pipe_format format, + enum mali_texture_type type, + enum mali_texture_layout layout, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, + unsigned cube_stride, + unsigned swizzle, + mali_ptr base, + struct panfrost_slice *slices); + +void +panfrost_new_texture_bifrost( + struct bifrost_texture_descriptor *descriptor, + uint16_t width, uint16_t height, + uint16_t depth, uint16_t array_size, + enum pipe_format format, + enum mali_texture_type type, + enum mali_texture_layout layout, + unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned nr_samples, + unsigned cube_stride, + unsigned swizzle, + mali_ptr base, + struct panfrost_slice *slices, + struct panfrost_bo *payload); + + +unsigned +panfrost_get_layer_stride(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level); + +unsigned +panfrost_texture_offset(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level, unsigned face, unsigned sample); + +/* Formats */ + +struct panfrost_format { + enum mali_format hw; + unsigned bind; +}; + +extern struct panfrost_format panfrost_pipe_format_table[PIPE_FORMAT_COUNT]; + +bool +panfrost_is_z24s8_variant(enum pipe_format fmt); + +unsigned +panfrost_translate_swizzle_4(const unsigned char swizzle[4]); + +void +panfrost_invert_swizzle(const unsigned char *in, unsigned char *out); + +static inline unsigned +panfrost_get_default_swizzle(unsigned components) +{ + switch (components) { + case 1: + return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_ZERO << 3) | + (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9); + case 2: + return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | + (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9); + case 3: + return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | + (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ONE << 9); + case 4: + return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | + (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9); + default: + unreachable("Invalid number of components"); + } +} + +static inline unsigned +panfrost_bifrost_swizzle(unsigned components) +{ + /* Set all components to 0 and force w if needed */ + return components < 4 ? 
0x10 : 0x00; +} + +enum mali_format +panfrost_format_to_bifrost_blend(const struct util_format_description *desc); + +struct pan_pool; +struct pan_scoreboard; + +void +panfrost_init_blit_shaders(struct panfrost_device *dev); + +void +panfrost_load_midg( + struct pan_pool *pool, + struct pan_scoreboard *scoreboard, + mali_ptr blend_shader, + mali_ptr fbd, + mali_ptr coordinates, unsigned vertex_count, + struct pan_image *image, + unsigned loc); + +#endif diff --git a/src/panfrost/lib/pan_tiler.c b/src/panfrost/lib/pan_tiler.c new file mode 100644 index 00000000000..fc42724a1e5 --- /dev/null +++ b/src/panfrost/lib/pan_tiler.c @@ -0,0 +1,373 @@ +/* + * Copyright (C) 2019 Collabora, Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Alyssa Rosenzweig + */ + +#include "util/u_math.h" +#include "util/macros.h" +#include "pan_encoder.h" + +/* Mali GPUs are tiled-mode renderers, rather than immediate-mode. + * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run. + * Then, a fixed-function hardware block (the tiler) consumes the gl_Position + * results. For each triangle specified, it marks each containing tile as + * containing that triangle. This set of "triangles per tile" form the "polygon + * list". Finally, the rasterization unit consumes the polygon list to invoke + * the fragment shader. + * + * In practice, it's a bit more complicated than this. On Midgard chips with an + * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical + * tile size, but Midgard features "hierarchical tiling", where power-of-two + * multiples of the base tile size can be used: hierarchy level 0 (16x16), + * level 1 (32x32), level 2 (64x64), per public information about Midgard's + * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice + * 128x128 is the largest usually used (though higher modes are enabled). The + * idea behind hierarchical tiling is to use low tiling levels for small + * triangles and high levels for large triangles, to minimize memory bandwidth + * and repeated fragment shader invocations (the former issue inherent to + * immediate-mode rendering and the latter common in traditional tilers). + * + * The tiler itself works by reading varyings in and writing a polygon list + * out. Unfortunately (for us), both of these buffers are managed in main + * memory; although they ideally will be cached, it is the drivers' + * responsibility to allocate these buffers. 
Varying buffer allocation is + * handled elsewhere, as it is not tiler specific; the real issue is allocating + * the polygon list. + * + * This is hard, because from the driver's perspective, we have no information + * about what geometry will actually look like on screen; that information is + * only gained from running the vertex shader. (Theoretically, we could run the + * vertex shaders in software as a prepass, or in hardware with transform + * feedback as a prepass, but either idea is ludicrous on so many levels). + * + * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list + * into three distinct pieces. First, the driver statically determines which + * tile hierarchy levels to use (more on that later). At this point, we know the + * framebuffer dimensions and all the possible tilings of the framebuffer, so + * we know exactly how many tiles exist across all hierarchy levels. The first + * piece of the polygon list is the header, which is exactly 8 bytes per tile, + * plus padding and a small 64-byte prologue. (If that doesn't remind you of + * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is + * the polygon list body, which seems to contain 512 bytes per tile, again + * across every level of the hierarchy. These two parts form the polygon list + * buffer. This buffer has a statically determinable size, approximately equal + * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus + * alignment / minimum restrictions / etc. + * + * The third piece is the easy one (for us): the tiler heap. In essence, the + * tiler heap is a gigantic slab that's as big as could possibly be necessary + * in the worst case imaginable. Just... a gigantic allocation that we give a + * start and end pointer to. What's the catch? The tiler heap is lazily + * allocated; that is, a huge amount of memory is _reserved_, but only a tiny + * bit is actually allocated upfront. The GPU just keeps using the + * unallocated-but-reserved portions as it goes along, generating page faults + * if it goes beyond the allocation, and then the kernel is instructed to + * expand the allocation on page fault (known in the vendor kernel as growable + * memory). This is quite a bit of bookkeeping of its own, but that task is + * pushed to kernel space and we can mostly ignore it here, just remembering to + * set the GROWABLE flag so the kernel actually uses this path rather than + * allocating a gigantic amount up front and burning a hole in RAM. + * + * As far as determining which hierarchy levels to use, the simple answer is + * that right now, we don't. In the tiler configuration fields (consistent from + * the earliest Midgard's SFBD through the latest Bifrost traces we have), + * there is a hierarchy_mask field, controlling which levels (tile sizes) are + * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to + * big tiles and small polygons to small tiles -- would be realized here as + * well. As long as there are polygons at all needing tiling, we always have to + * have big tiles available, in case there are big polygons. But we don't + * necessarily need small tiles available. Ideally, when there are small + * polygons, small tiles are enabled (to avoid waste from putting small + * triangles in the big tiles); when there are not, small tiles are disabled to + * avoid enabling more levels than necessary, which potentially costs in memory + * bandwidth / power / tiler performance. 
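+ *
+ * Concretely (illustrative masks): hierarchy_mask = 0x7 would enable only
+ * the 16x16, 32x32 and 64x64 levels, whereas 0xFF enables all eight levels
+ * from 16x16 through 2048x2048 (see panfrost_choose_hierarchy_mask below).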
+ *
+ * Of course, the driver has to figure this out statically. When tile
+ * hierarchies are actually established, this occurs by the tiler in
+ * fixed-function hardware, after the vertex shaders have run and there is
+ * sufficient information to figure out the size of triangles. The driver has
+ * no such luxury, again barring insane hacks like additionally running the
+ * vertex shaders in software or in hardware via transform feedback. Thus, for
+ * the driver, we need a heuristic approach.
+ *
+ * There are lots of heuristics to guess triangle size statically you could
+ * imagine, but one approach shines as particularly simple-stupid: assume all
+ * on-screen triangles are equal size and spread equidistantly throughout the
+ * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with
+ * it, then we see:
+ *
+ *      Triangle Area = (Screen Area / # of triangles)
+ *                    = (Width * Height) / (# of triangles)
+ *
+ * Or if you prefer, we can also make a third CRAZY assumption that we only draw
+ * right triangles with edges parallel/perpendicular to the sides of the screen
+ * with no overdraw, forming a triangle grid across the screen:
+ *
+ *      |--w--|
+ *       _____   |
+ *      | /| /|  |
+ *      |/_|/_|  h
+ *      | /| /|  |
+ *      |/_|/_|  |
+ *
+ * Then you can use some middle school geometry and algebra to work out the
+ * triangle dimensions. I started working on this, but realised I didn't need
+ * to in order to make my point, but couldn't bear to erase that ASCII art.
+ * Anyway.
+ *
+ * POINT IS, by considering the ratio of screen area and triangle count, we can
+ * estimate the triangle size. For a small size, use small bins; for a large
+ * size, use large bins. Intuitively, this metric makes sense: when there are
+ * few triangles on a large screen, you're probably compositing a UI and
+ * therefore the triangles are large; when there are a lot of triangles on a
+ * small screen, you're probably rendering a 3D mesh and therefore the
+ * triangles are tiny. (Or better said -- there will be tiny triangles, even if
+ * there are also large triangles. There have to be, unless you expect crazy
+ * overdraw. Generally, it's better to allow more small bin sizes than
+ * necessary than to not allow enough.)
+ *
+ * From this heuristic (or whatever), we determine the minimum allowable tile
+ * size, and we use that to decide the hierarchy masking, selecting from the
+ * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice).
+ *
+ * Once we have that mask and the framebuffer dimensions, we can compute the
+ * size of the statically-sized polygon list structures, allocate them, and go!
+ *
+ * -----
+ *
+ * On T720, T820, and T830, there is no support for hierarchical tiling.
+ * Instead, the hardware allows the driver to select the tile size dynamically
+ * on a per-framebuffer basis, including allowing rectangular/non-square tiles.
+ * Rules for tile size selection are as follows:
+ *
+ *  - Dimensions must be powers-of-two.
+ *  - The smallest tile is 16x16.
+ *  - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix)
+ *  - There must be no more than 64 tiles in either dimension.
+ *
+ * Within these constraints, the driver is free to pick a tile size according
+ * to some heuristic, similar to units with an advanced tiling unit.
+ *
+ * To pick a size without any heuristics, we may satisfy the constraints by
+ * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size
+ * constraint, consider:
+ *
+ *      # of tiles < 64
+ *      ceil (fb / tile) < 64
+ *      (fb / tile) <= (64 - 1)
+ *      tile >= fb / (64 - 1)
+ *
+ * Hence we clamp up to align_pot(fb / (64 - 1)), the next power of two at
+ * least fb / 63.
+ *
+ * Extending this to use a selection heuristic is left for future work.
+ *
+ * Once the tile size (w, h) is chosen, we compute the hierarchy "mask":
+ *
+ *      hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16)
+ *
+ * Of course with no hierarchical tiling, this is not a mask; it's just a field
+ * specifying the tile size. But I digress.
+ *
+ * We also compute the polygon list sizes (with framebuffer size W, H) as:
+ *
+ *      full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h)
+ *      offset = 8 * ceil(W / w) * ceil(H / h)
+ *
+ * It further appears necessary to round down offset to the nearest 0x200.
+ * Possibly we would also round down full_size to the nearest 0x200, but
+ * full_size/0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer, so there's
+ * nothing to do.
+ */
+
+/* Hierarchical tiling spans from 16x16 to 4096x4096 tiles */
+
+#define MIN_TILE_SIZE 16
+#define MAX_TILE_SIZE 4096
+
+/* Constants as shifts for easier power-of-two iteration */
+
+#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
+#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)
+
+/* The hierarchy has a 64-byte prologue */
+#define PROLOGUE_SIZE 0x40
+
+/* For each tile (across all hierarchy levels), there is 8 bytes of header */
+#define HEADER_BYTES_PER_TILE 0x8
+
+/* Likewise, each tile per level has 512 bytes of body */
+#define FULL_BYTES_PER_TILE 0x200
+
+/* If the width-x-height framebuffer is divided into tile_size-x-tile_size
+ * tiles, how many tiles are there? Rounding up in each direction. For the
+ * special case of tile_size=16, this aligns with the usual Midgard count.
+ * tile_size must be a power-of-two. Not really repeat code from AFBC/checksum,
+ * because those care about the stride (not just the overall count) and only at
+ * a fixed tile size (not any of a number of power-of-twos) */
+
+static unsigned
+pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height)
+{
+        unsigned aligned_width = ALIGN_POT(width, tile_width);
+        unsigned aligned_height = ALIGN_POT(height, tile_height);
+
+        unsigned tile_count_x = aligned_width / tile_width;
+        unsigned tile_count_y = aligned_height / tile_height;
+
+        return tile_count_x * tile_count_y;
+}
+
+/* Given a hierarchy mask (with the smallest tile sizes masked out), computes
+ * the size of the polygon list header or body, depending on bytes_per_tile.
+ * We iterate the tile sizes (16x16 through 2048x2048). For each enabled tile
+ * size, we figure out how many tiles there are at this hierarchy level and
+ * therefore how many bytes this level is, leaving us with a byte count for
+ * each level. We then just sum up the byte counts across the levels to find a
+ * byte count for all levels.
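+ *
+ * As a worked example with illustrative numbers: at 1920x1080 with only the
+ * 16x16 and 32x32 levels enabled (mask = 0x3), there are
+ * (120 * 68) + (60 * 34) = 10200 tiles in total, so the header takes roughly
+ * 10200 * 8 bytes and the full list roughly 10200 * 512 bytes, plus the
+ * 64-byte prologue and 0x200 alignment.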
*/ + +static unsigned +panfrost_hierarchy_size( + unsigned width, + unsigned height, + unsigned mask, + unsigned bytes_per_tile) +{ + unsigned size = PROLOGUE_SIZE; + + /* Iterate hierarchy levels */ + + for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) { + /* Check if this level is enabled */ + if (!(mask & (1 << b))) + continue; + + /* Shift from a level to a tile size */ + unsigned tile_size = (1 << b) * MIN_TILE_SIZE; + + unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size); + unsigned level_count = bytes_per_tile * tile_count; + + size += level_count; + } + + /* This size will be used as an offset, so ensure it's aligned */ + return ALIGN_POT(size, 0x200); +} + +/* Implement the formula: + * + * 0x200 + bytes_per_tile * ceil(W / w) * ceil(H / h) + * + * rounding down the answer to the nearest 0x200. This is used to compute both + * header and body sizes for GPUs without hierarchical tiling. Essentially, + * computing a single hierarchy level, since there isn't any hierarchy! + */ + +static unsigned +panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile) +{ + /* First, extract the tile dimensions */ + + unsigned tw = (1 << (dim & 0b111)) * 8; + unsigned th = (1 << ((dim & (0b111 << 6)) >> 6)) * 8; + + /* tile_count is ceil(W/w) * ceil(H/h) */ + unsigned raw = pan_tile_count(width, height, tw, th) * bytes_per_tile; + + /* Round down and add offset */ + return 0x200 + ((raw / 0x200) * 0x200); +} + +/* Given a hierarchy mask and a framebuffer size, compute the header size */ + +unsigned +panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) +{ + if (hierarchy) + return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE); + else + return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE); +} + +/* The combined header/body is sized similarly (but it is significantly + * larger), except that it can be empty when the tiler disabled, rather than + * getting clamped to a minimum size. + */ + +unsigned +panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy) +{ + if (hierarchy) + return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE); + else + return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE); +} + +/* On GPUs without hierarchical tiling, we choose a tile size directly and + * stuff it into the field otherwise known as hierarchy mask (not a mask). */ + +static unsigned +panfrost_choose_tile_size( + unsigned width, unsigned height, unsigned vertex_count) +{ + /* Figure out the ideal tile size. Eventually a heuristic should be + * used for this */ + + unsigned best_w = 16; + unsigned best_h = 16; + + /* Clamp so there are less than 64 tiles in each direction */ + + best_w = MAX2(best_w, util_next_power_of_two(width / 63)); + best_h = MAX2(best_h, util_next_power_of_two(height / 63)); + + /* We have our ideal tile size, so encode */ + + unsigned exp_w = util_logbase2(best_w / 16); + unsigned exp_h = util_logbase2(best_h / 16); + + return exp_w | (exp_h << 6); +} + +/* In the future, a heuristic to choose a tiler hierarchy mask would go here. + * At the moment, we just default to 0xFF, which enables all possible hierarchy + * levels. 
Overall this yields good performance but presumably incurs a cost in + * memory bandwidth / power consumption / etc, at least on smaller scenes that + * don't really need all the smaller levels enabled */ + +unsigned +panfrost_choose_hierarchy_mask( + unsigned width, unsigned height, + unsigned vertex_count, bool hierarchy) +{ + /* If there is no geometry, we don't bother enabling anything */ + + if (!vertex_count) + return 0x00; + + if (!hierarchy) + return panfrost_choose_tile_size(width, height, vertex_count); + + /* Otherwise, default everything on. TODO: Proper tests */ + + return 0xFF; +} diff --git a/src/panfrost/lib/pan_util.h b/src/panfrost/lib/pan_util.h new file mode 100644 index 00000000000..5f40fc93633 --- /dev/null +++ b/src/panfrost/lib/pan_util.h @@ -0,0 +1,41 @@ +/************************************************************************** + * + * Copyright 2019 Collabora, Ltd. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef PAN_UTIL_H +#define PAN_UTIL_H + +#define PAN_DBG_MSGS 0x0001 +#define PAN_DBG_TRACE 0x0002 +#define PAN_DBG_DEQP 0x0004 +#define PAN_DBG_AFBC 0x0008 +#define PAN_DBG_SYNC 0x0010 +#define PAN_DBG_PRECOMPILE 0x0020 +#define PAN_DBG_NOFP16 0x0040 +#define PAN_DBG_BIFROST 0x0080 +#define PAN_DBG_GL3 0x0100 + +#endif /* PAN_UTIL_H */ diff --git a/src/panfrost/meson.build b/src/panfrost/meson.build index 80adf9d1cd5..8615bcaf7da 100644 --- a/src/panfrost/meson.build +++ b/src/panfrost/meson.build @@ -24,7 +24,7 @@ inc_panfrost_hw = include_directories([ ]) inc_panfrost = include_directories([ - '.', 'include', 'shared', 'midgard', 'bifrost', 'encoder' + '.', 'include', 'shared', 'midgard', 'bifrost', 'lib' ]) subdir('shared') @@ -32,7 +32,7 @@ subdir('util') subdir('midgard') subdir('bifrost') subdir('pandecode') -subdir('encoder') +subdir('lib') files_bifrost = files( 'bifrost/cmdline.c', @@ -63,7 +63,7 @@ bifrost_compiler = executable( libglsl_standalone, libpanfrost_bifrost, libpanfrost_decode, - libpanfrost_encoder, + libpanfrost_lib, libpanfrost_midgard, # references disassemble_midgard... ], build_by_default : with_tools.contains('panfrost')