driver_panfrost = declare_dependency(
compile_args : compile_args_panfrost,
- link_with : [libpanfrost, libpanfrostwinsys, libpanfrost_shared, libpanfrost_midgard, libpanfrost_bifrost, libpanfrost_decode, libpanfrost_encoder],
+ link_with : [libpanfrost, libpanfrostwinsys, libpanfrost_shared, libpanfrost_midgard, libpanfrost_bifrost, libpanfrost_decode, libpanfrost_lib],
)
#include "pan_blend_shaders.h"
#include "pan_cmdstream.h"
#include "pan_util.h"
-#include "pandecode/decode.h"
+#include "decode.h"
#include "util/pan_lower_framebuffer.h"
struct midgard_tiler_descriptor
#include "util/rounding.h"
#include "pan_util.h"
#include "pan_blending.h"
-#include "pandecode/decode.h"
+#include "decode.h"
#include "panfrost-quirks.h"
/* panfrost_bo_access is here to help us keep track of batch accesses to BOs
#include "pan_resource.h"
#include "pan_util.h"
#include "pan_tiling.h"
-#include "pandecode/decode.h"
+#include "decode.h"
#include "panfrost-quirks.h"
static struct pipe_resource *
#include "pan_resource.h"
#include "pan_public.h"
#include "pan_util.h"
-#include "pandecode/decode.h"
+#include "decode.h"
#include "pan_context.h"
#include "midgard/midgard_compile.h"
*/
#include "bit.h"
-#include "panfrost/pandecode/decode.h"
+#include "panfrost/lib/decode.h"
#include "drm-uapi/panfrost_drm.h"
#include "panfrost/encoder/pan_encoder.h"
+++ /dev/null
-# Copyright © 2018 Rob Clark
-# Copyright © 2019 Collabora
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-libpanfrost_encoder_files = files(
- 'pan_encoder.h',
-
- 'pan_afbc.c',
- 'pan_attributes.c',
- 'pan_bo.c',
- 'pan_blit.c',
- 'pan_format.c',
- 'pan_invocation.c',
- 'pan_sampler.c',
- 'pan_tiler.c',
- 'pan_texture.c',
- 'pan_scoreboard.c',
- 'pan_scratch.c',
- 'pan_pool.c',
- 'pan_props.c',
-)
-
-libpanfrost_encoder = static_library(
- 'panfrost_encoder',
- [libpanfrost_encoder_files],
- include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw],
- c_args : [no_override_init_args],
- gnu_symbol_visibility : 'hidden',
- dependencies: [dep_libdrm, idep_nir],
- build_by_default : false,
-)
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- */
-
-#include "pan_texture.h"
-
-/* Arm FrameBuffer Compression (AFBC) is a lossless compression scheme natively
- * implemented in Mali GPUs (as well as many display controllers paired with
- * Mali GPUs, etc). Where possible, Panfrost prefers to use AFBC for both
- * rendering and texturing. In most cases, this is a performance win due to a
- * dramatic reduction in memory bandwidth and improved cache locality compared
- * to linear resources.
- *
- * AFBC divides the framebuffer into 16x16 tiles (other sizes possible, TODO:
- * do we need to support this?). So, the width and height each must be aligned
- * up to 16 pixels. This is inherently good for performance; note that for a 4
- * byte-per-pixel format like RGBA8888, that means that rows are 16*4=64 byte
- * aligned, which is the cache-line size.
- *
- * For each AFBC-compressed resource, there is a single contiguous
- * (CPU/GPU-shared) buffer. This buffer itself is divided into two parts:
- * header and body, placed immediately after each other.
- *
- * The AFBC header contains 16 bytes of metadata per tile.
- *
- * The AFBC body is the same size as the original linear resource (padded to
- * the nearest tile). Although the body comes immediately after the header, it
- * must also be cache-line aligned, so there can sometimes be a bit of padding
- * between the header and body.
- *
- * As an example, a 64x64 RGBA framebuffer contains 64/16 = 4 tiles horizontally and
- * 4 tiles vertically. There are 4*4=16 tiles in total, each containing 16
- * bytes of metadata, so there is a 16*16=256 byte header. 64x64 is already
- * tile aligned, so the body is 64*64 * 4 bytes per pixel = 16384 bytes of
- * body.
- *
- * From userspace, Panfrost needs to be able to calculate these sizes. It
- * explicitly does not and can not know the format of the data contained within
- * this header and body. The GPU has native support for AFBC encode/decode. For
- * an internal FBO or a framebuffer used for scanout with an AFBC-compatible
- * winsys/display-controller, the buffer is maintained AFBC throughout flight,
- * and the driver never needs to know the internal data. For edge cases where
- * the driver really does need to read/write from the AFBC resource, we
- * generate a linear staging buffer and use the GPU to blit AFBC<--->linear.
- * TODO: Implement me. */
-
-#define AFBC_TILE_WIDTH 16
-#define AFBC_TILE_HEIGHT 16
-#define AFBC_HEADER_BYTES_PER_TILE 16
-#define AFBC_CACHE_ALIGN 64
-
-/* Is it possible to AFBC compress a particular format? Common formats (and
- * YUV) are compressible. Some obscure formats are not and fall back to linear,
- * at a performance hit. Also, if you need to disable AFBC entirely in the
- * driver for debug/profiling, just always return false here. */
-
-bool
-panfrost_format_supports_afbc(enum pipe_format format)
-{
- const struct util_format_description *desc =
- util_format_description(format);
-
- /* sRGB cannot be AFBC, but it can be tiled. TODO: Verify. The blob
- * does not do AFBC for SRGB8_ALPHA8, but it's not clear why it
- * shouldn't be able to. */
-
- if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
- return false;
-
- if (util_format_is_rgba8_variant(desc))
- return true;
-
- /* Only Z24S8 variants are compressible as Z/S */
-
- if (panfrost_is_z24s8_variant(format))
- return true;
-
- /* Lookup special formats */
- switch (format) {
- case PIPE_FORMAT_R8G8B8_UNORM:
- case PIPE_FORMAT_B8G8R8_UNORM:
- case PIPE_FORMAT_R5G6B5_UNORM:
- case PIPE_FORMAT_B5G6R5_UNORM:
- return true;
- default:
- return false;
- }
-}
-
-unsigned
-panfrost_afbc_header_size(unsigned width, unsigned height)
-{
- /* Align to tile */
- unsigned aligned_width = ALIGN_POT(width, AFBC_TILE_WIDTH);
- unsigned aligned_height = ALIGN_POT(height, AFBC_TILE_HEIGHT);
-
- /* Compute size in tiles, rather than pixels */
- unsigned tile_count_x = aligned_width / AFBC_TILE_WIDTH;
- unsigned tile_count_y = aligned_height / AFBC_TILE_HEIGHT;
- unsigned tile_count = tile_count_x * tile_count_y;
-
- /* Multiply to find the header size */
- unsigned header_bytes = tile_count * AFBC_HEADER_BYTES_PER_TILE;
-
- /* Align and go */
- return ALIGN_POT(header_bytes, AFBC_CACHE_ALIGN);
-
-}
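To make the arithmetic above concrete, a minimal sketch (hypothetical helper, using only the definitions in this file) for the 64x64 RGBA8888 example from the comment:

static unsigned
example_afbc_total_size_64x64(void)
{
        /* 4x4 tiles with 16 bytes of metadata each: a 256-byte header,
         * already a multiple of the 64-byte cache line */
        unsigned header = panfrost_afbc_header_size(64, 64);

        /* The body is the tile-aligned linear size: 64 * 64 * 4 bytes */
        unsigned body = 64 * 64 * 4;

        /* Header and body live contiguously in a single buffer */
        return header + body;
}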
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include "util/u_math.h"
-#include "panfrost-job.h"
-#include "pan_encoder.h"
-
-/* This file handles attribute descriptors (mali_attr_meta). The
- * bulk of the complexity is from instancing. See mali_job for
- * notes on how this works. But basically, for small vertex
- * counts, we have a lookup table, and for large vertex counts,
- * we look at the high bits as a heuristic. This has to match
- * exactly how the hardware calculates this (which is why the
- * algorithm is so weird) or else instancing will break. */
-
-/* Given an odd number (of the form 2k + 1), compute k */
-#define ODD(odd) ((odd - 1) >> 1)
-
-static unsigned
-panfrost_small_padded_vertex_count(unsigned idx)
-{
- if (idx == 11 || idx == 13 || idx == 15 || idx == 19)
- return idx + 1;
- else
- return idx;
-}
-
-static unsigned
-panfrost_large_padded_vertex_count(uint32_t vertex_count)
-{
- /* First, we have to find the highest set bit */
- unsigned highest = 32 - __builtin_clz(vertex_count);
-
- /* Using that, we mask out the highest 4-bits */
- unsigned n = highest - 4;
- unsigned nibble = (vertex_count >> n) & 0xF;
-
- /* Great, we have the nibble. Now we can just try possibilities. Note
- * that we don't care about the bottom most bit in most cases, and we
- * know the top bit must be 1 */
-
- unsigned middle_two = (nibble >> 1) & 0x3;
-
- switch (middle_two) {
- case 0b00:
- if (!(nibble & 1))
- return (1 << n) * 9;
- else
- return (1 << (n + 1)) * 5;
- case 0b01:
- return (1 << (n + 2)) * 3;
- case 0b10:
- return (1 << (n + 1)) * 7;
- case 0b11:
- return (1 << (n + 4));
- default:
- return 0; /* unreachable */
- }
-}
-
-unsigned
-panfrost_padded_vertex_count(unsigned vertex_count)
-{
- if (vertex_count < 20)
- return panfrost_small_padded_vertex_count(vertex_count);
- else
- return panfrost_large_padded_vertex_count(vertex_count);
-}
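A minimal sketch of the values these helpers produce (hypothetical function, assuming <assert.h> is available); below 20 only 11, 13, 15 and 19 are bumped, while larger counts are rounded up from the top nibble:

static void
example_padded_vertex_counts(void)
{
        assert(panfrost_padded_vertex_count(11) == 12); /* small-count table */
        assert(panfrost_padded_vertex_count(12) == 12); /* already valid */
        assert(panfrost_padded_vertex_count(19) == 20);
        assert(panfrost_padded_vertex_count(30) == 32); /* nibble 0b1111 -> 1 << 5 */
}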
-
-/* The much, much more irritating case -- instancing is enabled. See
- * panfrost-job.h for notes on how this works */
-
-static unsigned
-panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags)
-{
- /* We have an NPOT divisor. Here's the fun one (multiplying by
- * the inverse and shifting) */
-
- /* floor(log2(d)) */
- unsigned shift = util_logbase2(hw_divisor);
-
- /* m = ceil(2^(32 + shift) / d) */
- uint64_t shift_hi = 32 + shift;
- uint64_t t = 1ll << shift_hi;
- double t_f = t;
- double hw_divisor_d = hw_divisor;
- double m_f = ceil(t_f / hw_divisor_d);
- unsigned m = m_f;
-
- /* Default case */
- uint32_t magic_divisor = m;
-
- /* e = 2^(shift + 32) % d */
- uint64_t e = t % hw_divisor;
-
- /* Apply round-down algorithm? e <= 2^shift? XXX: The blob
- * seems to use a different condition */
- if (e <= (1ll << shift)) {
- magic_divisor = m - 1;
- *extra_flags = 1;
- }
-
- /* Top flag implicitly set */
- assert(magic_divisor & (1u << 31));
- magic_divisor &= ~(1u << 31);
- *o_shift = shift;
-
- return magic_divisor;
-}
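A worked trace of the function above for a divisor of 3 (hypothetical helper, assuming <assert.h>): shift = floor(log2(3)) = 1, m = ceil(2^33 / 3) = 0xAAAAAAAB, and e = 2^33 mod 3 = 2 <= 2^1, so the round-down variant applies:

static void
example_magic_divisor_by_3(void)
{
        unsigned shift = 0, extra_flags = 0;
        unsigned magic = panfrost_compute_magic_divisor(3, &shift, &extra_flags);

        assert(shift == 1);
        assert(extra_flags == 1);    /* round-down variant selected */
        assert(magic == 0x2AAAAAAA); /* 0xAAAAAAAA with the implicit top bit cleared */
}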
-
-unsigned
-panfrost_vertex_instanced(
- unsigned padded_count,
- unsigned instance_shift, unsigned instance_odd,
- unsigned divisor,
- union mali_attr *attrs)
-{
- /* Depending on whether there is an instance divisor, packing varies.
- * When there is a divisor, the hardware-level divisor is actually the
- * product of the instance divisor and the padded count */
-
- unsigned hw_divisor = padded_count * divisor;
-
- if (divisor == 0) {
- /* Per-vertex attributes use the MODULO mode. First, compute
- * the modulus */
-
- attrs->elements |= MALI_ATTR_MODULO;
- attrs->shift = instance_shift;
- attrs->extra_flags = instance_odd;
-
- return 1;
- } else if (util_is_power_of_two_or_zero(hw_divisor)) {
- /* If there is a divisor but the hardware divisor works out to
- * a power of two (not terribly exceptional), we can use an
- * easy path (just shifting) */
-
- attrs->elements |= MALI_ATTR_POT_DIVIDE;
- attrs->shift = __builtin_ctz(hw_divisor);
-
- return 1;
- } else {
- unsigned shift = 0, extra_flags = 0;
-
- attrs[1].magic_divisor =
- panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
-
- /* Upload to two different slots */
-
- attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
- attrs[0].shift = shift;
- attrs[0].extra_flags = extra_flags;
-
- attrs[1].unk = 0x20;
- attrs[1].zero = 0;
- attrs[1].divisor = divisor;
-
- return 2;
- }
-}
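A minimal sketch of the power-of-two path (hypothetical helper, assuming <assert.h>): padded_count = 32 with an instance divisor of 2 gives a hardware divisor of 64, so a single POT_DIVIDE record with shift = 6 is emitted:

static void
example_pot_instancing(union mali_attr *attrs)
{
        unsigned slots = panfrost_vertex_instanced(32, 0, 0, 2, attrs);

        assert(slots == 1);          /* one attribute record consumed */
        assert(attrs[0].shift == 6); /* ctz(32 * 2) */
}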
-
-/* Records for gl_VertexID and gl_InstanceID use a slightly special encoding,
- * but the idea is the same */
-
-void
-panfrost_vertex_id(
- unsigned padded_count,
- union mali_attr *attr)
-{
- /* We factor the padded count as shift/odd and that's it */
-
- attr->elements = MALI_ATTR_VERTEXID;
- attr->shift = __builtin_ctz(padded_count);
- attr->extra_flags = padded_count >> (attr->shift + 1);
- attr->stride = attr->size = 0;
-}
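A small sketch of the shift/odd factoring (hypothetical helper, assuming <assert.h>): padded_count = 12 = 3 << 2 factors as shift = 2 with extra_flags = ODD(3) = 1:

static void
example_vertex_id_factoring(void)
{
        union mali_attr attr = { 0 };

        panfrost_vertex_id(12, &attr);
        assert(attr.shift == 2);       /* ctz(12) */
        assert(attr.extra_flags == 1); /* 12 >> (2 + 1) */
}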
-
-void
-panfrost_instance_id(
- unsigned padded_count,
- union mali_attr *attr)
-{
- attr->elements = MALI_ATTR_INSTANCEID;
- attr->stride = 0;
- attr->extra_flags = 0;
- attr->size = 0;
-
- /* POT records have just a shift directly with an off-by-one for
- * unclear reasons. NPOT records have a magic divisor smushed into the
- * stride field (which is unused for these special records) */
-
- if (util_is_power_of_two_or_zero(padded_count)) {
- attr->shift = __builtin_ctz(padded_count) - 1;
- } else {
- unsigned shift = 0, flags = 0;
-
- attr->stride = panfrost_compute_magic_divisor(padded_count, &shift, &flags);
- attr->shift = shift;
- attr->extra_flags = flags;
- }
-}
-
+++ /dev/null
-/*
- * Copyright (C) 2020 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- */
-
-#include <math.h>
-#include <stdio.h>
-#include "pan_encoder.h"
-#include "pan_pool.h"
-#include "pan_scoreboard.h"
-#include "pan_texture.h"
-#include "panfrost-quirks.h"
-#include "../midgard/midgard_compile.h"
-#include "compiler/nir/nir_builder.h"
-#include "util/u_math.h"
-
-/* On Midgard, the native blit infrastructure (via MFBD preloads) is broken or
- * missing in many cases. We instead use software paths as fallbacks to
- * implement blits, which are done as TILER jobs. No vertex shader is
- * necessary since we can supply screen-space coordinates directly.
- *
- * This is primarily designed as a fallback for preloads but could be extended
- * for other clears/blits if needed in the future. */
-
-static void
-panfrost_build_blit_shader(panfrost_program *program, unsigned gpu_id, gl_frag_result loc, nir_alu_type T, bool ms)
-{
- bool is_colour = loc >= FRAG_RESULT_DATA0;
-
- nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_FRAGMENT, &midgard_nir_options, NULL);
- nir_function *fn = nir_function_create(shader, "main");
- nir_function_impl *impl = nir_function_impl_create(fn);
-
- nir_variable *c_src = nir_variable_create(shader, nir_var_shader_in, glsl_vector_type(GLSL_TYPE_FLOAT, 2), "coord");
- nir_variable *c_out = nir_variable_create(shader, nir_var_shader_out, glsl_vector_type(
- GLSL_TYPE_FLOAT, is_colour ? 4 : 1), "out");
-
- c_src->data.location = VARYING_SLOT_TEX0;
- c_out->data.location = loc;
-
- nir_builder _b;
- nir_builder *b = &_b;
- nir_builder_init(b, impl);
- b->cursor = nir_before_block(nir_start_block(impl));
-
- nir_ssa_def *coord = nir_load_var(b, c_src);
-
- nir_tex_instr *tex = nir_tex_instr_create(shader, ms ? 3 : 1);
-
- tex->dest_type = T;
-
- if (ms) {
- tex->src[0].src_type = nir_tex_src_coord;
- tex->src[0].src = nir_src_for_ssa(nir_f2i32(b, coord));
- tex->coord_components = 2;
-
- tex->src[1].src_type = nir_tex_src_ms_index;
- tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b));
-
- tex->src[2].src_type = nir_tex_src_lod;
- tex->src[2].src = nir_src_for_ssa(nir_imm_int(b, 0));
- tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
- tex->op = nir_texop_txf_ms;
- } else {
- tex->op = nir_texop_tex;
-
- tex->src[0].src_type = nir_tex_src_coord;
- tex->src[0].src = nir_src_for_ssa(coord);
- tex->coord_components = 2;
-
- tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
- }
-
- nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
- nir_builder_instr_insert(b, &tex->instr);
-
- if (is_colour)
- nir_store_var(b, c_out, &tex->dest.ssa, 0xFF);
- else
- nir_store_var(b, c_out, nir_channel(b, &tex->dest.ssa, 0), 0xFF);
-
- midgard_compile_shader_nir(shader, program, false, 0, gpu_id, false, true);
-}
-
-/* Compile and upload all possible blit shaders ahead-of-time to reduce draw
- * time overhead. There are only ~30 of them at the moment, so this is fine */
-
-void
-panfrost_init_blit_shaders(struct panfrost_device *dev)
-{
- static const struct {
- gl_frag_result loc;
- unsigned types;
- } shader_descs[] = {
- { FRAG_RESULT_DEPTH, 1 << PAN_BLIT_FLOAT },
- { FRAG_RESULT_STENCIL, 1 << PAN_BLIT_UINT },
- { FRAG_RESULT_DATA0, ~0 },
- { FRAG_RESULT_DATA1, ~0 },
- { FRAG_RESULT_DATA2, ~0 },
- { FRAG_RESULT_DATA3, ~0 },
- { FRAG_RESULT_DATA4, ~0 },
- { FRAG_RESULT_DATA5, ~0 },
- { FRAG_RESULT_DATA6, ~0 },
- { FRAG_RESULT_DATA7, ~0 }
- };
-
- nir_alu_type nir_types[PAN_BLIT_NUM_TYPES] = {
- nir_type_float,
- nir_type_uint,
- nir_type_int
- };
-
- /* Total size = # of shaders * bytes per shader. There are
- * shaders for each RT (so up to DATA7 -- overestimate is
- * okay) and up to NUM_TYPES variants of each, * 2 for multisampling
- * variants. These shaders are simple enough that they should be less
- * than 8 quadwords each (again, overestimate is fine). */
-
- unsigned offset = 0;
- unsigned total_size = (FRAG_RESULT_DATA7 * PAN_BLIT_NUM_TYPES)
- * (8 * 16) * 2;
-
- dev->blit_shaders.bo = panfrost_bo_create(dev, total_size, PAN_BO_EXECUTE);
-
- /* Don't bother generating multisampling variants if we don't actually
- * support multisampling */
- bool has_ms = !(dev->quirks & MIDGARD_SFBD);
-
- for (unsigned ms = 0; ms <= has_ms; ++ms) {
- for (unsigned i = 0; i < ARRAY_SIZE(shader_descs); ++i) {
- unsigned loc = shader_descs[i].loc;
-
- for (enum pan_blit_type T = 0; T < PAN_BLIT_NUM_TYPES; ++T) {
- if (!(shader_descs[i].types & (1 << T)))
- continue;
-
- panfrost_program program;
- panfrost_build_blit_shader(&program, dev->gpu_id, loc,
- nir_types[T], ms);
-
- assert(offset + program.compiled.size < total_size);
- memcpy(dev->blit_shaders.bo->cpu + offset, program.compiled.data, program.compiled.size);
-
- dev->blit_shaders.loads[loc][T][ms] = (dev->blit_shaders.bo->gpu + offset) | program.first_tag;
- offset += ALIGN_POT(program.compiled.size, 64);
- util_dynarray_fini(&program.compiled);
- }
- }
- }
-}
-
-/* Add a shader-based load on Midgard (draw-time for GL). Shaders are
- * precached */
-
-void
-panfrost_load_midg(
- struct pan_pool *pool,
- struct pan_scoreboard *scoreboard,
- mali_ptr blend_shader,
- mali_ptr fbd,
- mali_ptr coordinates, unsigned vertex_count,
- struct pan_image *image,
- unsigned loc)
-{
- unsigned width = u_minify(image->width0, image->first_level);
- unsigned height = u_minify(image->height0, image->first_level);
-
- struct mali_viewport viewport = {
- .clip_minx = -INFINITY,
- .clip_miny = -INFINITY,
- .clip_maxx = INFINITY,
- .clip_maxy = INFINITY,
- .clip_minz = 0.0,
- .clip_maxz = 1.0,
-
- .viewport0 = { 0, 0 },
- .viewport1 = { MALI_POSITIVE(width), MALI_POSITIVE(height) }
- };
-
- union mali_attr varying = {
- .elements = coordinates | MALI_ATTR_LINEAR,
- .stride = 4 * sizeof(float),
- .size = 4 * sizeof(float) * vertex_count,
- };
-
- struct mali_attr_meta varying_meta = {
- .index = 0,
- .unknown1 = 2,
- .swizzle = (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3),
- .format = MALI_RGBA32F
- };
-
- struct mali_stencil_test stencil = {
- .mask = 0xFF,
- .func = MALI_FUNC_ALWAYS,
- .sfail = MALI_STENCIL_REPLACE,
- .dpfail = MALI_STENCIL_REPLACE,
- .dppass = MALI_STENCIL_REPLACE,
- };
-
- union midgard_blend replace = {
- .equation = {
- .rgb_mode = 0x122,
- .alpha_mode = 0x122,
- .color_mask = MALI_MASK_R | MALI_MASK_G | MALI_MASK_B | MALI_MASK_A,
- }
- };
-
- if (blend_shader)
- replace.shader = blend_shader;
-
- /* Determine the sampler type needed. Stencil is always sampled as
- * UINT. Pure (U)INT is always (U)INT. Everything else is FLOAT. */
-
- enum pan_blit_type T =
- (loc == FRAG_RESULT_STENCIL) ? PAN_BLIT_UINT :
- (util_format_is_pure_uint(image->format)) ? PAN_BLIT_UINT :
- (util_format_is_pure_sint(image->format)) ? PAN_BLIT_INT :
- PAN_BLIT_FLOAT;
-
- bool ms = image->nr_samples > 1;
-
- struct mali_shader_meta shader_meta = {
- .shader = pool->dev->blit_shaders.loads[loc][T][ms],
- .sampler_count = 1,
- .texture_count = 1,
- .varying_count = 1,
- .midgard1 = {
- .flags_lo = 0x20,
- .work_count = 4,
- },
- .coverage_mask = 0xF,
- .unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10,
- .unknown2_4 = 0x4e0,
- .stencil_mask_front = ~0,
- .stencil_mask_back = ~0,
- .stencil_front = stencil,
- .stencil_back = stencil,
- .blend = {
- .shader = blend_shader
- }
- };
-
- if (ms)
- shader_meta.unknown2_3 |= MALI_HAS_MSAA | MALI_PER_SAMPLE;
- else
- shader_meta.unknown2_4 |= MALI_NO_MSAA;
-
- assert(shader_meta.shader);
-
- if (pool->dev->quirks & MIDGARD_SFBD) {
- shader_meta.unknown2_4 |= (0x10 | MALI_NO_DITHER);
- shader_meta.blend = replace;
-
- if (loc < FRAG_RESULT_DATA0)
- shader_meta.blend.equation.color_mask = 0x0;
- }
-
- if (loc == FRAG_RESULT_DEPTH) {
- shader_meta.midgard1.flags_lo |= MALI_WRITES_Z;
- shader_meta.unknown2_3 |= MALI_DEPTH_WRITEMASK;
- } else if (loc == FRAG_RESULT_STENCIL) {
- shader_meta.midgard1.flags_hi |= MALI_WRITES_S;
- shader_meta.unknown2_4 |= MALI_STENCIL_TEST;
- } else {
- shader_meta.midgard1.flags_lo |= MALI_EARLY_Z;
- }
-
- /* Create the texture descriptor. We partially compute the base address
- * ourselves to account for layer, such that the texture descriptor
- * itself is for a 2D texture with array size 1 even for 3D/array
- * textures, removing the need to separately key the blit shaders for
- * 2D and 3D variants */
-
- struct panfrost_transfer texture_t = panfrost_pool_alloc(pool, sizeof(struct mali_texture_descriptor) + sizeof(mali_ptr) * 2 * MAX2(image->nr_samples, 1));
-
- panfrost_new_texture(texture_t.cpu,
- image->width0, image->height0,
- MAX2(image->nr_samples, 1), 1,
- image->format, MALI_TEX_2D,
- image->layout,
- image->first_level, image->last_level,
- 0, 0,
- image->nr_samples,
- 0,
- (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9),
- image->bo->gpu + image->first_layer *
- panfrost_get_layer_stride(image->slices,
- image->type == MALI_TEX_3D,
- image->cubemap_stride, image->first_level),
- image->slices);
-
- struct mali_sampler_descriptor sampler = {
- .filter_mode = MALI_SAMP_MAG_NEAREST | MALI_SAMP_MIN_NEAREST,
- .wrap_s = MALI_WRAP_CLAMP_TO_EDGE,
- .wrap_t = MALI_WRAP_CLAMP_TO_EDGE,
- .wrap_r = MALI_WRAP_CLAMP_TO_EDGE,
- };
-
- struct panfrost_transfer shader_meta_t = panfrost_pool_alloc(pool, sizeof(shader_meta) + 8 * sizeof(struct midgard_blend_rt));
- memcpy(shader_meta_t.cpu, &shader_meta, sizeof(shader_meta));
-
- for (unsigned i = 0; i < 8; ++i) {
- void *dest = shader_meta_t.cpu + sizeof(shader_meta) + sizeof(struct midgard_blend_rt) * i;
-
- if (loc == (FRAG_RESULT_DATA0 + i)) {
- struct midgard_blend_rt blend_rt = {
- .flags = 0x200 | MALI_BLEND_NO_DITHER,
- .blend = replace,
- };
-
- if (util_format_is_srgb(image->format))
- blend_rt.flags |= MALI_BLEND_SRGB;
-
- if (blend_shader) {
- blend_rt.flags |= MALI_BLEND_MRT_SHADER;
- blend_rt.blend.shader = blend_shader;
- }
-
- memcpy(dest, &blend_rt, sizeof(struct midgard_blend_rt));
- } else {
- memset(dest, 0x0, sizeof(struct midgard_blend_rt));
- }
- }
-
- struct midgard_payload_vertex_tiler payload = {
- .prefix = {
- .draw_mode = MALI_TRIANGLES,
- .unknown_draw = 0x3000,
- .index_count = MALI_POSITIVE(vertex_count)
- },
- .postfix = {
- .gl_enables = 0x7,
- .position_varying = coordinates,
- .textures = panfrost_pool_upload(pool, &texture_t.gpu, sizeof(texture_t.gpu)),
- .sampler_descriptor = panfrost_pool_upload(pool, &sampler, sizeof(sampler)),
- .shader = shader_meta_t.gpu,
- .varyings = panfrost_pool_upload(pool, &varying, sizeof(varying)),
- .varying_meta = panfrost_pool_upload(pool, &varying_meta, sizeof(varying_meta)),
- .viewport = panfrost_pool_upload(pool, &viewport, sizeof(viewport)),
- .shared_memory = fbd
- }
- };
-
- panfrost_pack_work_groups_compute(&payload.prefix, 1, vertex_count, 1, 1, 1, 1, true);
- payload.prefix.workgroups_x_shift_3 = 6;
-
- panfrost_new_job(pool, scoreboard, JOB_TYPE_TILER, false, 0, &payload, sizeof(payload), true);
-}
+++ /dev/null
-/*
- * Copyright 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors (Collabora):
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- */
-#include <errno.h>
-#include <stdio.h>
-#include <fcntl.h>
-#include <xf86drm.h>
-#include <pthread.h>
-#include "drm-uapi/panfrost_drm.h"
-
-#include "pan_bo.h"
-#include "pan_util.h"
-#include "../pandecode/public.h"
-
-#include "os/os_mman.h"
-
-#include "util/u_inlines.h"
-#include "util/u_math.h"
-
-/* This file implements a userspace BO cache. Allocating and freeing
- * GPU-visible buffers is very expensive, and even the extra kernel roundtrips
- * add more work than we would like at this point. So caching BOs in userspace
- * solves both of these problems and does not require kernel updates.
- *
- * Cached BOs are sorted into a bucket based on rounding their size down to the
- * nearest power-of-two. Each bucket contains a linked list of free panfrost_bo
- * objects. Putting a BO into the cache is accomplished by adding it to the
- * corresponding bucket. Getting a BO from the cache consists of finding the
- * appropriate bucket and scanning it for a suitable BO. A cache eviction
- * consists of freeing a BO at the kernel level and removing it from the
- * bucket. We special-case evicting all BOs from the cache, since that's
- * what's helpful in practice and avoids extra logic
- * around the linked list.
- */
-
-static struct panfrost_bo *
-panfrost_bo_alloc(struct panfrost_device *dev, size_t size,
- uint32_t flags)
-{
- struct drm_panfrost_create_bo create_bo = { .size = size };
- struct panfrost_bo *bo;
- int ret;
-
- if (dev->kernel_version->version_major > 1 ||
- dev->kernel_version->version_minor >= 1) {
- if (flags & PAN_BO_GROWABLE)
- create_bo.flags |= PANFROST_BO_HEAP;
- if (!(flags & PAN_BO_EXECUTE))
- create_bo.flags |= PANFROST_BO_NOEXEC;
- }
-
- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo);
- if (ret) {
- fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n");
- return NULL;
- }
-
- bo = pan_lookup_bo(dev, create_bo.handle);
- assert(!memcmp(bo, &((struct panfrost_bo){}), sizeof(*bo)));
-
- bo->size = create_bo.size;
- bo->gpu = create_bo.offset;
- bo->gem_handle = create_bo.handle;
- bo->flags = flags;
- bo->dev = dev;
- return bo;
-}
-
-static void
-panfrost_bo_free(struct panfrost_bo *bo)
-{
- struct drm_gem_close gem_close = { .handle = bo->gem_handle };
- int ret;
-
- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
- if (ret) {
- fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n");
- assert(0);
- }
-
- /* BO will be freed with the sparse array, but zero to indicate free */
- memset(bo, 0, sizeof(*bo));
-}
-
-/* Returns true if the BO is ready, false otherwise.
- * Waiting is always done for pending writers; if wait_readers is set,
- * pending readers are waited for as well.
- */
-bool
-panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers)
-{
- struct drm_panfrost_wait_bo req = {
- .handle = bo->gem_handle,
- .timeout_ns = timeout_ns,
- };
- int ret;
-
- /* If the BO has been exported or imported we can't rely on the cached
- * state, we need to call the WAIT_BO ioctl.
- */
- if (!(bo->flags & PAN_BO_SHARED)) {
- /* If ->gpu_access is 0, the BO is idle, no need to wait. */
- if (!bo->gpu_access)
- return true;
-
- /* If the caller only wants to wait for writers and no
- * writes are pending, we don't have to wait.
- */
- if (!wait_readers && !(bo->gpu_access & PAN_BO_ACCESS_WRITE))
- return true;
- }
-
- /* The ioctl returns a value >= 0 when the BO we are waiting for is
- * ready, and -1 otherwise.
- */
- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req);
- if (ret != -1) {
- /* Set gpu_access to 0 so that the next call to bo_wait()
- * doesn't have to call the WAIT_BO ioctl.
- */
- bo->gpu_access = 0;
- return true;
- }
-
- /* If errno is not ETIMEDOUT or EBUSY that means the handle we passed
- * is invalid, which shouldn't happen here.
- */
- assert(errno == ETIMEDOUT || errno == EBUSY);
- return false;
-}
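A usage sketch (hypothetical helper): a non-blocking poll that any pending GPU writes to the BO have completed, ignoring readers:

static bool
example_bo_write_idle(struct panfrost_bo *bo)
{
        /* A timeout of 0 makes this a poll; wait_readers = false */
        return panfrost_bo_wait(bo, 0, false);
}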
-
-/* Helper to calculate the bucket index of a BO */
-
-static unsigned
-pan_bucket_index(unsigned size)
-{
- /* Round down to POT to compute a bucket index */
-
- unsigned bucket_index = util_logbase2(size);
-
- /* Clamp the bucket index; all huge allocations will be
- * sorted into the largest bucket */
-
- bucket_index = MIN2(bucket_index, MAX_BO_CACHE_BUCKET);
-
- /* The minimum bucket size must equal the minimum allocation
- * size; the maximum was clamped above */
-
- assert(bucket_index >= MIN_BO_CACHE_BUCKET);
- assert(bucket_index <= MAX_BO_CACHE_BUCKET);
-
- /* Reindex from 0 */
- return (bucket_index - MIN_BO_CACHE_BUCKET);
-}
-
-static struct list_head *
-pan_bucket(struct panfrost_device *dev, unsigned size)
-{
- return &dev->bo_cache.buckets[pan_bucket_index(size)];
-}
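A minimal sketch of the bucketing (hypothetical helper, assuming <assert.h>): sizes round down to a power-of-two bucket, and anything huge clamps to the 4 MB bucket:

static void
example_bucket_indices(void)
{
        assert(pan_bucket_index(4096) == 0);              /* 2^12 bucket */
        assert(pan_bucket_index(12000) == 1);             /* 2^13 (8 KB) bucket */
        assert(pan_bucket_index(64 * 1024 * 1024) == 10); /* clamped to 2^22 */
}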
-
-/* Tries to fetch a BO of sufficient size with the appropriate flags from the
- * BO cache. If it succeeds, it returns that BO and removes the BO from the
- * cache. If it fails, it returns NULL signaling the caller to allocate a new
- * BO. */
-
-static struct panfrost_bo *
-panfrost_bo_cache_fetch(struct panfrost_device *dev,
- size_t size, uint32_t flags, bool dontwait)
-{
- pthread_mutex_lock(&dev->bo_cache.lock);
- struct list_head *bucket = pan_bucket(dev, size);
- struct panfrost_bo *bo = NULL;
-
- /* Iterate the bucket looking for something suitable */
- list_for_each_entry_safe(struct panfrost_bo, entry, bucket,
- bucket_link) {
- if (entry->size < size || entry->flags != flags)
- continue;
-
- if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX,
- PAN_BO_ACCESS_RW))
- continue;
-
- struct drm_panfrost_madvise madv = {
- .handle = entry->gem_handle,
- .madv = PANFROST_MADV_WILLNEED,
- };
- int ret;
-
- /* This one works, splice it out of the cache */
- list_del(&entry->bucket_link);
- list_del(&entry->lru_link);
-
- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv);
- if (!ret && !madv.retained) {
- panfrost_bo_free(entry);
- continue;
- }
- /* Let's go! */
- bo = entry;
- break;
- }
- pthread_mutex_unlock(&dev->bo_cache.lock);
-
- return bo;
-}
-
-static void
-panfrost_bo_cache_evict_stale_bos(struct panfrost_device *dev)
-{
- struct timespec time;
-
- clock_gettime(CLOCK_MONOTONIC, &time);
- list_for_each_entry_safe(struct panfrost_bo, entry,
- &dev->bo_cache.lru, lru_link) {
- /* We want all entries last used more than 1 second ago to be
- * dropped; others can be kept.
- * Note the <= 2 check and not <= 1. It's here to account for
- * the fact that we're only testing ->tv_sec, not ->tv_nsec.
- * That means we might keep entries that are between 1 and 2
- * seconds old, but we don't really care, as long as unused BOs
- * are dropped at some point.
- */
- if (time.tv_sec - entry->last_used <= 2)
- break;
-
- list_del(&entry->bucket_link);
- list_del(&entry->lru_link);
- panfrost_bo_free(entry);
- }
-}
-
-/* Tries to add a BO to the cache. Returns whether it was
- * successful */
-
-static bool
-panfrost_bo_cache_put(struct panfrost_bo *bo)
-{
- struct panfrost_device *dev = bo->dev;
-
- if (bo->flags & PAN_BO_SHARED)
- return false;
-
- pthread_mutex_lock(&dev->bo_cache.lock);
- struct list_head *bucket = pan_bucket(dev, MAX2(bo->size, 4096));
- struct drm_panfrost_madvise madv;
- struct timespec time;
-
- madv.handle = bo->gem_handle;
- madv.madv = PANFROST_MADV_DONTNEED;
- madv.retained = 0;
-
- drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv);
-
- /* Add us to the bucket */
- list_addtail(&bo->bucket_link, bucket);
-
- /* Add us to the LRU list and update the last_used field. */
- list_addtail(&bo->lru_link, &dev->bo_cache.lru);
- clock_gettime(CLOCK_MONOTONIC, &time);
- bo->last_used = time.tv_sec;
-
- /* Let's do some cleanup in the BO cache while we hold the
- * lock.
- */
- panfrost_bo_cache_evict_stale_bos(dev);
- pthread_mutex_unlock(&dev->bo_cache.lock);
-
- return true;
-}
-
-/* Evicts all BOs from the cache. Called during context
- * destroy or during low-memory situations (to free up
- * memory that may be unused by us just sitting in our
- * cache, but still reserved from the perspective of the
- * OS) */
-
-void
-panfrost_bo_cache_evict_all(
- struct panfrost_device *dev)
-{
- pthread_mutex_lock(&dev->bo_cache.lock);
- for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) {
- struct list_head *bucket = &dev->bo_cache.buckets[i];
-
- list_for_each_entry_safe(struct panfrost_bo, entry, bucket,
- bucket_link) {
- list_del(&entry->bucket_link);
- list_del(&entry->lru_link);
- panfrost_bo_free(entry);
- }
- }
- pthread_mutex_unlock(&dev->bo_cache.lock);
-}
-
-void
-panfrost_bo_mmap(struct panfrost_bo *bo)
-{
- struct drm_panfrost_mmap_bo mmap_bo = { .handle = bo->gem_handle };
- int ret;
-
- if (bo->cpu)
- return;
-
- ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo);
- if (ret) {
- fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n");
- assert(0);
- }
-
- bo->cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
- bo->dev->fd, mmap_bo.offset);
- if (bo->cpu == MAP_FAILED) {
- fprintf(stderr, "mmap failed: %p %m\n", bo->cpu);
- assert(0);
- }
-}
-
-static void
-panfrost_bo_munmap(struct panfrost_bo *bo)
-{
- if (!bo->cpu)
- return;
-
- if (os_munmap((void *) (uintptr_t)bo->cpu, bo->size)) {
- perror("munmap");
- abort();
- }
-
- bo->cpu = NULL;
-}
-
-struct panfrost_bo *
-panfrost_bo_create(struct panfrost_device *dev, size_t size,
- uint32_t flags)
-{
- struct panfrost_bo *bo;
-
- /* Kernel will fail (confusingly) with EPERM otherwise */
- assert(size > 0);
-
- /* To maximize BO cache usage, don't allocate tiny BOs */
- size = MAX2(size, 4096);
-
- /* GROWABLE BOs cannot be mmapped */
- if (flags & PAN_BO_GROWABLE)
- assert(flags & PAN_BO_INVISIBLE);
-
- /* Before creating a BO, we first want to check the cache but without
- * waiting for BO readiness (BOs in the cache can still be referenced
- * by jobs that are not finished yet).
- * If the cached allocation fails we fall back on fresh BO allocation,
- * and if that fails too, we try one more time to allocate from the
- * cache, but this time we are willing to wait.
- */
- bo = panfrost_bo_cache_fetch(dev, size, flags, true);
- if (!bo)
- bo = panfrost_bo_alloc(dev, size, flags);
- if (!bo)
- bo = panfrost_bo_cache_fetch(dev, size, flags, false);
-
- if (!bo)
- fprintf(stderr, "BO creation failed\n");
-
- assert(bo);
-
- /* Only mmap now if we know we need to. For CPU-invisible buffers, we
- * never map since we don't care about their contents; they're purely
- * for GPU-internal use. But we do trace them anyway. */
-
- if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP)))
- panfrost_bo_mmap(bo);
-
- p_atomic_set(&bo->refcnt, 1);
-
- if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
- if (flags & PAN_BO_INVISIBLE)
- pandecode_inject_mmap(bo->gpu, NULL, bo->size, NULL);
- else if (!(flags & PAN_BO_DELAY_MMAP))
- pandecode_inject_mmap(bo->gpu, bo->cpu, bo->size, NULL);
- }
-
- return bo;
-}
-
-void
-panfrost_bo_reference(struct panfrost_bo *bo)
-{
- if (bo) {
- ASSERTED int count = p_atomic_inc_return(&bo->refcnt);
- assert(count != 1);
- }
-}
-
-void
-panfrost_bo_unreference(struct panfrost_bo *bo)
-{
- if (!bo)
- return;
-
- /* Don't return to cache if there are still references */
- if (p_atomic_dec_return(&bo->refcnt))
- return;
-
- struct panfrost_device *dev = bo->dev;
-
- pthread_mutex_lock(&dev->bo_map_lock);
-
- /* Someone might have imported this BO while we were waiting for the
- * lock, let's make sure it's still not referenced before freeing it.
- */
- if (p_atomic_read(&bo->refcnt) == 0) {
- /* When the reference count goes to zero, we need to cleanup */
- panfrost_bo_munmap(bo);
-
- /* Rather than freeing the BO now, we'll cache the BO for later
- * allocations if we're allowed to.
- */
- if (!panfrost_bo_cache_put(bo))
- panfrost_bo_free(bo);
-
- }
- pthread_mutex_unlock(&dev->bo_map_lock);
-}
-
-struct panfrost_bo *
-panfrost_bo_import(struct panfrost_device *dev, int fd)
-{
- struct panfrost_bo *bo;
- struct drm_panfrost_get_bo_offset get_bo_offset = {0,};
- ASSERTED int ret;
- unsigned gem_handle;
-
- ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle);
- assert(!ret);
-
- pthread_mutex_lock(&dev->bo_map_lock);
- bo = pan_lookup_bo(dev, gem_handle);
-
- if (!bo->dev) {
- get_bo_offset.handle = gem_handle;
- ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset);
- assert(!ret);
-
- bo->dev = dev;
- bo->gpu = (mali_ptr) get_bo_offset.offset;
- bo->size = lseek(fd, 0, SEEK_END);
- bo->flags = PAN_BO_SHARED;
- bo->gem_handle = gem_handle;
- assert(bo->size > 0);
- p_atomic_set(&bo->refcnt, 1);
- // TODO map and unmap on demand?
- panfrost_bo_mmap(bo);
- } else {
- /* bo->refcnt == 0 can happen if the BO
- * was being released but panfrost_bo_import() acquired the
- * lock before panfrost_bo_unreference(). In that case, refcnt
- * is 0 and we can't use panfrost_bo_reference() directly; we
- * have to re-initialize the refcnt.
- * Note that panfrost_bo_unreference() checks
- * refcnt value just after acquiring the lock to
- * make sure the object is not freed if panfrost_bo_import()
- * acquired it in the meantime.
- */
- if (p_atomic_read(&bo->refcnt) == 0)
- p_atomic_set(&bo->refcnt, 1);
- else
- panfrost_bo_reference(bo);
- assert(bo->cpu);
- }
- pthread_mutex_unlock(&dev->bo_map_lock);
-
- return bo;
-}
-
-int
-panfrost_bo_export(struct panfrost_bo *bo)
-{
- struct drm_prime_handle args = {
- .handle = bo->gem_handle,
- .flags = DRM_CLOEXEC,
- };
-
- int ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args);
- if (ret == -1)
- return -1;
-
- bo->flags |= PAN_BO_SHARED;
- return args.fd;
-}
-
+++ /dev/null
-/*
- * © Copyright 2019 Alyssa Rosenzweig
- * © Copyright 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef __PAN_BO_H__
-#define __PAN_BO_H__
-
-#include "util/list.h"
-#include "pan_device.h"
-#include <time.h>
-
-/* Flags for allocated memory */
-
-/* This memory region is executable */
-#define PAN_BO_EXECUTE (1 << 0)
-
-/* This memory region should be lazily allocated and grow-on-page-fault. Must
- * be used in conjunction with INVISIBLE */
-#define PAN_BO_GROWABLE (1 << 1)
-
-/* This memory region should not be mapped to the CPU */
-#define PAN_BO_INVISIBLE (1 << 2)
-
-/* This region may not be used immediately and will not be mmapped on
- * allocation (semantically distinct from INVISIBLE, which can never be
- * mmapped) */
-#define PAN_BO_DELAY_MMAP (1 << 3)
-
-/* BO is shared across processes (imported or exported) and therefore cannot be
- * cached locally */
-#define PAN_BO_SHARED (1 << 4)
-
-/* GPU access flags */
-
-/* BO is either shared (can be accessed by more than one GPU batch) or private
- * (reserved by a specific GPU job). */
-#define PAN_BO_ACCESS_PRIVATE (0 << 0)
-#define PAN_BO_ACCESS_SHARED (1 << 0)
-
-/* BO is being read/written by the GPU */
-#define PAN_BO_ACCESS_READ (1 << 1)
-#define PAN_BO_ACCESS_WRITE (1 << 2)
-#define PAN_BO_ACCESS_RW (PAN_BO_ACCESS_READ | PAN_BO_ACCESS_WRITE)
-
-/* BO is accessed by the vertex/tiler job. */
-#define PAN_BO_ACCESS_VERTEX_TILER (1 << 3)
-
-/* BO is accessed by the fragment job. */
-#define PAN_BO_ACCESS_FRAGMENT (1 << 4)
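As a hedged illustration of how these flags combine (hypothetical macro names): a lazily-backed heap must pair GROWABLE with INVISIBLE, and a batch reading a BO from its vertex/tiler job might record a SHARED | READ | VERTEX_TILER access:

/* A lazily-backed GPU heap: GROWABLE must be paired with INVISIBLE */
#define EXAMPLE_HEAP_BO_FLAGS  (PAN_BO_GROWABLE | PAN_BO_INVISIBLE)

/* Illustrative access mask for a BO read by the vertex/tiler job */
#define EXAMPLE_READ_VT_ACCESS (PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ | \
                                PAN_BO_ACCESS_VERTEX_TILER)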
-
-struct panfrost_bo {
- /* Must be first for casting */
- struct list_head bucket_link;
-
- /* Used to link the BO to the BO cache LRU list. */
- struct list_head lru_link;
-
- /* Store the time this BO was last used, so the BO cache logic can evict
- * stale BOs.
- */
- time_t last_used;
-
- /* Atomic reference count */
- int32_t refcnt;
-
- struct panfrost_device *dev;
-
- /* Mapping for the entire object (all levels) */
- uint8_t *cpu;
-
- /* GPU address for the object */
- mali_ptr gpu;
-
- /* Size of the entire object (all levels) */
- size_t size;
-
- int gem_handle;
-
- uint32_t flags;
-
- /* Combination of PAN_BO_ACCESS_{READ,WRITE} flags encoding pending
- * GPU accesses to this BO. Useful to avoid calling the WAIT_BO ioctl
- * when the BO is idle.
- */
- uint32_t gpu_access;
-};
-
-bool
-panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers);
-void
-panfrost_bo_reference(struct panfrost_bo *bo);
-void
-panfrost_bo_unreference(struct panfrost_bo *bo);
-struct panfrost_bo *
-panfrost_bo_create(struct panfrost_device *dev, size_t size,
- uint32_t flags);
-void
-panfrost_bo_mmap(struct panfrost_bo *bo);
-struct panfrost_bo *
-panfrost_bo_import(struct panfrost_device *dev, int fd);
-int
-panfrost_bo_export(struct panfrost_bo *bo);
-void
-panfrost_bo_cache_evict_all(struct panfrost_device *dev);
-
-#endif /* __PAN_BO_H__ */
+++ /dev/null
-/**************************************************************************
- *
- * Copyright 2018-2019 Alyssa Rosenzweig
- * Copyright 2018-2019 Collabora, Ltd.
- * Copyright © 2015 Intel Corporation
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef PAN_DEVICE_H
-#define PAN_DEVICE_H
-
-#include <xf86drm.h>
-#include "renderonly/renderonly.h"
-#include "util/u_dynarray.h"
-#include "util/bitset.h"
-#include "util/list.h"
-#include "util/sparse_array.h"
-
-#include <panfrost-job.h>
-
-/* Driver limits */
-#define PAN_MAX_CONST_BUFFERS 16
-
-/* Transient slab size. This is a balance of fragmentation against cache
- * locality and ease of bookkeeping */
-
-#define TRANSIENT_SLAB_PAGES (32) /* 128kb */
-#define TRANSIENT_SLAB_SIZE (4096 * TRANSIENT_SLAB_PAGES)
-
-/* Maximum number of transient slabs so we don't need dynamic arrays. Most
- * interesting Mali boards are 4GB RAM max, so if the entire RAM was filled
- * with transient slabs, you could never exceed (4GB / TRANSIENT_SLAB_SIZE)
- * allocations anyway. By capping, we can use a fixed-size bitset for tracking
- * free slabs, eliminating quite a bit of complexity. We can pack the free
- * state of 8 slabs into a single byte, so for 128kb transient slabs the bitset
- * occupies a cheap 4kb of memory */
-
-#define MAX_TRANSIENT_SLABS (1024*1024 / TRANSIENT_SLAB_PAGES)
-
-/* How many power-of-two levels in the BO cache do we want? 2^12
- * minimum chosen as it is the page size that all allocations are
- * rounded to */
-
-#define MIN_BO_CACHE_BUCKET (12) /* 2^12 = 4KB */
-#define MAX_BO_CACHE_BUCKET (22) /* 2^22 = 4MB */
-
-/* Fencepost problem, hence the off-by-one */
-#define NR_BO_CACHE_BUCKETS (MAX_BO_CACHE_BUCKET - MIN_BO_CACHE_BUCKET + 1)
-
-/* Cache for blit shaders. Defined here so they can be cached with the device */
-
-enum pan_blit_type {
- PAN_BLIT_FLOAT = 0,
- PAN_BLIT_UINT,
- PAN_BLIT_INT,
- PAN_BLIT_NUM_TYPES,
-};
-
-#define PAN_BLIT_NUM_TARGETS (12)
-
-struct pan_blit_shaders {
- struct panfrost_bo *bo;
- mali_ptr loads[PAN_BLIT_NUM_TARGETS][PAN_BLIT_NUM_TYPES][2];
-};
-
-struct panfrost_device {
- /* For ralloc */
- void *memctx;
-
- int fd;
-
- /* Properties of the GPU in use */
- unsigned gpu_id;
- unsigned core_count;
- unsigned thread_tls_alloc;
- unsigned quirks;
-
- /* Bitmask of supported compressed texture formats */
- uint32_t compressed_formats;
-
- /* Debug flags; see pan_util.h for how to interpret them */
- unsigned debug;
-
- drmVersionPtr kernel_version;
-
- struct renderonly *ro;
-
- pthread_mutex_t bo_map_lock;
- struct util_sparse_array bo_map;
-
- struct {
- pthread_mutex_t lock;
-
- /* List containing all cached BOs sorted in LRU (Least
- * Recently Used) order. This allows us to quickly evict BOs
- * that are more than 1 second old.
- */
- struct list_head lru;
-
- /* The BO cache is a set of buckets with power-of-two sizes
- * ranging from 2^MIN_BO_CACHE_BUCKET (2^12 = 4096, the page size) to
- * 2^MAX_BO_CACHE_BUCKET (2^22 = 4 MB).
- * Each bucket is a linked list of free panfrost_bo objects. */
-
- struct list_head buckets[NR_BO_CACHE_BUCKETS];
- } bo_cache;
-
- struct pan_blit_shaders blit_shaders;
-};
-
-void
-panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev);
-
-void
-panfrost_close_device(struct panfrost_device *dev);
-
-bool
-panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt);
-
-static inline struct panfrost_bo *
-pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle)
-{
- return util_sparse_array_get(&dev->bo_map, gem_handle);
-}
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors (Collabora):
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- */
-
-#ifndef __PAN_ENCODER_H
-#define __PAN_ENCODER_H
-
-#include <stdbool.h>
-#include "panfrost-job.h"
-
-/* Invocation packing */
-
-void
-panfrost_pack_work_groups_compute(
- struct mali_vertex_tiler_prefix *out,
- unsigned num_x,
- unsigned num_y,
- unsigned num_z,
- unsigned size_x,
- unsigned size_y,
- unsigned size_z,
- bool quirk_graphics);
-
-void
-panfrost_pack_work_groups_fused(
- struct mali_vertex_tiler_prefix *vertex,
- struct mali_vertex_tiler_prefix *tiler,
- unsigned num_x,
- unsigned num_y,
- unsigned num_z,
- unsigned size_x,
- unsigned size_y,
- unsigned size_z);
-
-/* Tiler structure size computation */
-
-unsigned
-panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy);
-
-unsigned
-panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy);
-
-unsigned
-panfrost_choose_hierarchy_mask(
- unsigned width, unsigned height,
- unsigned vertex_count, bool hierarchy);
-
-/* Stack sizes */
-
-unsigned
-panfrost_get_stack_shift(unsigned stack_size);
-
-unsigned
-panfrost_get_total_stack_size(
- unsigned stack_shift,
- unsigned threads_per_core,
- unsigned core_count);
-
-/* Property queries */
-
-
-unsigned panfrost_query_gpu_version(int fd);
-unsigned panfrost_query_core_count(int fd);
-unsigned panfrost_query_thread_tls_alloc(int fd);
-
-const char * panfrost_model_name(unsigned gpu_id);
-
-/* Attributes / instancing */
-
-unsigned
-panfrost_padded_vertex_count(unsigned vertex_count);
-
-unsigned
-panfrost_vertex_instanced(
- unsigned padded_count,
- unsigned instance_shift, unsigned instance_odd,
- unsigned divisor,
- union mali_attr *attrs);
-
-void panfrost_vertex_id(unsigned padded_count, union mali_attr *attr);
-void panfrost_instance_id(unsigned padded_count, union mali_attr *attr);
-
-/* Samplers */
-
-enum mali_func
-panfrost_flip_compare_func(enum mali_func f);
-
-
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- */
-
-#include <stdio.h>
-#include "panfrost-job.h"
-#include "pan_texture.h"
-
-/* Convenience */
-
-#define _V PIPE_BIND_VERTEX_BUFFER
-#define _T PIPE_BIND_SAMPLER_VIEW
-#define _R PIPE_BIND_RENDER_TARGET
-#define _Z PIPE_BIND_DEPTH_STENCIL
-#define _VT (_V | _T)
-#define _VTR (_V | _T | _R)
-#define _TZ (_T | _Z)
-
-struct panfrost_format panfrost_pipe_format_table[PIPE_FORMAT_COUNT] = {
- [PIPE_FORMAT_ETC1_RGB8] = { MALI_ETC2_RGB8, _T },
- [PIPE_FORMAT_ETC2_RGB8] = { MALI_ETC2_RGB8, _T },
- [PIPE_FORMAT_ETC2_SRGB8] = { MALI_ETC2_RGB8, _T },
- [PIPE_FORMAT_ETC2_R11_UNORM] = { MALI_ETC2_R11_UNORM, _T },
- [PIPE_FORMAT_ETC2_RGBA8] = { MALI_ETC2_RGBA8, _T },
- [PIPE_FORMAT_ETC2_SRGBA8] = { MALI_ETC2_RGBA8, _T },
- [PIPE_FORMAT_ETC2_RG11_UNORM] = { MALI_ETC2_RG11_UNORM, _T },
- [PIPE_FORMAT_ETC2_R11_SNORM] = { MALI_ETC2_R11_SNORM, _T },
- [PIPE_FORMAT_ETC2_RG11_SNORM] = { MALI_ETC2_RG11_SNORM, _T },
- [PIPE_FORMAT_ETC2_RGB8A1] = { MALI_ETC2_RGB8A1, _T },
- [PIPE_FORMAT_ETC2_SRGB8A1] = { MALI_ETC2_RGB8A1, _T },
-
- [PIPE_FORMAT_DXT1_RGB] = { MALI_BC1_UNORM, _T },
- [PIPE_FORMAT_DXT1_RGBA] = { MALI_BC1_UNORM, _T },
- [PIPE_FORMAT_DXT1_SRGB] = { MALI_BC1_UNORM, _T },
- [PIPE_FORMAT_DXT1_SRGBA] = { MALI_BC1_UNORM, _T },
- [PIPE_FORMAT_DXT3_RGBA] = { MALI_BC2_UNORM, _T },
- [PIPE_FORMAT_DXT3_SRGBA] = { MALI_BC2_UNORM, _T },
- [PIPE_FORMAT_DXT5_RGBA] = { MALI_BC3_UNORM, _T },
- [PIPE_FORMAT_DXT5_SRGBA] = { MALI_BC3_UNORM, _T },
-
- [PIPE_FORMAT_RGTC1_UNORM] = { MALI_BC4_UNORM, _T },
- [PIPE_FORMAT_RGTC1_SNORM] = { MALI_BC4_SNORM, _T },
- [PIPE_FORMAT_RGTC2_UNORM] = { MALI_BC5_UNORM, _T },
- [PIPE_FORMAT_RGTC2_SNORM] = { MALI_BC5_SNORM, _T },
-
- [PIPE_FORMAT_BPTC_RGB_FLOAT] = { MALI_BC6H_SF16, _T },
- [PIPE_FORMAT_BPTC_RGB_UFLOAT] = { MALI_BC6H_UF16, _T },
- [PIPE_FORMAT_BPTC_RGBA_UNORM] = { MALI_BC7_UNORM, _T },
- [PIPE_FORMAT_BPTC_SRGBA] = { MALI_BC7_UNORM, _T },
-
- [PIPE_FORMAT_ASTC_4x4] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_5x4] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_5x5] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_6x5] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_6x6] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_8x5] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_8x6] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_8x8] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_10x5] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_10x6] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_10x8] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_10x10] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_12x10] = { MALI_ASTC_2D_HDR, _T },
- [PIPE_FORMAT_ASTC_12x12] = { MALI_ASTC_2D_HDR, _T },
-
- [PIPE_FORMAT_ASTC_4x4_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_5x4_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_5x5_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_6x5_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_6x6_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_8x5_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_8x6_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_8x8_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_10x5_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_10x6_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_10x8_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_10x10_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_12x10_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_ASTC_12x12_SRGB] = { MALI_ASTC_2D_LDR, _T },
- [PIPE_FORMAT_B5G6R5_UNORM] = { MALI_RGB565, _VTR },
- [PIPE_FORMAT_B5G5R5X1_UNORM] = { MALI_RGB5_X1_UNORM, _VT },
- [PIPE_FORMAT_R5G5B5A1_UNORM] = { MALI_RGB5_A1_UNORM, _VTR },
-
- [PIPE_FORMAT_R10G10B10X2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR },
- [PIPE_FORMAT_B10G10R10X2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR },
- [PIPE_FORMAT_R10G10B10A2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR },
- [PIPE_FORMAT_B10G10R10A2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR },
- [PIPE_FORMAT_R10G10B10X2_SNORM] = { MALI_RGB10_A2_SNORM, _VT },
- [PIPE_FORMAT_R10G10B10A2_SNORM] = { MALI_RGB10_A2_SNORM, _VT },
- [PIPE_FORMAT_B10G10R10A2_SNORM] = { MALI_RGB10_A2_SNORM, _VT },
- [PIPE_FORMAT_R10G10B10A2_UINT] = { MALI_RGB10_A2UI, _VTR },
- [PIPE_FORMAT_B10G10R10A2_UINT] = { MALI_RGB10_A2UI, _VTR },
- [PIPE_FORMAT_R10G10B10A2_USCALED] = { MALI_RGB10_A2UI, _VTR },
- [PIPE_FORMAT_B10G10R10A2_USCALED] = { MALI_RGB10_A2UI, _VTR },
- [PIPE_FORMAT_R10G10B10A2_SINT] = { MALI_RGB10_A2I, _VTR},
- [PIPE_FORMAT_B10G10R10A2_SINT] = { MALI_RGB10_A2I, _VTR },
- [PIPE_FORMAT_R10G10B10A2_SSCALED] = { MALI_RGB10_A2I, _VTR },
- [PIPE_FORMAT_B10G10R10A2_SSCALED] = { MALI_RGB10_A2I, _VTR },
-
- [PIPE_FORMAT_R8_SSCALED] = { MALI_R8I, _V },
- [PIPE_FORMAT_R8G8_SSCALED] = { MALI_RG8I, _V },
- [PIPE_FORMAT_R8G8B8_SSCALED] = { MALI_RGB8I, _V },
- [PIPE_FORMAT_B8G8R8_SSCALED] = { MALI_RGB8I, _V },
- [PIPE_FORMAT_R8G8B8A8_SSCALED] = { MALI_RGBA8I, _V },
- [PIPE_FORMAT_B8G8R8A8_SSCALED] = { MALI_RGBA8I, _V },
- [PIPE_FORMAT_A8B8G8R8_SSCALED] = { MALI_RGBA8I, _V },
-
- [PIPE_FORMAT_R8_USCALED] = { MALI_R8UI, _V },
- [PIPE_FORMAT_R8G8_USCALED] = { MALI_RG8UI, _V },
- [PIPE_FORMAT_R8G8B8_USCALED] = { MALI_RGB8UI, _V },
- [PIPE_FORMAT_B8G8R8_USCALED] = { MALI_RGB8UI, _V },
- [PIPE_FORMAT_R8G8B8A8_USCALED] = { MALI_RGBA8UI, _V },
- [PIPE_FORMAT_B8G8R8A8_USCALED] = { MALI_RGBA8UI, _V },
- [PIPE_FORMAT_A8B8G8R8_USCALED] = { MALI_RGBA8UI, _V },
-
- [PIPE_FORMAT_R16_USCALED] = { MALI_R16UI, _V },
- [PIPE_FORMAT_R16G16_USCALED] = { MALI_RG16UI, _V },
- [PIPE_FORMAT_R16G16B16_USCALED] = { MALI_RGB16UI, _V },
- [PIPE_FORMAT_R16G16B16A16_USCALED] = { MALI_RGBA16UI, _V },
- [PIPE_FORMAT_R16_SSCALED] = { MALI_R16I, _V },
- [PIPE_FORMAT_R16G16_SSCALED] = { MALI_RG16I, _V },
- [PIPE_FORMAT_R16G16B16_SSCALED] = { MALI_RGB16I, _V },
- [PIPE_FORMAT_R16G16B16A16_SSCALED] = { MALI_RGBA16I, _V },
-
- [PIPE_FORMAT_R32_USCALED] = { MALI_R32UI, _V },
- [PIPE_FORMAT_R32G32_USCALED] = { MALI_RG32UI, _V },
- [PIPE_FORMAT_R32G32B32_USCALED] = { MALI_RGB32UI, _V },
- [PIPE_FORMAT_R32G32B32A32_USCALED] = { MALI_RGBA32UI, _V },
- [PIPE_FORMAT_R32_SSCALED] = { MALI_R32I, _V },
- [PIPE_FORMAT_R32G32_SSCALED] = { MALI_RG32I, _V },
- [PIPE_FORMAT_R32G32B32_SSCALED] = { MALI_RGB32I, _V },
- [PIPE_FORMAT_R32G32B32A32_SSCALED] = { MALI_RGBA32I, _V },
-
- [PIPE_FORMAT_R3G3B2_UNORM] = { MALI_RGB332_UNORM, _VT },
-
- [PIPE_FORMAT_Z24_UNORM_S8_UINT] = { MALI_Z24X8_UNORM, _TZ },
- [PIPE_FORMAT_Z24X8_UNORM] = { MALI_Z24X8_UNORM, _TZ },
- [PIPE_FORMAT_Z32_FLOAT] = { MALI_R32F, _TZ },
- [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = { MALI_R32F, _TZ },
- [PIPE_FORMAT_X32_S8X24_UINT] = { MALI_R8UI, _T },
- [PIPE_FORMAT_X24S8_UINT] = { MALI_RGBA8UI, _TZ },
- [PIPE_FORMAT_S8_UINT] = { MALI_R8UI, _T },
-
- [PIPE_FORMAT_R32_FIXED] = { MALI_R32_FIXED, _V },
- [PIPE_FORMAT_R32G32_FIXED] = { MALI_RG32_FIXED, _V },
- [PIPE_FORMAT_R32G32B32_FIXED] = { MALI_RGB32_FIXED, _V },
- [PIPE_FORMAT_R32G32B32A32_FIXED] = { MALI_RGBA32_FIXED, _V },
-
- [PIPE_FORMAT_R11G11B10_FLOAT] = { MALI_R11F_G11F_B10F, _VTR},
- [PIPE_FORMAT_R9G9B9E5_FLOAT] = { MALI_R9F_G9F_B9F_E5F, _VT },
-
- [PIPE_FORMAT_R8_SNORM] = { MALI_R8_SNORM, _VT },
- [PIPE_FORMAT_R16_SNORM] = { MALI_R16_SNORM, _VT },
- [PIPE_FORMAT_R32_SNORM] = { MALI_R32_SNORM, _VT },
- [PIPE_FORMAT_R8G8_SNORM] = { MALI_RG8_SNORM, _VT },
- [PIPE_FORMAT_R16G16_SNORM] = { MALI_RG16_SNORM, _VT },
- [PIPE_FORMAT_R32G32_SNORM] = { MALI_RG32_SNORM, _VT },
- [PIPE_FORMAT_R8G8B8_SNORM] = { MALI_RGB8_SNORM, _VT },
- [PIPE_FORMAT_R16G16B16_SNORM] = { MALI_RGB16_SNORM, _VT },
- [PIPE_FORMAT_R32G32B32_SNORM] = { MALI_RGB32_SNORM, _VT },
- [PIPE_FORMAT_R8G8B8A8_SNORM] = { MALI_RGBA8_SNORM, _VT },
- [PIPE_FORMAT_R16G16B16A16_SNORM] = { MALI_RGBA16_SNORM, _VT },
- [PIPE_FORMAT_R32G32B32A32_SNORM] = { MALI_RGBA32_SNORM, _VT },
-
- [PIPE_FORMAT_A8_SINT] = { MALI_R8I, _VTR },
- [PIPE_FORMAT_I8_SINT] = { MALI_R8I, _VTR },
- [PIPE_FORMAT_L8_SINT] = { MALI_R8I, _VTR },
- [PIPE_FORMAT_L8A8_SINT] = { MALI_RG8I, _VTR },
- [PIPE_FORMAT_A8_UINT] = { MALI_R8UI, _VTR },
- [PIPE_FORMAT_I8_UINT] = { MALI_R8UI, _VTR },
- [PIPE_FORMAT_L8_UINT] = { MALI_R8UI, _VTR },
- [PIPE_FORMAT_L8A8_UINT] = { MALI_RG8UI, _VTR },
-
- [PIPE_FORMAT_A16_SINT] = { MALI_R16I, _VTR },
- [PIPE_FORMAT_I16_SINT] = { MALI_R16I, _VTR },
- [PIPE_FORMAT_L16_SINT] = { MALI_R16I, _VTR },
- [PIPE_FORMAT_L16A16_SINT] = { MALI_RG16I, _VTR },
- [PIPE_FORMAT_A16_UINT] = { MALI_R16UI, _VTR },
- [PIPE_FORMAT_I16_UINT] = { MALI_R16UI, _VTR },
- [PIPE_FORMAT_L16_UINT] = { MALI_R16UI, _VTR },
- [PIPE_FORMAT_L16A16_UINT] = { MALI_RG16UI, _VTR },
-
- [PIPE_FORMAT_A32_SINT] = { MALI_R32I, _VTR },
- [PIPE_FORMAT_I32_SINT] = { MALI_R32I, _VTR },
- [PIPE_FORMAT_L32_SINT] = { MALI_R32I, _VTR },
- [PIPE_FORMAT_L32A32_SINT] = { MALI_RG32I, _VTR },
- [PIPE_FORMAT_A32_UINT] = { MALI_R32UI, _VTR },
- [PIPE_FORMAT_I32_UINT] = { MALI_R32UI, _VTR },
- [PIPE_FORMAT_L32_UINT] = { MALI_R32UI, _VTR },
- [PIPE_FORMAT_L32A32_UINT] = { MALI_RG32UI, _VTR },
-
- [PIPE_FORMAT_B8G8R8_UINT] = { MALI_RGB8UI, _VTR },
- [PIPE_FORMAT_B8G8R8A8_UINT] = { MALI_RGBA8UI, _VTR },
- [PIPE_FORMAT_B8G8R8_SINT] = { MALI_RGB8I, _VTR },
- [PIPE_FORMAT_B8G8R8A8_SINT] = { MALI_RGBA8I, _VTR },
- [PIPE_FORMAT_A8R8G8B8_UINT] = { MALI_RGBA8UI, _VTR },
- [PIPE_FORMAT_A8B8G8R8_UINT] = { MALI_RGBA8UI, _VTR },
-
- [PIPE_FORMAT_R8_UINT] = { MALI_R8UI, _VTR },
- [PIPE_FORMAT_R16_UINT] = { MALI_R16UI, _VTR },
- [PIPE_FORMAT_R32_UINT] = { MALI_R32UI, _VTR },
- [PIPE_FORMAT_R8G8_UINT] = { MALI_RG8UI, _VTR },
- [PIPE_FORMAT_R16G16_UINT] = { MALI_RG16UI, _VTR },
- [PIPE_FORMAT_R32G32_UINT] = { MALI_RG32UI, _VTR },
- [PIPE_FORMAT_R8G8B8_UINT] = { MALI_RGB8UI, _VTR },
- [PIPE_FORMAT_R16G16B16_UINT] = { MALI_RGB16UI, _VTR },
- [PIPE_FORMAT_R32G32B32_UINT] = { MALI_RGB32UI, _VTR },
- [PIPE_FORMAT_R8G8B8A8_UINT] = { MALI_RGBA8UI, _VTR },
- [PIPE_FORMAT_R16G16B16A16_UINT] = { MALI_RGBA16UI, _VTR },
- [PIPE_FORMAT_R32G32B32A32_UINT] = { MALI_RGBA32UI, _VTR },
-
- [PIPE_FORMAT_R32_FLOAT] = { MALI_R32F, _VTR },
- [PIPE_FORMAT_R32G32_FLOAT] = { MALI_RG32F, _VTR },
- [PIPE_FORMAT_R32G32B32_FLOAT] = { MALI_RGB32F, _VTR },
- [PIPE_FORMAT_R32G32B32A32_FLOAT] = { MALI_RGBA32F, _VTR },
-
- [PIPE_FORMAT_R8_UNORM] = { MALI_R8_UNORM, _VTR },
- [PIPE_FORMAT_R16_UNORM] = { MALI_R16_UNORM, _VTR },
- [PIPE_FORMAT_R32_UNORM] = { MALI_R32_UNORM, _VTR },
- [PIPE_FORMAT_R8G8_UNORM] = { MALI_RG8_UNORM, _VTR },
- [PIPE_FORMAT_R16G16_UNORM] = { MALI_RG16_UNORM, _VTR },
- [PIPE_FORMAT_R32G32_UNORM] = { MALI_RG32_UNORM, _VTR },
- [PIPE_FORMAT_R8G8B8_UNORM] = { MALI_RGB8_UNORM, _VTR },
- [PIPE_FORMAT_R16G16B16_UNORM] = { MALI_RGB16_UNORM, _VTR },
- [PIPE_FORMAT_R32G32B32_UNORM] = { MALI_RGB32_UNORM, _VTR },
- [PIPE_FORMAT_R4G4B4A4_UNORM] = { MALI_RGBA4_UNORM, _VTR },
- [PIPE_FORMAT_R16G16B16A16_UNORM] = { MALI_RGBA16_UNORM, _VTR },
- [PIPE_FORMAT_R32G32B32A32_UNORM] = { MALI_RGBA32_UNORM, _VTR },
-
- [PIPE_FORMAT_B8G8R8A8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_B8G8R8X8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_A8R8G8B8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_X8R8G8B8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_A8B8G8R8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_X8B8G8R8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_R8G8B8X8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_R8G8B8A8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
-
- [PIPE_FORMAT_R8G8B8X8_SNORM] = { MALI_RGBA8_SNORM, _VT },
- [PIPE_FORMAT_R8G8B8X8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_R8G8B8X8_UINT] = { MALI_RGBA8UI, _VTR },
- [PIPE_FORMAT_R8G8B8X8_SINT] = { MALI_RGBA8I, _VTR },
-
- [PIPE_FORMAT_L8_UNORM] = { MALI_R8_UNORM, _VTR },
- [PIPE_FORMAT_A8_UNORM] = { MALI_R8_UNORM, _VTR },
- [PIPE_FORMAT_I8_UNORM] = { MALI_R8_UNORM, _VTR },
- [PIPE_FORMAT_L8A8_UNORM] = { MALI_RG8_UNORM, _VTR },
- [PIPE_FORMAT_L16_UNORM] = { MALI_R16_UNORM, _VTR },
- [PIPE_FORMAT_A16_UNORM] = { MALI_R16_UNORM, _VTR },
- [PIPE_FORMAT_I16_UNORM] = { MALI_R16_UNORM, _VTR },
- [PIPE_FORMAT_L16A16_UNORM] = { MALI_RG16_UNORM, _VTR },
-
- [PIPE_FORMAT_L8_SNORM] = { MALI_R8_SNORM, _VT },
- [PIPE_FORMAT_A8_SNORM] = { MALI_R8_SNORM, _VT },
- [PIPE_FORMAT_I8_SNORM] = { MALI_R8_SNORM, _VT },
- [PIPE_FORMAT_L8A8_SNORM] = { MALI_RG8_SNORM, _VT },
- [PIPE_FORMAT_L16_SNORM] = { MALI_R16_SNORM, _VT },
- [PIPE_FORMAT_A16_SNORM] = { MALI_R16_SNORM, _VT },
- [PIPE_FORMAT_I16_SNORM] = { MALI_R16_SNORM, _VT },
- [PIPE_FORMAT_L16A16_SNORM] = { MALI_RG16_SNORM, _VT },
-
- [PIPE_FORMAT_L16_FLOAT] = { MALI_R16F, _VTR },
- [PIPE_FORMAT_A16_FLOAT] = { MALI_R16F, _VTR },
- [PIPE_FORMAT_I16_FLOAT] = { MALI_RG16F, _VTR },
- [PIPE_FORMAT_L16A16_FLOAT] = { MALI_RG16F, _VTR },
-
- [PIPE_FORMAT_L8_SRGB] = { MALI_R8_UNORM, _VTR },
- [PIPE_FORMAT_R8_SRGB] = { MALI_R8_UNORM, _VTR },
- [PIPE_FORMAT_L8A8_SRGB] = { MALI_RG8_UNORM, _VTR },
- [PIPE_FORMAT_R8G8_SRGB] = { MALI_RG8_UNORM, _VTR },
- [PIPE_FORMAT_R8G8B8_SRGB] = { MALI_RGB8_UNORM, _VTR },
- [PIPE_FORMAT_B8G8R8_SRGB] = { MALI_RGB8_UNORM, _VTR },
- [PIPE_FORMAT_R8G8B8A8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_A8B8G8R8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_X8B8G8R8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_B8G8R8A8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_B8G8R8X8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_A8R8G8B8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
- [PIPE_FORMAT_X8R8G8B8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
-
- [PIPE_FORMAT_R8_SINT] = { MALI_R8I, _VTR },
- [PIPE_FORMAT_R16_SINT] = { MALI_R16I, _VTR },
- [PIPE_FORMAT_R32_SINT] = { MALI_R32I, _VTR },
- [PIPE_FORMAT_R16_FLOAT] = { MALI_R16F, _VTR },
- [PIPE_FORMAT_R8G8_SINT] = { MALI_RG8I, _VTR },
- [PIPE_FORMAT_R16G16_SINT] = { MALI_RG16I, _VTR },
- [PIPE_FORMAT_R32G32_SINT] = { MALI_RG32I, _VTR },
- [PIPE_FORMAT_R16G16_FLOAT] = { MALI_RG16F, _VTR },
- [PIPE_FORMAT_R8G8B8_SINT] = { MALI_RGB8I, _VTR },
- [PIPE_FORMAT_R16G16B16_SINT] = { MALI_RGB16I, _VTR },
- [PIPE_FORMAT_R32G32B32_SINT] = { MALI_RGB32I, _VTR },
- [PIPE_FORMAT_R16G16B16_FLOAT] = { MALI_RGB16F, _VTR },
- [PIPE_FORMAT_R8G8B8A8_SINT] = { MALI_RGBA8I, _VTR },
- [PIPE_FORMAT_R16G16B16A16_SINT] = { MALI_RGBA16I, _VTR },
- [PIPE_FORMAT_R32G32B32A32_SINT] = { MALI_RGBA32I, _VTR },
- [PIPE_FORMAT_R16G16B16A16_FLOAT] = { MALI_RGBA16F, _VTR },
-
- [PIPE_FORMAT_R16G16B16X16_UNORM] = { MALI_RGBA16_UNORM, _VTR },
- [PIPE_FORMAT_R16G16B16X16_SNORM] = { MALI_RGBA16_SNORM, _VT },
- [PIPE_FORMAT_R16G16B16X16_FLOAT] = { MALI_RGBA16F, _VTR },
- [PIPE_FORMAT_R16G16B16X16_UINT] = { MALI_RGBA16UI, _VTR },
- [PIPE_FORMAT_R16G16B16X16_SINT] = { MALI_RGBA16I, _VTR },
-
- [PIPE_FORMAT_R32G32B32X32_FLOAT] = { MALI_RGBA32F, _VTR },
- [PIPE_FORMAT_R32G32B32X32_UINT] = { MALI_RGBA32UI, _VTR },
- [PIPE_FORMAT_R32G32B32X32_SINT] = { MALI_RGBA32I, _VTR },
-};
-
-#undef _VTR
-#undef _VT
-#undef _V
-#undef _T
-#undef _R
-
-/* Is a format encoded like Z24S8 and therefore compatible for render? */
-
-bool
-panfrost_is_z24s8_variant(enum pipe_format fmt)
-{
- switch (fmt) {
- case PIPE_FORMAT_Z24_UNORM_S8_UINT:
- case PIPE_FORMAT_Z24X8_UNORM:
- return true;
- default:
- return false;
- }
-}
-
-/* Translate a PIPE swizzle quad to a 12-bit Mali swizzle code. PIPE
- * swizzles line up with Mali swizzles for XYZW01, but PIPE swizzles have
- * an additional "NONE" value that we have to mask out to zero. Additionally,
- * PIPE swizzles are sparse but Mali swizzles are packed */
-
-unsigned
-panfrost_translate_swizzle_4(const unsigned char swizzle[4])
-{
- unsigned out = 0;
-
- for (unsigned i = 0; i < 4; ++i) {
- unsigned translated = (swizzle[i] > PIPE_SWIZZLE_1) ? PIPE_SWIZZLE_0 : swizzle[i];
- out |= (translated << (3*i));
- }
-
- return out;
-}
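
/* For illustration, a minimal standalone sketch of the packing above, assuming
 * the usual Gallium numbering (PIPE_SWIZZLE_X..W = 0..3, PIPE_SWIZZLE_0 = 4,
 * PIPE_SWIZZLE_1 = 5, PIPE_SWIZZLE_NONE = 6): an identity XYZW quad packs to
 * 0x688, three bits per channel. */

#include <assert.h>

enum { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W, SWZ_0, SWZ_1, SWZ_NONE };

static unsigned
translate_swizzle_4_example(const unsigned char swizzle[4])
{
        unsigned out = 0;

        for (unsigned i = 0; i < 4; ++i) {
                /* NONE clamps to ZERO; everything else passes through */
                unsigned translated = (swizzle[i] > SWZ_1) ? SWZ_0 : swizzle[i];
                out |= (translated << (3 * i));
        }

        return out;
}

int main(void)
{
        const unsigned char identity[4] = { SWZ_X, SWZ_Y, SWZ_Z, SWZ_W };

        /* (0 << 0) | (1 << 3) | (2 << 6) | (3 << 9) = 0x688 */
        assert(translate_swizzle_4_example(identity) == 0x688);
        return 0;
}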
-
-void
-panfrost_invert_swizzle(const unsigned char *in, unsigned char *out)
-{
- /* First, default to all zeroes to prevent uninitialized junk */
-
- for (unsigned c = 0; c < 4; ++c)
- out[c] = PIPE_SWIZZLE_0;
-
- /* Now "do" what the swizzle says */
-
- for (unsigned c = 0; c < 4; ++c) {
- unsigned char i = in[c];
-
- /* Who cares? */
- assert(PIPE_SWIZZLE_X == 0);
- if (i > PIPE_SWIZZLE_W)
- continue;
-
- /* Invert */
- unsigned idx = i - PIPE_SWIZZLE_X;
- out[idx] = PIPE_SWIZZLE_X + c;
- }
-}
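
/* Sanity-check sketch for the inversion above, assuming the Gallium headers
 * are available: the BGRA-style quad (Z, Y, X, W) is its own inverse, since
 * swapping the X and Z channels is an involution. */

unsigned char fwd[4] = { PIPE_SWIZZLE_Z, PIPE_SWIZZLE_Y,
                         PIPE_SWIZZLE_X, PIPE_SWIZZLE_W };
unsigned char inv[4];

panfrost_invert_swizzle(fwd, inv);
assert(inv[0] == PIPE_SWIZZLE_Z && inv[1] == PIPE_SWIZZLE_Y &&
       inv[2] == PIPE_SWIZZLE_X && inv[3] == PIPE_SWIZZLE_W);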
-
-enum mali_format
-panfrost_format_to_bifrost_blend(const struct util_format_description *desc)
-{
- enum mali_format format = panfrost_pipe_format_table[desc->format].hw;
- assert(format);
-
- switch (format) {
- case MALI_RGBA4_UNORM:
- return MALI_RGBA4;
- case MALI_RGBA8_UNORM:
- case MALI_RGB8_UNORM:
- return MALI_RGBA8_2;
- case MALI_RGB10_A2_UNORM:
- return MALI_RGB10_A2_2;
- default:
- return format;
- }
-}
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors (Collabora):
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- *
- */
-
-#include <assert.h>
-#include "util/u_math.h"
-#include "pan_encoder.h"
-
-/* Compute shaders are invoked with a gl_NumWorkGroups X/Y/Z triplet. Vertex
- * shaders, it turns out, are invoked with the same mechanism, with the triplet
- * (1, vertex_count, instance_count).
- *
- * Alongside this triplet is the gl_WorkGroupSize X/Y/Z triplet.
- *
- * Unfortunately, the packing of these triplets into the
- * mali_vertex_tiler_prefix is a little funky, using a dynamic bitfield. The
- * routines here exist to do that packing. */
-
-void
-panfrost_pack_work_groups_compute(
- struct mali_vertex_tiler_prefix *out,
- unsigned num_x,
- unsigned num_y,
- unsigned num_z,
- unsigned size_x,
- unsigned size_y,
- unsigned size_z,
- bool quirk_graphics)
-{
- uint32_t packed = 0;
-
- /* The values needing packing, in order, and the corresponding shifts.
-         * Indices into shifts are off-by-one to make the logic easier */
-
- unsigned shifts[7] = { 0 };
-
- unsigned values[6] = {
- MALI_POSITIVE(size_x),
- MALI_POSITIVE(size_y),
- MALI_POSITIVE(size_z),
- MALI_POSITIVE(num_x),
- MALI_POSITIVE(num_y),
- MALI_POSITIVE(num_z),
- };
-
- for (unsigned i = 0; i < 6; ++i) {
- /* OR it in, shifting as required */
- packed |= (values[i] << shifts[i]);
-
- /* How many bits did we use? */
- unsigned bit_count = util_logbase2_ceil(values[i] + 1);
-
- /* Set the next shift accordingly */
- shifts[i + 1] = shifts[i] + bit_count;
- }
-
- /* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift
- * = 32. This doesn't appear to matter to the hardware, but it's good
- * to be bit-identical. */
-
- if (quirk_graphics && (num_z <= 1))
- shifts[5] = 32;
-
- /* Quirk: for graphics, workgroups_x_shift_2 must be at least 2,
- * whereas for OpenCL it is simply equal to workgroups_x_shift. For GL
- * compute, it is always 2 if no barriers are in use, but is equal to
-         * workgroups_x_shift if barriers are in use. */
-
- unsigned shift_2 = shifts[3];
-
- if (quirk_graphics)
- shift_2 = MAX2(shift_2, 2);
-
- /* Pack them in */
- uint32_t packed_shifts =
- (shifts[1] << 0) |
- (shifts[2] << 5) |
- (shifts[3] << 10) |
- (shifts[4] << 16) |
- (shifts[5] << 22) |
- (shift_2 << 28);
-
- /* Upload the packed bitfields */
- out->invocation_count = packed;
- out->invocation_shifts = packed_shifts;
-
- /* TODO: Compute workgroups_x_shift_3 */
- out->workgroups_x_shift_3 = shift_2;
-}
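
/* Worked example as a standalone sketch (the graphics quirks above are
 * omitted): packing an 8x8x1 dispatch of 4x2x1 workgroups.  MALI_POSITIVE(x)
 * is x - 1 and util_logbase2_ceil(x) is ceil(log2(x)), so the six fields get
 * widths of 2, 1, 0, 3, 3 and 0 bits respectively. */

#include <stdio.h>

static unsigned
log2_ceil(unsigned x)
{
        unsigned bits = 0;

        while ((1u << bits) < x)
                ++bits;

        return bits;
}

int main(void)
{
        /* size_x/y/z = 4, 2, 1; num_x/y/z = 8, 8, 1, each stored biased by -1 */
        unsigned values[6] = { 3, 1, 0, 7, 7, 0 };
        unsigned shifts[7] = { 0 };
        unsigned packed = 0;

        for (unsigned i = 0; i < 6; ++i) {
                packed |= values[i] << shifts[i];
                shifts[i + 1] = shifts[i] + log2_ceil(values[i] + 1);
        }

        /* shifts = {0, 2, 3, 3, 6, 9}, invocation_count = 0x1ff */
        printf("invocation_count = 0x%x\n", packed);
        return 0;
}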
-
-/* Packs vertex/tiler descriptors simultaneously */
-void
-panfrost_pack_work_groups_fused(
- struct mali_vertex_tiler_prefix *vertex,
- struct mali_vertex_tiler_prefix *tiler,
- unsigned num_x,
- unsigned num_y,
- unsigned num_z,
- unsigned size_x,
- unsigned size_y,
- unsigned size_z)
-{
- panfrost_pack_work_groups_compute(vertex, num_x, num_y, num_z, size_x, size_y, size_z, true);
-
- /* Copy results over */
- tiler->invocation_count = vertex->invocation_count;
- tiler->invocation_shifts = vertex->invocation_shifts;
-
- /* Set special fields for each */
- vertex->workgroups_x_shift_3 = 5;
- tiler->workgroups_x_shift_3 = 6;
-}
-
+++ /dev/null
-/*
- * © Copyright 2018 Alyssa Rosenzweig
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include "util/hash_table.h"
-#include "pan_bo.h"
-#include "pan_pool.h"
-
-/* TODO: What does this actually have to be? */
-#define ALIGNMENT 128
-
-/* Transient command stream pooling: command stream uploads try to simply copy
- * into wherever we left off. If there isn't space, we allocate a new entry
- * into the pool and copy there */
-
-struct pan_pool
-panfrost_create_pool(void *memctx, struct panfrost_device *dev)
-{
- struct pan_pool pool = {
- .dev = dev,
- .transient_offset = 0,
- .transient_bo = NULL
- };
-
- pool.bos = _mesa_hash_table_create(memctx, _mesa_hash_pointer,
- _mesa_key_pointer_equal);
-
-
- return pool;
-}
-
-struct panfrost_transfer
-panfrost_pool_alloc(struct pan_pool *pool, size_t sz)
-{
- /* Pad the size */
- sz = ALIGN_POT(sz, ALIGNMENT);
-
- /* Find or create a suitable BO */
- struct panfrost_bo *bo = NULL;
-
- unsigned offset = 0;
-
- bool fits_in_current = (pool->transient_offset + sz) < TRANSIENT_SLAB_SIZE;
-
- if (likely(pool->transient_bo && fits_in_current)) {
- /* We can reuse the current BO, so get it */
- bo = pool->transient_bo;
-
- /* Use the specified offset */
- offset = pool->transient_offset;
- pool->transient_offset = offset + sz;
- } else {
- size_t bo_sz = sz < TRANSIENT_SLAB_SIZE ?
- TRANSIENT_SLAB_SIZE : ALIGN_POT(sz, 4096);
-
- /* We can't reuse the current BO, but we can create a new one.
- * We don't know what the BO will be used for, so let's flag it
- * RW and attach it to both the fragment and vertex/tiler jobs.
- * TODO: if we want fine grained BO assignment we should pass
- * flags to this function and keep the read/write,
- * fragment/vertex+tiler pools separate.
- */
- bo = panfrost_bo_create(pool->dev, bo_sz, 0);
-
- uintptr_t flags = PAN_BO_ACCESS_PRIVATE |
- PAN_BO_ACCESS_RW |
- PAN_BO_ACCESS_VERTEX_TILER |
- PAN_BO_ACCESS_FRAGMENT;
-
- _mesa_hash_table_insert(pool->bos, bo, (void *) flags);
-
- if (sz < TRANSIENT_SLAB_SIZE) {
- pool->transient_bo = bo;
- pool->transient_offset = offset + sz;
- }
- }
-
- struct panfrost_transfer ret = {
- .cpu = bo->cpu + offset,
- .gpu = bo->gpu + offset,
- };
-
- return ret;
-
-}
-
-mali_ptr
-panfrost_pool_upload(struct pan_pool *pool, const void *data, size_t sz)
-{
- struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sz);
- memcpy(transfer.cpu, data, sz);
- return transfer.gpu;
-}
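
/* Hypothetical usage sketch (the memctx and dev names are illustrative, not
 * part of this patch): create a pool owned by some ralloc context, upload a
 * small block of data, and reference it by GPU address from a descriptor. */

struct pan_pool pool = panfrost_create_pool(memctx, dev);

uint32_t viewport[4] = { 0, 0, 1920, 1080 };
mali_ptr viewport_va = panfrost_pool_upload(&pool, viewport, sizeof(viewport));

/* viewport_va stays valid for as long as the pool's BOs are retained;
 * sub-allocations are ALIGNMENT (128) byte aligned and never freed
 * individually. */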
+++ /dev/null
-/*
- * © Copyright 2017-2018 Alyssa Rosenzweig
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef __PAN_POOL_H__
-#define __PAN_POOL_H__
-
-#include <stddef.h>
-#include <panfrost-job.h>
-
-/* Represents a pool of memory that can only grow, used to allocate objects
- * with the same lifetime as the pool itself. In OpenGL, a pool is owned by the
- * batch for transient structures. In Vulkan, it may be owned by e.g. the
- * command pool */
-
-struct pan_pool {
- /* Parent device for allocation */
- struct panfrost_device *dev;
-
- /* panfrost_bo -> access_flags owned by the pool */
- struct hash_table *bos;
-
- /* Current transient BO */
- struct panfrost_bo *transient_bo;
-
- /* Within the topmost transient BO, how much has been used? */
- unsigned transient_offset;
-};
-
-struct pan_pool
-panfrost_create_pool(void *memctx, struct panfrost_device *dev);
-
-/* Represents a fat pointer for GPU-mapped memory, returned from the transient
- * allocator and not used for much else */
-
-struct panfrost_transfer {
- uint8_t *cpu;
- mali_ptr gpu;
-};
-
-struct panfrost_transfer
-panfrost_pool_alloc(struct pan_pool *pool, size_t sz);
-
-mali_ptr
-panfrost_pool_upload(struct pan_pool *pool, const void *data, size_t sz);
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- */
-
-#include <xf86drm.h>
-
-#include "util/u_math.h"
-#include "util/macros.h"
-#include "util/hash_table.h"
-#include "util/u_thread.h"
-#include "drm-uapi/panfrost_drm.h"
-#include "pan_encoder.h"
-#include "pan_device.h"
-#include "panfrost-quirks.h"
-#include "pan_bo.h"
-
-/* Abstraction over the raw drm_panfrost_get_param ioctl for fetching
- * information about devices */
-
-static __u64
-panfrost_query_raw(
- int fd,
- enum drm_panfrost_param param,
- bool required,
- unsigned default_value)
-{
- struct drm_panfrost_get_param get_param = {0,};
- ASSERTED int ret;
-
- get_param.param = param;
- ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param);
-
- if (ret) {
- assert(!required);
- return default_value;
- }
-
- return get_param.value;
-}
-
-unsigned
-panfrost_query_gpu_version(int fd)
-{
- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0);
-}
-
-unsigned
-panfrost_query_core_count(int fd)
-{
- /* On older kernels, worst-case to 16 cores */
-
- unsigned mask = panfrost_query_raw(fd,
- DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff);
-
- return util_bitcount(mask);
-}
-
-unsigned
-panfrost_query_thread_tls_alloc(int fd)
-{
- /* On older kernels, we worst-case to 256 threads, the architectural
-         * maximum for Midgard. On my current kernel/hardware, this reads
-         * back as 0, so we'll worst-case there too */
-
- unsigned tls = panfrost_query_raw(fd,
- DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 256);
-
- if (tls)
- return tls;
- else
- return 256;
-}
-
-static uint32_t
-panfrost_query_compressed_formats(int fd)
-{
- /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and
- * should exist on any Mali configuration. All hardware should report
- * these texture formats but the kernel might not be new enough. */
-
- uint32_t default_set =
- (1 << MALI_ETC2_RGB8) |
- (1 << MALI_ETC2_R11_UNORM) |
- (1 << MALI_ETC2_RGBA8) |
- (1 << MALI_ETC2_RG11_UNORM) |
- (1 << MALI_ETC2_R11_SNORM) |
- (1 << MALI_ETC2_RG11_SNORM) |
- (1 << MALI_ETC2_RGB8A1) |
- (1 << MALI_ASTC_3D_LDR) |
- (1 << MALI_ASTC_3D_HDR) |
- (1 << MALI_ASTC_2D_LDR) |
- (1 << MALI_ASTC_2D_HDR);
-
- return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0,
- false, default_set);
-}
-
-/* DRM_PANFROST_PARAM_TEXTURE_FEATURES0 will return a bitmask of supported
- * compressed formats, so we offer a helper to test if a format is supported */
-
-bool
-panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt)
-{
- if (MALI_EXTRACT_TYPE(fmt) != MALI_FORMAT_COMPRESSED)
- return true;
-
- unsigned idx = fmt & ~MALI_FORMAT_COMPRESSED;
- assert(idx < 32);
-
- return dev->compressed_formats & (1 << idx);
-}
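
/* Usage sketch: gate an ETC2 sampler view on what the kernel reported.  For
 * non-compressed formats the helper returns true unconditionally, so it is
 * safe to call on every entry of the format table. */

if (!panfrost_supports_compressed_format(dev, MALI_ETC2_RGB8))
        return false; /* e.g. fall back to an uncompressed upload path */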
-
-/* Given a GPU ID like 0x860, return a prettified model name */
-
-const char *
-panfrost_model_name(unsigned gpu_id)
-{
- switch (gpu_id) {
- case 0x600: return "Mali T600 (Panfrost)";
- case 0x620: return "Mali T620 (Panfrost)";
- case 0x720: return "Mali T720 (Panfrost)";
- case 0x820: return "Mali T820 (Panfrost)";
- case 0x830: return "Mali T830 (Panfrost)";
- case 0x750: return "Mali T760 (Panfrost)";
- case 0x860: return "Mali T860 (Panfrost)";
- case 0x880: return "Mali T880 (Panfrost)";
- case 0x7093: return "Mali G31 (Panfrost)";
- case 0x7212: return "Mali G52 (Panfrost)";
- default:
- unreachable("Invalid GPU ID");
- }
-}
-
-void
-panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev)
-{
- dev->fd = fd;
- dev->memctx = memctx;
- dev->gpu_id = panfrost_query_gpu_version(fd);
- dev->core_count = panfrost_query_core_count(fd);
- dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd);
- dev->kernel_version = drmGetVersion(fd);
- dev->quirks = panfrost_get_quirks(dev->gpu_id);
- dev->compressed_formats = panfrost_query_compressed_formats(fd);
-
- util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512);
-
- pthread_mutex_init(&dev->bo_cache.lock, NULL);
- list_inithead(&dev->bo_cache.lru);
-
- for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i)
- list_inithead(&dev->bo_cache.buckets[i]);
-}
-
-void
-panfrost_close_device(struct panfrost_device *dev)
-{
- panfrost_bo_unreference(dev->blit_shaders.bo);
- panfrost_bo_cache_evict_all(dev);
- pthread_mutex_destroy(&dev->bo_cache.lock);
- drmFreeVersion(dev->kernel_version);
- util_sparse_array_finish(&dev->bo_map);
-
-}
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include "pan_encoder.h"
-
-/* Sampler comparison functions are flipped in OpenGL from the hardware, so we
- * need to be able to flip accordingly */
-
-enum mali_func
-panfrost_flip_compare_func(enum mali_func f)
-{
- switch (f) {
- case MALI_FUNC_LESS:
- return MALI_FUNC_GREATER;
- case MALI_FUNC_GREATER:
- return MALI_FUNC_LESS;
- case MALI_FUNC_LEQUAL:
- return MALI_FUNC_GEQUAL;
- case MALI_FUNC_GEQUAL:
- return MALI_FUNC_LEQUAL;
- default:
- return f;
- }
-}
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include <string.h>
-#include "pan_scoreboard.h"
-#include "pan_device.h"
-#include "panfrost-quirks.h"
-
-/*
- * There are various types of Mali jobs:
- *
- * - WRITE_VALUE: generic write primitive, used to zero tiler field
- * - VERTEX: runs a vertex shader
- * - TILER: runs tiling and sets up a fragment shader
- * - FRAGMENT: runs fragment shaders and writes out
- * - COMPUTE: runs a compute shader
- * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
- * - GEOMETRY: runs a geometry shader (unimplemented)
- * - CACHE_FLUSH: unseen in the wild, theoretically cache flush
- *
- * In between a full batch and a single Mali job is the "job chain", a series
- * of Mali jobs together forming a linked list. Within the job chain, each Mali
- * job can set (up to) two dependencies on other earlier jobs in the chain.
- * This dependency graph forms a scoreboard. The general idea of a scoreboard
- * applies: when there is a data dependency of job B on job A, job B sets one
- * of its dependency indices to job A, ensuring that job B won't start until
- * job A finishes.
- *
- * More specifically, here are a set of rules:
- *
- * - A write value job must appear if and only if there is at least one tiler
- * job, and tiler jobs must depend on it.
- *
- * - Vertex jobs and tiler jobs are independent.
- *
- * - A tiler job must have a dependency on its data source. If it's getting
- * data from a vertex job, it depends on the vertex job. If it's getting data
- * from software, this is null.
- *
- * - Tiler jobs must depend on the write value job (chained or otherwise).
- *
- * - Tiler jobs must be strictly ordered. So each tiler job must depend on the
- * previous job in the chain.
- *
 * - Linking jobs via next_job has no bearing on the order of execution; it
- * just establishes the linked list of jobs, EXCEPT:
- *
- * - A job's dependencies must appear earlier in the linked list (job chain).
- *
- * Justification for each rule:
- *
- * - Write value jobs are used to write a zero into a magic tiling field, which
- * enables tiling to work. If tiling occurs, they are needed; if it does not,
- * we cannot emit them since then tiling partially occurs and it's bad.
- *
- * - The hardware has no notion of a "vertex/tiler job" (at least not our
- *   hardware; other revisions have fused jobs, which only complicates
- *   matters). They are independent units that take in data, process
- * it, and spit out data.
- *
- * - Any job must depend on its data source, in fact, or risk a
- * read-before-write hazard. Tiler jobs get their data from vertex jobs, ergo
- * tiler jobs depend on the corresponding vertex job (if it's there).
- *
- * - The tiler is not thread-safe; this dependency prevents race conditions
- * between two different jobs trying to write to the tiler outputs at the
- * same time.
- *
- * - Internally, jobs are scoreboarded; the next job fields just form a linked
- * list to allow the jobs to be read in; the execution order is from
- * resolving the dependency fields instead.
- *
- * - The hardware cannot set a dependency on a job it doesn't know about yet,
- * and dependencies are processed in-order of the next job fields.
- *
- */
-
-/* Generates, uploads, and queues a new job. All fields are written in order
- * except for next_job accounting (TODO: Should we be clever and defer the
- * upload of the header here until the next job to keep the access pattern
- * totally linear? Or is that just a micro-optimization at this point?).
- * Returns the generated index for dep management.
- *
- * Inject is used to inject a job at the front, for wallpapering. If you are
- * not wallpapering and set this, dragons will eat you. */
-
-unsigned
-panfrost_new_job(
- struct pan_pool *pool,
- struct pan_scoreboard *scoreboard,
- enum mali_job_type type,
- bool barrier,
- unsigned local_dep,
- void *payload, size_t payload_size,
- bool inject)
-{
- unsigned global_dep = 0;
-
- if (type == JOB_TYPE_TILER) {
- /* Tiler jobs must be chained, and on Midgard, the first tiler
- * job must depend on the write value job, whose index we
- * reserve now */
-
- if (scoreboard->tiler_dep)
- global_dep = scoreboard->tiler_dep;
- else if (!(pool->dev->quirks & IS_BIFROST)) {
- scoreboard->write_value_index = ++scoreboard->job_index;
- global_dep = scoreboard->write_value_index;
- }
- }
-
- /* Assign the index */
- unsigned index = ++scoreboard->job_index;
-
- struct mali_job_descriptor_header job = {
- .job_descriptor_size = 1,
- .job_type = type,
- .job_barrier = barrier,
- .job_index = index,
- .job_dependency_index_1 = local_dep,
- .job_dependency_index_2 = global_dep,
- };
-
- if (inject)
- job.next_job = scoreboard->first_job;
-
- struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + payload_size);
- memcpy(transfer.cpu, &job, sizeof(job));
- memcpy(transfer.cpu + sizeof(job), payload, payload_size);
-
- if (inject) {
- scoreboard->first_job = transfer.gpu;
- return index;
- }
-
- /* Form a chain */
- if (type == JOB_TYPE_TILER)
- scoreboard->tiler_dep = index;
-
- if (scoreboard->prev_job)
- scoreboard->prev_job->next_job = transfer.gpu;
- else
- scoreboard->first_job = transfer.gpu;
-
- scoreboard->prev_job = (struct mali_job_descriptor_header *) transfer.cpu;
- return index;
-}
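
/* Illustrative sketch of a draw under the rules above, assuming a filled-in
 * vertex/tiler payload from the Gallium driver (the payload variable is an
 * assumption, not part of this file): a vertex job with no dependencies, then
 * a tiler job that names it as local_dep.  The WRITE_VALUE dependency and the
 * tiler->tiler ordering are handled inside panfrost_new_job itself. */

unsigned vertex = panfrost_new_job(pool, scoreboard, JOB_TYPE_VERTEX,
                                   false, 0, &payload, sizeof(payload), false);

panfrost_new_job(pool, scoreboard, JOB_TYPE_TILER,
                 false, vertex, &payload, sizeof(payload), false);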
-
-/* Generates a write value job, used to initialize the tiler structures. Note
- * this is called right before frame submission. */
-
-void
-panfrost_scoreboard_initialize_tiler(struct pan_pool *pool,
- struct pan_scoreboard *scoreboard,
- mali_ptr polygon_list)
-{
- /* Check if we even need tiling */
- if (pool->dev->quirks & IS_BIFROST || !scoreboard->tiler_dep)
- return;
-
- /* Okay, we do. Let's generate it. We'll need the job's polygon list
- * regardless of size. */
-
- struct mali_job_descriptor_header job = {
- .job_type = JOB_TYPE_WRITE_VALUE,
- .job_index = scoreboard->write_value_index,
- .job_descriptor_size = 1,
- .next_job = scoreboard->first_job
- };
-
- struct mali_payload_write_value payload = {
- .address = polygon_list,
- .value_descriptor = MALI_WRITE_VALUE_ZERO,
- };
-
- struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + sizeof(payload));
- memcpy(transfer.cpu, &job, sizeof(job));
- memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));
-
- scoreboard->first_job = transfer.gpu;
-}
+++ /dev/null
-/*
- * Copyright (C) 2019-2020 Collabora Ltd.
- * Copyright (C) 2019 Alyssa Rosenzweig
- * Copyright (C) 2014-2017 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef __PAN_SCOREBOARD_H__
-#define __PAN_SCOREBOARD_H__
-
-#include "panfrost-job.h"
-#include "pan_pool.h"
-
-struct pan_scoreboard {
- /* The first job in the batch */
- mali_ptr first_job;
-
- /* The number of jobs in the primary batch, essentially */
- unsigned job_index;
-
- /* A CPU-side pointer to the previous job for next_job linking */
- struct mali_job_descriptor_header *prev_job;
-
- /* The dependency for tiler jobs (i.e. the index of the last emitted
- * tiler job, or zero if none have been emitted) */
- unsigned tiler_dep;
-
- /* The job index of the WRITE_VALUE job (before it has been created) */
- unsigned write_value_index;
-};
-
-unsigned
-panfrost_new_job(
- struct pan_pool *pool,
- struct pan_scoreboard *scoreboard,
- enum mali_job_type type,
- bool barrier,
- unsigned local_dep,
- void *payload, size_t payload_size,
- bool inject);
-
-void panfrost_scoreboard_initialize_tiler(
- struct pan_pool *pool,
- struct pan_scoreboard *scoreboard,
- mali_ptr polygon_list);
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- */
-
-#include "util/u_math.h"
-#include "pan_encoder.h"
-
-/* Midgard has a small register file, so shaders with high register pressure
- * need to spill from the register file onto the stack. In addition to
- * spilling, it is desirable to allocate temporary arrays on the stack (for
- * instance because the register file does not support indirect access but the
- * stack does).
- *
- * The stack is located in "Thread Local Storage", sometimes abbreviated TLS in
- * the kernel source code. Thread local storage is allocated per-thread,
- * per-core, so threads executing concurrently do not interfere with each
- * other's stacks. On modern kernels, we may query
- * DRM_PANFROST_PARAM_THREAD_TLS_ALLOC for the number of threads per core we
- * must allocate for, and DRM_PANFROST_PARAM_SHADER_PRESENT for a bitmask of
- * shader cores (so take a popcount of that mask for the number of shader
- * cores). On older kernels that do not support querying these values,
- * following kbase, we may use the worst-case value of 256 threads for
- * THREAD_TLS_ALLOC, and the worst-case value of 16 cores for Midgard per the
- * "shader core count" column of the implementations table in
- * https://en.wikipedia.org/wiki/Mali_%28GPU% [citation needed]
- *
- * Within a particular thread, stack may be allocated. If it is present, its
- * size is a power-of-two, and it is at least 16 bytes. Stack is allocated
- * with the shared memory descriptor used for all shaders within a frame (note
- * that they don't execute concurrently so it's fine). So, consider the maximum
- * stack size used by any shader within a job, and then compute (where npot
- * denotes the next power of two):
- *
- * bytes/thread = npot(max(size, 16))
- * allocated = (# of bytes/thread) * (# of threads/core) * (# of cores)
- *
- * The size of Thread Local Storage is signaled to the GPU in a dedicated
- * log_stack_size field. Since stack sizes are powers of two, it follows that
- * stack_size is logarithmic. Consider some sample values:
- *
- * stack size | log_stack_size
- * ---------------------------
- * 256 | 4
- * 512 | 5
- * 1024 | 6
- *
- * Noting that log2(256) = 8, we have the relation:
- *
- * stack_size <= 2^(log_stack_size + 4)
- *
- * Given the constraints about powers-of-two and the minimum of 256, we thus
- * derive a formula for log_stack_size in terms of stack size (s), where s is
- * positive:
- *
- * log_stack_size = ceil(log2(max(s, 16))) - 4
- *
- * There are other valid characterisations of this formula, of course, but this
- * is computationally simple, so good enough for our purposes. If s=0, since
- * there is no spilling used whatsoever, we may set log_stack_size to 0 to
- * disable the stack.
- */
-
-/* Computes log_stack_size = ceil(log2(max(s, 16))) - 4 */
-
-unsigned
-panfrost_get_stack_shift(unsigned stack_size)
-{
- if (stack_size)
- return util_logbase2_ceil(MAX2(stack_size, 16)) - 4;
- else
- return 0;
-}
-
-/* Computes the aligned stack size given the shift and thread count. The blob
- * reserves an extra page, and since this is hardware-internal, we do too. */
-
-unsigned
-panfrost_get_total_stack_size(
- unsigned stack_shift,
- unsigned threads_per_core,
- unsigned core_count)
-{
- unsigned size_per_thread = MAX2(1 << (stack_shift + 4), 32);
- unsigned size = size_per_thread * threads_per_core * core_count;
-
- return size + 4096;
-}
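
/* Worked example (numbers hypothetical): a shader spilling 500 bytes on a
 * 4-core part whose kernel reports 256 threads per core. */

unsigned shift = panfrost_get_stack_shift(500);  /* ceil(log2(500)) - 4 = 5 */
unsigned bytes = panfrost_get_total_stack_size(shift, 256, 4);
/* (1 << 9) bytes/thread * 256 threads * 4 cores + 4096 = 528384 bytes */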
+++ /dev/null
-/*
- * Copyright (C) 2008 VMware, Inc.
- * Copyright (C) 2014 Broadcom
- * Copyright (C) 2018-2019 Alyssa Rosenzweig
- * Copyright (C) 2019-2020 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#include "util/macros.h"
-#include "util/u_math.h"
-#include "pan_texture.h"
-
-/* Generates a texture descriptor. Ideally, descriptors are immutable after the
- * texture is created, so we can keep these hanging around in GPU memory in a
- * dedicated BO and not have to worry. In practice there are some minor gotchas
- * with this (the driver will sometimes change the format of a texture on the
- * fly for compression) but it's fast enough to just regenerate the descriptor
- * in those cases, rather than monkeypatching at draw time.
- *
- * A texture descriptor consists of a 32-byte mali_texture_descriptor structure
- * followed by a variable number of pointers. Due to this variance and
- * potentially large size, we actually upload directly rather than returning
- * the descriptor. Whether the user does a copy themselves or not is irrelevant
- * to us here.
- */
-
-/* Check if we need to set a custom stride by computing the "expected"
- * stride and comparing it to what the user actually wants. Only applies
- * to linear textures, since tiled/compressed textures have strict
- * alignment requirements for their strides as it is */
-
-static bool
-panfrost_needs_explicit_stride(
- struct panfrost_slice *slices,
- uint16_t width,
- unsigned first_level, unsigned last_level,
- unsigned bytes_per_pixel)
-{
- for (unsigned l = first_level; l <= last_level; ++l) {
- unsigned actual = slices[l].stride;
- unsigned expected = u_minify(width, l) * bytes_per_pixel;
-
- if (actual != expected)
- return true;
- }
-
- return false;
-}
-
-/* Adaptive Scalable Texture Compression (ASTC) corresponds to just a few
- * texture types in the hardware, but can in fact be parametrized with various
- * block widths and heights via the so-called "stretch factor". It turns out
- * these parameters are stuffed in the bottom bits of the payload pointers.
- * This function computes these magic stuffing constants based on the ASTC
- * format in use. The constant in a given dimension is 3 bits, and two are
- * stored side-by-side for each active dimension.
- */
-
-static unsigned
-panfrost_astc_stretch(unsigned dim)
-{
- assert(dim >= 4 && dim <= 12);
- return MIN2(dim, 11) - 4;
-}
-
-/* Texture addresses are tagged with information about compressed formats.
- * AFBC uses a bit for whether the colorspace transform is enabled (RGB and
- * RGBA only).
- * For ASTC, this is a "stretch factor" encoding the block size. */
-
-static unsigned
-panfrost_compression_tag(
- const struct util_format_description *desc,
- enum mali_format format, enum mali_texture_layout layout)
-{
- if (layout == MALI_TEXTURE_AFBC)
- return desc->nr_channels >= 3;
- else if (format == MALI_ASTC_2D_LDR || format == MALI_ASTC_2D_HDR)
- return (panfrost_astc_stretch(desc->block.height) << 3) |
- panfrost_astc_stretch(desc->block.width);
- else
- return 0;
-}
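
/* Worked example as a standalone sketch: for an ASTC 8x6 texture the tag is
 * (stretch(6) << 3) | stretch(8) = (2 << 3) | 4 = 0x14, OR'd into the low
 * bits of the payload pointer; an AFBC RGB/RGBA surface simply gets tag 1. */

#include <assert.h>

static unsigned
astc_stretch_example(unsigned dim)
{
        return (dim < 11 ? dim : 11) - 4; /* MIN2(dim, 11) - 4 */
}

int main(void)
{
        assert(((astc_stretch_example(6) << 3) | astc_stretch_example(8)) == 0x14);
        return 0;
}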
-
-
-/* Cubemaps have 6 faces as "layers" between each actual layer. We need to
- * fix this up. TODO: the logic is wrong in the asserted-out cases; can they
- * happen, perhaps with cubemap arrays? */
-
-static void
-panfrost_adjust_cube_dimensions(
- unsigned *first_face, unsigned *last_face,
- unsigned *first_layer, unsigned *last_layer)
-{
- *first_face = *first_layer % 6;
- *last_face = *last_layer % 6;
- *first_layer /= 6;
- *last_layer /= 6;
-
- assert((*first_layer == *last_layer) || (*first_face == 0 && *last_face == 5));
-}
-
-/* Following the texture descriptor is a number of pointers. How many? */
-
-static unsigned
-panfrost_texture_num_elements(
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned nr_samples,
- bool is_cube, bool manual_stride)
-{
- unsigned first_face = 0, last_face = 0;
-
- if (is_cube) {
- panfrost_adjust_cube_dimensions(&first_face, &last_face,
- &first_layer, &last_layer);
- }
-
- unsigned levels = 1 + last_level - first_level;
- unsigned layers = 1 + last_layer - first_layer;
- unsigned faces = 1 + last_face - first_face;
- unsigned num_elements = levels * layers * faces * MAX2(nr_samples, 1);
-
- if (manual_stride)
- num_elements *= 2;
-
- return num_elements;
-}
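
/* Worked examples for the element count above (illustrative asserts):
 *   - linear 2D, mips 0..4, a single layer, manual strides interleaved:
 *       5 levels * 1 layer * 1 face * 1 sample * 2 = 10 pointers (80 bytes)
 *   - cube map, one mip, layers 0..5 (the six faces):
 *       1 level * 1 layer * 6 faces * 1 sample     =  6 pointers */

assert(panfrost_texture_num_elements(0, 4, 0, 0, 1, false, true) == 10);
assert(panfrost_texture_num_elements(0, 0, 0, 5, 1, true, false) == 6);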
-
-/* Conservative a priori estimate of the size of the texture payload. In the
- * average case it equals the actual size; in the worst case it is off by 2x
- * (when a manual stride turns out not to be needed on a linear texture). The
- * returned value is always greater than or equal to the actual size, so it's
- * safe to use as an allocation amount */
-
-unsigned
-panfrost_estimate_texture_payload_size(
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned nr_samples,
- enum mali_texture_type type, enum mali_texture_layout layout)
-{
- /* Assume worst case */
- unsigned manual_stride = (layout == MALI_TEXTURE_LINEAR);
-
- unsigned elements = panfrost_texture_num_elements(
- first_level, last_level,
- first_layer, last_layer,
- nr_samples,
- type == MALI_TEX_CUBE, manual_stride);
-
- return sizeof(mali_ptr) * elements;
-}
-
-/* Bifrost requires a tile stride for tiled textures. This stride is computed
- * as (16 * bpp * width), with the width aligned up to a whole 16-texel tile.
- * If the level is no taller than a single tile (height <= 16), the blob puts
- * zero instead. Interactions with AFBC are currently unknown.
- */
-
-static unsigned
-panfrost_nonlinear_stride(enum mali_texture_layout layout,
- unsigned bytes_per_pixel,
- unsigned width,
- unsigned height)
-{
- if (layout == MALI_TEXTURE_TILED) {
- return (height <= 16) ? 0 : (16 * bytes_per_pixel * ALIGN_POT(width, 16));
- } else {
- unreachable("TODO: AFBC on Bifrost");
- }
-}
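
/* Worked example: a tiled RGBA8 (4 bytes per pixel) level of 100x64 texels
 * gets stride 16 * 4 * ALIGN_POT(100, 16) = 16 * 4 * 112 = 7168 bytes, while
 * a level no taller than one tile gets stride 0, matching the blob. */

assert(panfrost_nonlinear_stride(MALI_TEXTURE_TILED, 4, 100, 64) == 7168);
assert(panfrost_nonlinear_stride(MALI_TEXTURE_TILED, 4, 100, 16) == 0);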
-
-static void
-panfrost_emit_texture_payload(
- mali_ptr *payload,
- const struct util_format_description *desc,
- enum mali_format mali_format,
- enum mali_texture_type type,
- enum mali_texture_layout layout,
- unsigned width, unsigned height,
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned nr_samples,
- unsigned cube_stride,
- bool manual_stride,
- mali_ptr base,
- struct panfrost_slice *slices)
-{
- base |= panfrost_compression_tag(desc, mali_format, layout);
-
- /* Inject the addresses in, interleaving array indices, mip levels,
- * cube faces, and strides in that order */
-
- unsigned first_face = 0, last_face = 0, face_mult = 1;
-
- if (type == MALI_TEX_CUBE) {
- face_mult = 6;
- panfrost_adjust_cube_dimensions(&first_face, &last_face, &first_layer, &last_layer);
- }
-
- nr_samples = MAX2(nr_samples, 1);
-
- unsigned idx = 0;
-
- for (unsigned w = first_layer; w <= last_layer; ++w) {
- for (unsigned l = first_level; l <= last_level; ++l) {
- for (unsigned f = first_face; f <= last_face; ++f) {
- for (unsigned s = 0; s < nr_samples; ++s) {
- payload[idx++] = base + panfrost_texture_offset(
- slices, type == MALI_TEX_3D,
- cube_stride, l, w * face_mult + f, s);
-
- if (manual_stride) {
- payload[idx++] = (layout == MALI_TEXTURE_LINEAR) ?
- slices[l].stride :
- panfrost_nonlinear_stride(layout,
- MAX2(desc->block.bits / 8, 1),
- u_minify(width, l),
- u_minify(height, l));
- }
- }
- }
- }
- }
-}
-
-#define MALI_SWIZZLE_R001 \
- (MALI_CHANNEL_RED << 0) | \
- (MALI_CHANNEL_ZERO << 3) | \
- (MALI_CHANNEL_ZERO << 6) | \
- (MALI_CHANNEL_ONE << 9)
-
-#define MALI_SWIZZLE_A001 \
- (MALI_CHANNEL_ALPHA << 0) | \
- (MALI_CHANNEL_ZERO << 3) | \
- (MALI_CHANNEL_ZERO << 6) | \
- (MALI_CHANNEL_ONE << 9)
-
-
-void
-panfrost_new_texture(
- void *out,
- uint16_t width, uint16_t height,
- uint16_t depth, uint16_t array_size,
- enum pipe_format format,
- enum mali_texture_type type,
- enum mali_texture_layout layout,
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned nr_samples,
- unsigned cube_stride,
- unsigned swizzle,
- mali_ptr base,
- struct panfrost_slice *slices)
-{
- const struct util_format_description *desc =
- util_format_description(format);
-
- unsigned bytes_per_pixel = util_format_get_blocksize(format);
-
- enum mali_format mali_format = panfrost_pipe_format_table[desc->format].hw;
- assert(mali_format);
-
- bool manual_stride = (layout == MALI_TEXTURE_LINEAR)
- && panfrost_needs_explicit_stride(slices, width,
- first_level, last_level, bytes_per_pixel);
-
- struct mali_texture_descriptor descriptor = {
- .width = MALI_POSITIVE(u_minify(width, first_level)),
- .height = MALI_POSITIVE(u_minify(height, first_level)),
- .depth = MALI_POSITIVE(u_minify(depth, first_level)),
- .array_size = MALI_POSITIVE(array_size),
- .format = {
- .swizzle = (format == PIPE_FORMAT_X24S8_UINT) ?
- MALI_SWIZZLE_A001 :
- (format == PIPE_FORMAT_S8_UINT) ?
- MALI_SWIZZLE_R001 :
- panfrost_translate_swizzle_4(desc->swizzle),
- .format = mali_format,
- .srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB),
- .type = type,
- .layout = layout,
- .manual_stride = manual_stride,
- .unknown2 = 1,
- },
- .levels = last_level - first_level,
- .swizzle = swizzle
- };
-
- memcpy(out, &descriptor, sizeof(descriptor));
-
- mali_ptr *payload = (mali_ptr *) (out + sizeof(struct mali_texture_descriptor));
- panfrost_emit_texture_payload(
- payload,
- desc,
- mali_format,
- type,
- layout,
- width, height,
- first_level, last_level,
- first_layer, last_layer,
- nr_samples,
- cube_stride,
- manual_stride,
- base,
- slices);
-}
-
-void
-panfrost_new_texture_bifrost(
- struct bifrost_texture_descriptor *descriptor,
- uint16_t width, uint16_t height,
- uint16_t depth, uint16_t array_size,
- enum pipe_format format,
- enum mali_texture_type type,
- enum mali_texture_layout layout,
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned nr_samples,
- unsigned cube_stride,
- unsigned swizzle,
- mali_ptr base,
- struct panfrost_slice *slices,
- struct panfrost_bo *payload)
-{
- const struct util_format_description *desc =
- util_format_description(format);
-
- enum mali_format mali_format = panfrost_pipe_format_table[desc->format].hw;
- assert(mali_format);
-
- panfrost_emit_texture_payload(
- (mali_ptr *) payload->cpu,
- desc,
- mali_format,
- type,
- layout,
- width, height,
- first_level, last_level,
- first_layer, last_layer,
- nr_samples,
- cube_stride,
- true, /* Stride explicit on Bifrost */
- base,
- slices);
-
- descriptor->format_unk = 0x2;
- descriptor->type = type;
- descriptor->format = mali_format;
- descriptor->srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
- descriptor->format_unk3 = 0x0;
- descriptor->width = MALI_POSITIVE(u_minify(width, first_level));
- descriptor->height = MALI_POSITIVE(u_minify(height, first_level));
- descriptor->swizzle = swizzle;
- descriptor->layout = layout;
- descriptor->levels = last_level - first_level;
- descriptor->unk1 = 0x0;
- descriptor->levels_unk = 0;
- descriptor->level_2 = last_level - first_level;
- descriptor->payload = payload->gpu;
- descriptor->array_size = MALI_POSITIVE(array_size);
- descriptor->unk4 = 0x0;
- descriptor->depth = MALI_POSITIVE(u_minify(depth, first_level));
- descriptor->unk5 = 0x0;
-}
-
-/* Computes sizes for checksumming, which is 8 bytes per 16x16 tile.
- * Checksumming is believed to be a CRC variant (CRC64 based on the size?).
- * This feature is also known as "transaction elimination". */
-
-#define CHECKSUM_TILE_WIDTH 16
-#define CHECKSUM_TILE_HEIGHT 16
-#define CHECKSUM_BYTES_PER_TILE 8
-
-unsigned
-panfrost_compute_checksum_size(
- struct panfrost_slice *slice,
- unsigned width,
- unsigned height)
-{
- unsigned aligned_width = ALIGN_POT(width, CHECKSUM_TILE_WIDTH);
- unsigned aligned_height = ALIGN_POT(height, CHECKSUM_TILE_HEIGHT);
-
- unsigned tile_count_x = aligned_width / CHECKSUM_TILE_WIDTH;
- unsigned tile_count_y = aligned_height / CHECKSUM_TILE_HEIGHT;
-
- slice->checksum_stride = tile_count_x * CHECKSUM_BYTES_PER_TILE;
-
- return slice->checksum_stride * tile_count_y;
-}
-
-unsigned
-panfrost_get_layer_stride(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level)
-{
- return is_3d ? slices[level].size0 : cube_stride;
-}
-
-/* Computes the offset into a texture at a particular level/face. Add to
- * the base address of a texture to get the address to that level/face */
-
-unsigned
-panfrost_texture_offset(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level, unsigned face, unsigned sample)
-{
- unsigned layer_stride = panfrost_get_layer_stride(slices, is_3d, cube_stride, level);
- return slices[level].offset + (face * layer_stride) + (sample * slices[level].size0);
-}
+++ /dev/null
-/*
- * Copyright (C) 2008 VMware, Inc.
- * Copyright (C) 2014 Broadcom
- * Copyright (C) 2018-2019 Alyssa Rosenzweig
- * Copyright (C) 2019-2020 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-
-#ifndef __PAN_TEXTURE_H
-#define __PAN_TEXTURE_H
-
-#include <stdbool.h>
-#include "util/format/u_format.h"
-#include "compiler/shader_enums.h"
-#include "panfrost-job.h"
-#include "pan_bo.h"
-
-struct panfrost_slice {
- unsigned offset;
- unsigned stride;
- unsigned size0;
-
- /* If there is a header preceding each slice, how big is
- * that header? Used for AFBC */
- unsigned header_size;
-
- /* If checksumming is enabled following the slice, what
- * is its offset/stride? */
- unsigned checksum_offset;
- unsigned checksum_stride;
- struct panfrost_bo *checksum_bo;
-
- /* Has anything been written to this slice? */
- bool initialized;
-};
-
-struct pan_image {
- /* Format and size */
- uint16_t width0, height0, depth0, array_size;
- enum pipe_format format;
- enum mali_texture_type type;
- unsigned first_level, last_level;
- unsigned first_layer, last_layer;
- unsigned nr_samples;
- struct panfrost_bo *bo;
- struct panfrost_slice *slices;
- unsigned cubemap_stride;
- enum mali_texture_layout layout;
-};
-
-unsigned
-panfrost_compute_checksum_size(
- struct panfrost_slice *slice,
- unsigned width,
- unsigned height);
-
-/* AFBC */
-
-bool
-panfrost_format_supports_afbc(enum pipe_format format);
-
-unsigned
-panfrost_afbc_header_size(unsigned width, unsigned height);
-
-/* mali_texture_descriptor */
-
-unsigned
-panfrost_estimate_texture_payload_size(
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned nr_samples,
- enum mali_texture_type type, enum mali_texture_layout layout);
-
-void
-panfrost_new_texture(
- void *out,
- uint16_t width, uint16_t height,
- uint16_t depth, uint16_t array_size,
- enum pipe_format format,
- enum mali_texture_type type,
- enum mali_texture_layout layout,
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned nr_samples,
- unsigned cube_stride,
- unsigned swizzle,
- mali_ptr base,
- struct panfrost_slice *slices);
-
-void
-panfrost_new_texture_bifrost(
- struct bifrost_texture_descriptor *descriptor,
- uint16_t width, uint16_t height,
- uint16_t depth, uint16_t array_size,
- enum pipe_format format,
- enum mali_texture_type type,
- enum mali_texture_layout layout,
- unsigned first_level, unsigned last_level,
- unsigned first_layer, unsigned last_layer,
- unsigned nr_samples,
- unsigned cube_stride,
- unsigned swizzle,
- mali_ptr base,
- struct panfrost_slice *slices,
- struct panfrost_bo *payload);
-
-
-unsigned
-panfrost_get_layer_stride(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level);
-
-unsigned
-panfrost_texture_offset(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level, unsigned face, unsigned sample);
-
-/* Formats */
-
-struct panfrost_format {
- enum mali_format hw;
- unsigned bind;
-};
-
-extern struct panfrost_format panfrost_pipe_format_table[PIPE_FORMAT_COUNT];
-
-bool
-panfrost_is_z24s8_variant(enum pipe_format fmt);
-
-unsigned
-panfrost_translate_swizzle_4(const unsigned char swizzle[4]);
-
-void
-panfrost_invert_swizzle(const unsigned char *in, unsigned char *out);
-
-static inline unsigned
-panfrost_get_default_swizzle(unsigned components)
-{
- switch (components) {
- case 1:
- return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_ZERO << 3) |
- (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9);
- case 2:
- return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) |
- (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9);
- case 3:
- return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) |
- (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ONE << 9);
- case 4:
- return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) |
- (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9);
- default:
- unreachable("Invalid number of components");
- }
-}
-
-static inline unsigned
-panfrost_bifrost_swizzle(unsigned components)
-{
- /* Set all components to 0 and force w if needed */
- return components < 4 ? 0x10 : 0x00;
-}
-
-enum mali_format
-panfrost_format_to_bifrost_blend(const struct util_format_description *desc);
-
-struct pan_pool;
-struct pan_scoreboard;
-
-void
-panfrost_init_blit_shaders(struct panfrost_device *dev);
-
-void
-panfrost_load_midg(
- struct pan_pool *pool,
- struct pan_scoreboard *scoreboard,
- mali_ptr blend_shader,
- mali_ptr fbd,
- mali_ptr coordinates, unsigned vertex_count,
- struct pan_image *image,
- unsigned loc);
-
-#endif
+++ /dev/null
-/*
- * Copyright (C) 2019 Collabora, Ltd.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
- */
-
-#include "util/u_math.h"
-#include "util/macros.h"
-#include "pan_encoder.h"
-
-/* Mali GPUs are tiled-mode renderers, rather than immediate-mode.
- * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run.
- * Then, a fixed-function hardware block (the tiler) consumes the gl_Position
- * results. For each triangle specified, it marks each containing tile as
- * containing that triangle. This set of "triangles per tile" form the "polygon
- * list". Finally, the rasterization unit consumes the polygon list to invoke
- * the fragment shader.
- *
- * In practice, it's a bit more complicated than this. On Midgard chips with an
- * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical
- * tile size, but Midgard features "hierarchical tiling", where power-of-two
- * multiples of the base tile size can be used: hierarchy level 0 (16x16),
- * level 1 (32x32), level 2 (64x64), per public information about Midgard's
- * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice
- * 128x128 is the largest usually used (though higher modes are enabled). The
- * idea behind hierarchical tiling is to use low tiling levels for small
- * triangles and high levels for large triangles, to minimize memory bandwidth
- * and repeated fragment shader invocations (the former issue inherent to
- * immediate-mode rendering and the latter common in traditional tilers).
- *
- * The tiler itself works by reading varyings in and writing a polygon list
- * out. Unfortunately (for us), both of these buffers are managed in main
- * memory; although they ideally will be cached, it is the drivers'
- * responsibility to allocate these buffers. Varying buffer allocation is
- * handled elsewhere, as it is not tiler specific; the real issue is allocating
- * the polygon list.
- *
- * This is hard, because from the driver's perspective, we have no information
- * about what geometry will actually look like on screen; that information is
- * only gained from running the vertex shader. (Theoretically, we could run the
- * vertex shaders in software as a prepass, or in hardware with transform
- * feedback as a prepass, but either idea is ludicrous on so many levels).
- *
- * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list
- * into three distinct pieces. First, the driver statically determines which
- * tile hierarchy levels to use (more on that later). At this point, we know the
- * framebuffer dimensions and all the possible tilings of the framebuffer, so
- * we know exactly how many tiles exist across all hierarchy levels. The first
- * piece of the polygon list is the header, which is exactly 8 bytes per tile,
- * plus padding and a small 64-byte prologue. (If that doesn't remind you of
- * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is
- * the polygon list body, which seems to contain 512 bytes per tile, again
- * across every level of the hierarchy. These two parts form the polygon list
- * buffer. This buffer has a statically determinable size, approximately equal
- * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus
- * alignment / minimum restrictions / etc.
- *
- * The third piece is the easy one (for us): the tiler heap. In essence, the
- * tiler heap is a gigantic slab that's as big as could possibly be necessary
- * in the worst case imaginable. Just... a gigantic allocation that we give a
- * start and end pointer to. What's the catch? The tiler heap is lazily
- * allocated; that is, a huge amount of memory is _reserved_, but only a tiny
- * bit is actually allocated upfront. The GPU just keeps using the
- * unallocated-but-reserved portions as it goes along, generating page faults
- * if it goes beyond the allocation, and then the kernel is instructed to
- * expand the allocation on page fault (known in the vendor kernel as growable
- * memory). This is quite a bit of bookkeeping of its own, but that task is
- * pushed to kernel space and we can mostly ignore it here, just remembering to
- * set the GROWABLE flag so the kernel actually uses this path rather than
- * allocating a gigantic amount up front and burning a hole in RAM.
- *
- * As far as determining which hierarchy levels to use, the simple answer is
- * that right now, we don't. In the tiler configuration fields (consistent from
- * the earliest Midgard's SFBD through the latest Bifrost traces we have),
- * there is a hierarchy_mask field, controlling which levels (tile sizes) are
- * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to
- * big tiles and small polygons to small tiles -- would be realized here as
- * well. As long as there are polygons at all needing tiling, we always have to
- * have big tiles available, in case there are big polygons. But we don't
- * necessarily need small tiles available. Ideally, when there are small
- * polygons, small tiles are enabled (to avoid waste from putting small
- * triangles in the big tiles); when there are not, small tiles are disabled to
- * avoid enabling more levels than necessary, which potentially costs in memory
- * bandwidth / power / tiler performance.
- *
- * Of course, the driver has to figure this out statically. When tile
- * hierarchies are actually established, this is done by the tiler in
- * fixed-function hardware, after the vertex shaders have run and there is
- * sufficient information to figure out the size of triangles. The driver has
- * no such luxury, again barring insane hacks like additionally running the
- * vertex shaders in software or in hardware via transform feedback. Thus, for
- * the driver, we need a heuristic approach.
- *
- * There are lots of heuristics to guess triangle size statically you could
- * imagine, but one approach shines as particularly simple-stupid: assume all
- * on-screen triangles are equal size and spread equidistantly throughout the
- * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with
- * it, then we see:
- *
- * Triangle Area = (Screen Area / # of triangles)
- * = (Width * Height) / (# of triangles)
- *
- * Or if you prefer, we can also make a third CRAZY assumption that we only draw
- * right triangles with edges parallel/perpendicular to the sides of the screen
- * with no overdraw, forming a triangle grid across the screen:
- *
- * |--w--|
- * _____ |
- * | /| /| |
- * |/_|/_| h
- * | /| /| |
- * |/_|/_| |
- *
- * Then you can use some middle school geometry and algebra to work out the
- * triangle dimensions. I started working on this, but realised I didn't need
- * to in order to make my point, but couldn't bear to erase that ASCII art. Anyway.
- *
- * POINT IS, by considering the ratio of screen area and triangle count, we can
- * estimate the triangle size. For a small size, use small bins; for a large
- * size, use large bins. Intuitively, this metric makes sense: when there are
- * few triangles on a large screen, you're probably compositing a UI and
- * therefore the triangles are large; when there are a lot of triangles on a
- * small screen, you're probably rendering a 3D mesh and therefore the
- * triangles are tiny. (Or better said -- there will be tiny triangles, even if
- * there are also large triangles. There have to be unless you expect crazy
- * overdraw. Generally, it's better to allow more small bin sizes than
- * necessary than not allow enough.)
- *
- * From this heuristic (or whatever), we determine the minimum allowable tile
- * size, and we use that to decide the hierarchy masking, selecting from the
- * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice).
- *
- * Once we have that mask and the framebuffer dimensions, we can compute the
- * size of the statically-sized polygon list structures, allocate them, and go!
- *
- * -----
- *
- * On T720, T820, and T830, there is no support for hierarchical tiling.
- * Instead, the hardware allows the driver to select the tile size dynamically
- * on a per-framebuffer basis, including allowing rectangular/non-square tiles.
- * Rules for tile size selection are as follows:
- *
- * - Dimensions must be powers-of-two.
- * - The smallest tile is 16x16.
- * - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix)
- * - There must be no more than 64 tiles in either dimension.
- *
- * Within these constraints, the driver is free to pick a tile size according
- * to some heuristic, similar to units with an advanced tiling unit.
- *
- * To pick a size without any heuristics, we may satisfy the constraints by
- * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size
- * constraint, consider:
- *
- *        # of tiles < 64
- *     ceil (fb / tile) < 64
- *          (fb / tile) <= (64 - 1)
- *                 tile >= fb / (64 - 1)
- *
- * Hence we clamp the tile size up to align_pot(fb / (64 - 1)); since
- * next_power_of_two(x) >= x, this satisfies the bound.
- *
- * Extending this to use a selection heuristic is left for future work.
- *
- * Once the tile size (w, h) is chosen, we compute the hierarchy "mask":
- *
- * hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16)
- *
- * Of course with no hierarchical tiling, this is not a mask; it's just a field
- * specifying the tile size. But I digress.
- *
- * We also compute the polygon list sizes (with framebuffer size W, H) as:
- *
- * full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h)
- * offset = 8 * ceil(W / w) * ceil(H / h)
- *
- * It further appears necessary to round down offset to the nearest 0x200.
- * Possibly we would also round down full_size to the nearest 0x200 but
- * full_size/0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer so there's
- * nothing to do.
- */
-
-/* Hierarchical tiling spans from 16x16 to 4096x4096 tiles */
-
-#define MIN_TILE_SIZE 16
-#define MAX_TILE_SIZE 4096
-
-/* Constants as shifts for easier power-of-two iteration */
-
-#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
-#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)
-
-/* The hierarchy has a 64-byte prologue */
-#define PROLOGUE_SIZE 0x40
-
-/* For each tile (across all hierarchy levels), there is 8 bytes of header */
-#define HEADER_BYTES_PER_TILE 0x8
-
-/* Likewise, each tile per level has 512 bytes of body */
-#define FULL_BYTES_PER_TILE 0x200
-
-/* If the width-x-height framebuffer is divided into tile_size-x-tile_size
- * tiles, how many tiles are there? Rounding up in each direction. For the
- * special case of tile_size=16, this aligns with the usual Midgard count.
- * tile_size must be a power-of-two. Not really repeat code from AFBC/checksum,
- * because those care about the stride (not just the overall count) and only at
- * a fixed tile size (not any of a number of power-of-twos) */
-
-static unsigned
-pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height)
-{
- unsigned aligned_width = ALIGN_POT(width, tile_width);
- unsigned aligned_height = ALIGN_POT(height, tile_height);
-
- unsigned tile_count_x = aligned_width / tile_width;
- unsigned tile_count_y = aligned_height / tile_height;
-
- return tile_count_x * tile_count_y;
-}
-
- * Given the hierarchy mask (with some of the smallest tile sizes possibly
- * masked out), computes the size of the polygon list header. We iterate the
- * tile sizes (16x16 through 2048x2048). For each enabled tile size, we figure
- * out how many tiles there are at this hierarchy level and therefore how many
- * bytes this level is, leaving us with
- * a byte count for each level. We then just sum up the byte counts across the
- * levels to find a byte count for all levels. */
-
-static unsigned
-panfrost_hierarchy_size(
- unsigned width,
- unsigned height,
- unsigned mask,
- unsigned bytes_per_tile)
-{
- unsigned size = PROLOGUE_SIZE;
-
- /* Iterate hierarchy levels */
-
- for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) {
- /* Check if this level is enabled */
- if (!(mask & (1 << b)))
- continue;
-
- /* Shift from a level to a tile size */
- unsigned tile_size = (1 << b) * MIN_TILE_SIZE;
-
- unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size);
- unsigned level_count = bytes_per_tile * tile_count;
-
- size += level_count;
- }
-
- /* This size will be used as an offset, so ensure it's aligned */
- return ALIGN_POT(size, 0x200);
-}
-
-/* Implement the formula:
- *
- * 0x200 + bytes_per_tile * ceil(W / w) * ceil(H / h)
- *
- * rounding down the answer to the nearest 0x200. This is used to compute both
- * header and body sizes for GPUs without hierarchical tiling. Essentially,
- * computing a single hierarchy level, since there isn't any hierarchy!
- */
-
-static unsigned
-panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile)
-{
- /* First, extract the tile dimensions */
-
- unsigned tw = (1 << (dim & 0b111)) * 8;
- unsigned th = (1 << ((dim & (0b111 << 6)) >> 6)) * 8;
-
- /* tile_count is ceil(W/w) * ceil(H/h) */
- unsigned raw = pan_tile_count(width, height, tw, th) * bytes_per_tile;
-
- /* Round down and add offset */
- return 0x200 + ((raw / 0x200) * 0x200);
-}
-
-/* Given a hierarchy mask and a framebuffer size, compute the header size */
-
-unsigned
-panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy)
-{
- if (hierarchy)
- return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE);
- else
- return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE);
-}
-
-/* The combined header/body is sized similarly (but it is significantly
- * larger), except that it can be empty when the tiler is disabled, rather than
- * getting clamped to a minimum size.
- */
-
-unsigned
-panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy)
-{
- if (hierarchy)
- return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE);
- else
- return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE);
-}
-
-/* On GPUs without hierarchical tiling, we choose a tile size directly and
- * stuff it into the field otherwise known as hierarchy mask (not a mask). */
-
-static unsigned
-panfrost_choose_tile_size(
- unsigned width, unsigned height, unsigned vertex_count)
-{
- /* Figure out the ideal tile size. Eventually a heuristic should be
- * used for this */
-
- unsigned best_w = 16;
- unsigned best_h = 16;
-
- /* Clamp so there are less than 64 tiles in each direction */
-
- best_w = MAX2(best_w, util_next_power_of_two(width / 63));
- best_h = MAX2(best_h, util_next_power_of_two(height / 63));
-
- /* We have our ideal tile size, so encode */
-
- unsigned exp_w = util_logbase2(best_w / 16);
- unsigned exp_h = util_logbase2(best_h / 16);
-
- return exp_w | (exp_h << 6);
-}
-
-/* In the future, a heuristic to choose a tiler hierarchy mask would go here.
- * At the moment, we just default to 0xFF, which enables all possible hierarchy
- * levels. Overall this yields good performance but presumably incurs a cost in
- * memory bandwidth / power consumption / etc, at least on smaller scenes that
- * don't really need all the smaller levels enabled */
-
-unsigned
-panfrost_choose_hierarchy_mask(
- unsigned width, unsigned height,
- unsigned vertex_count, bool hierarchy)
-{
- /* If there is no geometry, we don't bother enabling anything */
-
- if (!vertex_count)
- return 0x00;
-
- if (!hierarchy)
- return panfrost_choose_tile_size(width, height, vertex_count);
-
- /* Otherwise, default everything on. TODO: Proper tests */
-
- return 0xFF;
-}
+++ /dev/null
-/**************************************************************************
- *
- * Copyright 2019 Collabora, Ltd.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef PAN_UTIL_H
-#define PAN_UTIL_H
-
-#define PAN_DBG_MSGS 0x0001
-#define PAN_DBG_TRACE 0x0002
-#define PAN_DBG_DEQP 0x0004
-#define PAN_DBG_AFBC 0x0008
-#define PAN_DBG_SYNC 0x0010
-#define PAN_DBG_PRECOMPILE 0x0020
-#define PAN_DBG_NOFP16 0x0040
-#define PAN_DBG_BIFROST 0x0080
-#define PAN_DBG_GL3 0x0100
-
-#endif /* PAN_UTIL_H */
--- /dev/null
+# Copyright © 2018 Rob Clark
+# Copyright © 2019 Collabora
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libpanfrost_lib_files = files(
+ 'pan_encoder.h',
+
+ 'pan_afbc.c',
+ 'pan_attributes.c',
+ 'pan_bo.c',
+ 'pan_blit.c',
+ 'pan_format.c',
+ 'pan_invocation.c',
+ 'pan_sampler.c',
+ 'pan_tiler.c',
+ 'pan_texture.c',
+ 'pan_scoreboard.c',
+ 'pan_scratch.c',
+ 'pan_pool.c',
+ 'pan_props.c',
+)
+
+libpanfrost_lib = static_library(
+ 'panfrost_lib',
+ [libpanfrost_lib_files],
+ include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_panfrost_hw],
+ c_args : [no_override_init_args],
+ gnu_symbol_visibility : 'hidden',
+ dependencies: [dep_libdrm, idep_nir],
+ build_by_default : false,
+)
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ */
+
+#include "pan_texture.h"
+
+/* Arm FrameBuffer Compression (AFBC) is a lossless compression scheme natively
+ * implemented in Mali GPUs (as well as many display controllers paired with
+ * Mali GPUs, etc). Where possible, Panfrost prefers to use AFBC for both
+ * rendering and texturing. In most cases, this is a performance win due to a
+ * dramatic reduction in memory bandwidth and better cache locality compared
+ * to linear resources.
+ *
+ * AFBC divides the framebuffer into 16x16 tiles (other sizes possible, TODO:
+ * do we need to support this?). So, the width and height each must be aligned
+ * up to 16 pixels. This is inherently good for performance; note that for a 4
+ * byte-per-pixel format like RGBA8888, that means that rows are 16*4=64 byte
+ * aligned, which is the cache-line size.
+ *
+ * For each AFBC-compressed resource, there is a single contiguous
+ * (CPU/GPU-shared) buffer. This buffer itself is divided into two parts:
+ * header and body, placed immediately after each other.
+ *
+ * The AFBC header contains 16 bytes of metadata per tile.
+ *
+ * The AFBC body is the same size as the original linear resource (padded to
+ * the nearest tile). Although the body comes immediately after the header, it
+ * must also be cache-line aligned, so there can sometimes be a bit of padding
+ * between the header and body.
+ *
+ * As an example, a 64x64 RGBA framebuffer contains 64/16 = 4 tiles horizontally and
+ * 4 tiles vertically. There are 4*4=16 tiles in total, each containing 16
+ * bytes of metadata, so there is a 16*16=256 byte header. 64x64 is already
+ * tile aligned, so the body is 64*64 * 4 bytes per pixel = 16384 bytes of
+ * body.
+ *
+ * From userspace, Panfrost needs to be able to calculate these sizes. It
+ * explicitly does not and can not know the format of the data contained within
+ * this header and body. The GPU has native support for AFBC encode/decode. For
+ * an internal FBO or a framebuffer used for scanout with an AFBC-compatible
+ * winsys/display-controller, the buffer is maintained AFBC throughout flight,
+ * and the driver never needs to know the internal data. For edge cases where
+ * the driver really does need to read/write from the AFBC resource, we
+ * generate a linear staging buffer and use the GPU to blit AFBC<--->linear.
+ * TODO: Implement me. */
+
+#define AFBC_TILE_WIDTH 16
+#define AFBC_TILE_HEIGHT 16
+#define AFBC_HEADER_BYTES_PER_TILE 16
+#define AFBC_CACHE_ALIGN 64
+
+/* Is it possible to AFBC compress a particular format? Common formats (and
+ * YUV) are compressible. Some obscure formats are not and fall back to linear,
+ * at a performance hit. Also, if you need to disable AFBC entirely in the
+ * driver for debug/profiling, just always return false here. */
+
+bool
+panfrost_format_supports_afbc(enum pipe_format format)
+{
+ const struct util_format_description *desc =
+ util_format_description(format);
+
+ /* sRGB cannot be AFBC, but it can be tiled. TODO: Verify. The blob
+ * does not do AFBC for SRGB8_ALPHA8, but it's not clear why it
+ * shouldn't be able to. */
+
+ if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+ return false;
+
+ if (util_format_is_rgba8_variant(desc))
+ return true;
+
+ /* Only Z24S8 variants are compressible as Z/S */
+
+ if (panfrost_is_z24s8_variant(format))
+ return true;
+
+ /* Lookup special formats */
+ switch (format) {
+ case PIPE_FORMAT_R8G8B8_UNORM:
+ case PIPE_FORMAT_B8G8R8_UNORM:
+ case PIPE_FORMAT_R5G6B5_UNORM:
+ case PIPE_FORMAT_B5G6R5_UNORM:
+ return true;
+ default:
+ return false;
+ }
+}
+
+unsigned
+panfrost_afbc_header_size(unsigned width, unsigned height)
+{
+ /* Align to tile */
+ unsigned aligned_width = ALIGN_POT(width, AFBC_TILE_WIDTH);
+ unsigned aligned_height = ALIGN_POT(height, AFBC_TILE_HEIGHT);
+
+ /* Compute size in tiles, rather than pixels */
+ unsigned tile_count_x = aligned_width / AFBC_TILE_WIDTH;
+ unsigned tile_count_y = aligned_height / AFBC_TILE_HEIGHT;
+ unsigned tile_count = tile_count_x * tile_count_y;
+
+ /* Multiply to find the header size */
+ unsigned header_bytes = tile_count * AFBC_HEADER_BYTES_PER_TILE;
+
+ /* Align and go */
+ return ALIGN_POT(header_bytes, AFBC_CACHE_ALIGN);
+
+}
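+
+/* Putting those two rules together, the size of a single AFBC level works out
+ * to roughly the following (an illustrative sketch, not a helper the driver
+ * exposes, assuming width/height in pixels and bytes_per_pixel for the
+ * format's block size):
+ *
+ *    unsigned header = panfrost_afbc_header_size(width, height);
+ *    unsigned body   = ALIGN_POT(width, AFBC_TILE_WIDTH) *
+ *                      ALIGN_POT(height, AFBC_TILE_HEIGHT) * bytes_per_pixel;
+ *    unsigned total  = header + body;
+ *
+ * For the 64x64 RGBA8888 example above: header = 16 tiles * 16 bytes = 256
+ * bytes (already cache-line aligned), body = 64 * 64 * 4 = 16384 bytes, so
+ * total = 16640 bytes. */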
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "util/u_math.h"
+#include "panfrost-job.h"
+#include "pan_encoder.h"
+
+/* This file handles attribute descriptors (mali_attr_meta). The
+ * bulk of the complexity is from instancing. See mali_job for
+ * notes on how this works. But basically, for small vertex
+ * counts, we have a lookup table, and for large vertex counts,
+ * we look at the high bits as a heuristic. This has to match
+ * exactly how the hardware calculates this (which is why the
+ * algorithm is so weird) or else instancing will break. */
+
+/* Given an odd number (of the form 2k + 1), compute k */
+#define ODD(odd) ((odd - 1) >> 1)
+
+static unsigned
+panfrost_small_padded_vertex_count(unsigned idx)
+{
+ if (idx == 11 || idx == 13 || idx == 15 || idx == 19)
+ return idx + 1;
+ else
+ return idx;
+}
+
+static unsigned
+panfrost_large_padded_vertex_count(uint32_t vertex_count)
+{
+ /* First, we have to find the highest set bit */
+ unsigned highest = 32 - __builtin_clz(vertex_count);
+
+ /* Using that, we extract the highest 4 bits */
+ unsigned n = highest - 4;
+ unsigned nibble = (vertex_count >> n) & 0xF;
+
+ /* Great, we have the nibble. Now we can just try possibilities. Note
+ * that we don't care about the bottom most bit in most cases, and we
+ * know the top bit must be 1 */
+
+ unsigned middle_two = (nibble >> 1) & 0x3;
+
+ switch (middle_two) {
+ case 0b00:
+ if (!(nibble & 1))
+ return (1 << n) * 9;
+ else
+ return (1 << (n + 1)) * 5;
+ case 0b01:
+ return (1 << (n + 2)) * 3;
+ case 0b10:
+ return (1 << (n + 1)) * 7;
+ case 0b11:
+ return (1 << (n + 4));
+ default:
+ return 0; /* unreachable */
+ }
+}
+
+unsigned
+panfrost_padded_vertex_count(unsigned vertex_count)
+{
+ if (vertex_count < 20)
+ return panfrost_small_padded_vertex_count(vertex_count);
+ else
+ return panfrost_large_padded_vertex_count(vertex_count);
+}
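+
+/* A few values, just tracing the functions above: the small path maps 11 -> 12
+ * and 19 -> 20 while leaving e.g. 10 or 17 alone; the large path maps
+ * 20 -> 24 (3 * 2^3), 31 -> 32 and 384 -> 448 (7 * 2^6), always landing on a
+ * value of the form 2^n * {1, 3, 5, 7, 9}. */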
+
+/* The much, much more irritating case -- instancing is enabled. See
+ * panfrost_job.h for notes on how this works */
+
+static unsigned
+panfrost_compute_magic_divisor(unsigned hw_divisor, unsigned *o_shift, unsigned *extra_flags)
+{
+ /* We have an NPOT divisor. Here's the fun one (multiplying by
+ * the inverse and shifting) */
+
+ /* floor(log2(d)) */
+ unsigned shift = util_logbase2(hw_divisor);
+
+ /* m = ceil(2^(32 + shift) / d) */
+ uint64_t shift_hi = 32 + shift;
+ uint64_t t = 1ll << shift_hi;
+ double t_f = t;
+ double hw_divisor_d = hw_divisor;
+ double m_f = ceil(t_f / hw_divisor_d);
+ unsigned m = m_f;
+
+ /* Default case */
+ uint32_t magic_divisor = m;
+
+ /* e = 2^(shift + 32) % d */
+ uint64_t e = t % hw_divisor;
+
+ /* Apply the round-down algorithm when e <= 2^shift. XXX: The blob
+ * seems to use a different condition */
+ if (e <= (1ll << shift)) {
+ magic_divisor = m - 1;
+ *extra_flags = 1;
+ }
+
+ /* Top flag implicitly set */
+ assert(magic_divisor & (1u << 31));
+ magic_divisor &= ~(1u << 31);
+ *o_shift = shift;
+
+ return magic_divisor;
+}
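+
+/* Worked example, just tracing the arithmetic above: for hw_divisor = 3 we get
+ * shift = 1, m = ceil(2^33 / 3) = 0xAAAAAAAB and e = 2^33 mod 3 = 2, which is
+ * <= 2^shift, so the round-down path is taken: the magic value becomes
+ * 0xAAAAAAAA, *extra_flags = 1, and after stripping the implicit top bit the
+ * function returns 0x2AAAAAAA with *o_shift = 1. Presumably the hardware then
+ * divides by multiplying by the magic value and shifting right by
+ * (32 + shift), as in the usual divide-by-invariant-multiplication trick. */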
+
+unsigned
+panfrost_vertex_instanced(
+ unsigned padded_count,
+ unsigned instance_shift, unsigned instance_odd,
+ unsigned divisor,
+ union mali_attr *attrs)
+{
+ /* Depending if there is an instance divisor or not, packing varies.
+ * When there is a divisor, the hardware-level divisor is actually the
+ * product of the instance divisor and the padded count */
+
+ unsigned hw_divisor = padded_count * divisor;
+
+ if (divisor == 0) {
+ /* Per-vertex attributes use the MODULO mode. First, compute
+ * the modulus */
+
+ attrs->elements |= MALI_ATTR_MODULO;
+ attrs->shift = instance_shift;
+ attrs->extra_flags = instance_odd;
+
+ return 1;
+ } else if (util_is_power_of_two_or_zero(hw_divisor)) {
+ /* If there is a divisor but the hardware divisor works out to
+ * a power of two (not terribly exceptional), we can use an
+ * easy path (just shifting) */
+
+ attrs->elements |= MALI_ATTR_POT_DIVIDE;
+ attrs->shift = __builtin_ctz(hw_divisor);
+
+ return 1;
+ } else {
+ unsigned shift = 0, extra_flags = 0;
+
+ attrs[1].magic_divisor =
+ panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
+
+ /* Upload to two different slots */
+
+ attrs[0].elements |= MALI_ATTR_NPOT_DIVIDE;
+ attrs[0].shift = shift;
+ attrs[0].extra_flags = extra_flags;
+
+ attrs[1].unk = 0x20;
+ attrs[1].zero = 0;
+ attrs[1].divisor = divisor;
+
+ return 2;
+ }
+}
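+
+/* For example, with padded_count = 32: divisor = 0 packs a single MODULO
+ * record; divisor = 2 gives hw_divisor = 64, a power of two, so one
+ * POT_DIVIDE record with shift = 6 suffices; divisor = 3 gives hw_divisor =
+ * 96, which is not a power of two, so the NPOT path consumes two records. */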
+
+/* Records for gl_VertexID and gl_InstanceID use a slightly special encoding,
+ * but the idea is the same */
+
+void
+panfrost_vertex_id(
+ unsigned padded_count,
+ union mali_attr *attr)
+{
+ /* We factor the padded count as shift/odd and that's it */
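+ /* Since padded_count factors as 2^shift times an odd number, the
+ * extra_flags value below works out to (odd - 1) / 2 -- the same k
+ * described by the ODD() macro at the top of this file. */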
+
+ attr->elements = MALI_ATTR_VERTEXID;
+ attr->shift = __builtin_ctz(padded_count);
+ attr->extra_flags = padded_count >> (attr->shift + 1);
+ attr->stride = attr->size = 0;
+}
+
+void
+panfrost_instance_id(
+ unsigned padded_count,
+ union mali_attr *attr)
+{
+ attr->elements = MALI_ATTR_INSTANCEID;
+ attr->stride = 0;
+ attr->extra_flags = 0;
+ attr->size = 0;
+
+ /* POT records have just a shift directly with an off-by-one for
+ * unclear reasons. NPOT records have a magic divisor smushed into the
+ * stride field (which is unused for these special records) */
+
+ if (util_is_power_of_two_or_zero(padded_count)) {
+ attr->shift = __builtin_ctz(padded_count) - 1;
+ } else {
+ unsigned shift = 0, flags = 0;
+
+ attr->stride = panfrost_compute_magic_divisor(padded_count, &shift, &flags);
+ attr->shift = shift;
+ attr->extra_flags = flags;
+ }
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2020 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include "pan_encoder.h"
+#include "pan_pool.h"
+#include "pan_scoreboard.h"
+#include "pan_texture.h"
+#include "panfrost-quirks.h"
+#include "../midgard/midgard_compile.h"
+#include "compiler/nir/nir_builder.h"
+#include "util/u_math.h"
+
+/* On Midgard, the native blit infrastructure (via MFBD preloads) is broken or
+ * missing in many cases. We instead use software paths as fallbacks to
+ * implement blits, which are done as TILER jobs. No vertex shader is
+ * necessary since we can supply screen-space coordinates directly.
+ *
+ * This is primarily designed as a fallback for preloads but could be extended
+ * for other clears/blits if needed in the future. */
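+
+/* In GLSL terms each generated program is roughly (names here are only
+ * illustrative; the shader is built directly as NIR below):
+ *
+ *    in vec2 coord;
+ *    out vec4 color;        // a single channel for depth/stencil loads
+ *    uniform sampler2D tex; // sampler2DMS for the multisampled variants
+ *
+ *    void main() {
+ *       color = texture(tex, coord);
+ *       // MS variants use texelFetch(tex, ivec2(coord), gl_SampleID)
+ *    }
+ */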
+
+static void
+panfrost_build_blit_shader(panfrost_program *program, unsigned gpu_id, gl_frag_result loc, nir_alu_type T, bool ms)
+{
+ bool is_colour = loc >= FRAG_RESULT_DATA0;
+
+ nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_FRAGMENT, &midgard_nir_options, NULL);
+ nir_function *fn = nir_function_create(shader, "main");
+ nir_function_impl *impl = nir_function_impl_create(fn);
+
+ nir_variable *c_src = nir_variable_create(shader, nir_var_shader_in, glsl_vector_type(GLSL_TYPE_FLOAT, 2), "coord");
+ nir_variable *c_out = nir_variable_create(shader, nir_var_shader_out, glsl_vector_type(
+ GLSL_TYPE_FLOAT, is_colour ? 4 : 1), "out");
+
+ c_src->data.location = VARYING_SLOT_TEX0;
+ c_out->data.location = loc;
+
+ nir_builder _b;
+ nir_builder *b = &_b;
+ nir_builder_init(b, impl);
+ b->cursor = nir_before_block(nir_start_block(impl));
+
+ nir_ssa_def *coord = nir_load_var(b, c_src);
+
+ nir_tex_instr *tex = nir_tex_instr_create(shader, ms ? 3 : 1);
+
+ tex->dest_type = T;
+
+ if (ms) {
+ tex->src[0].src_type = nir_tex_src_coord;
+ tex->src[0].src = nir_src_for_ssa(nir_f2i32(b, coord));
+ tex->coord_components = 2;
+
+ tex->src[1].src_type = nir_tex_src_ms_index;
+ tex->src[1].src = nir_src_for_ssa(nir_load_sample_id(b));
+
+ tex->src[2].src_type = nir_tex_src_lod;
+ tex->src[2].src = nir_src_for_ssa(nir_imm_int(b, 0));
+ tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
+ tex->op = nir_texop_txf_ms;
+ } else {
+ tex->op = nir_texop_tex;
+
+ tex->src[0].src_type = nir_tex_src_coord;
+ tex->src[0].src = nir_src_for_ssa(coord);
+ tex->coord_components = 2;
+
+ tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
+ }
+
+ nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
+ nir_builder_instr_insert(b, &tex->instr);
+
+ if (is_colour)
+ nir_store_var(b, c_out, &tex->dest.ssa, 0xFF);
+ else
+ nir_store_var(b, c_out, nir_channel(b, &tex->dest.ssa, 0), 0xFF);
+
+ midgard_compile_shader_nir(shader, program, false, 0, gpu_id, false, true);
+}
+
+/* Compile and upload all possible blit shaders ahead-of-time to reduce draw
+ * time overhead. There are only ~30 of them at the moment, so this is fine */
+
+void
+panfrost_init_blit_shaders(struct panfrost_device *dev)
+{
+ static const struct {
+ gl_frag_result loc;
+ unsigned types;
+ } shader_descs[] = {
+ { FRAG_RESULT_DEPTH, 1 << PAN_BLIT_FLOAT },
+ { FRAG_RESULT_STENCIL, 1 << PAN_BLIT_UINT },
+ { FRAG_RESULT_DATA0, ~0 },
+ { FRAG_RESULT_DATA1, ~0 },
+ { FRAG_RESULT_DATA2, ~0 },
+ { FRAG_RESULT_DATA3, ~0 },
+ { FRAG_RESULT_DATA4, ~0 },
+ { FRAG_RESULT_DATA5, ~0 },
+ { FRAG_RESULT_DATA6, ~0 },
+ { FRAG_RESULT_DATA7, ~0 }
+ };
+
+ nir_alu_type nir_types[PAN_BLIT_NUM_TYPES] = {
+ nir_type_float,
+ nir_type_uint,
+ nir_type_int
+ };
+
+ /* Total size = # of shaders * bytes per shader. There are
+ * shaders for each RT (so up to DATA7 -- overestimate is
+ * okay) and up to NUM_TYPES variants of each, * 2 for multisampling
+ * variants. These shaders are simple enough that they should be less
+ * than 8 quadwords each (again, overestimate is fine). */
+
+ unsigned offset = 0;
+ unsigned total_size = (FRAG_RESULT_DATA7 * PAN_BLIT_NUM_TYPES)
+ * (8 * 16) * 2;
+
+ dev->blit_shaders.bo = panfrost_bo_create(dev, total_size, PAN_BO_EXECUTE);
+
+ /* Don't bother generating multisampling variants if we don't actually
+ * support multisampling */
+ bool has_ms = !(dev->quirks & MIDGARD_SFBD);
+
+ for (unsigned ms = 0; ms <= has_ms; ++ms) {
+ for (unsigned i = 0; i < ARRAY_SIZE(shader_descs); ++i) {
+ unsigned loc = shader_descs[i].loc;
+
+ for (enum pan_blit_type T = 0; T < PAN_BLIT_NUM_TYPES; ++T) {
+ if (!(shader_descs[i].types & (1 << T)))
+ continue;
+
+ panfrost_program program;
+ panfrost_build_blit_shader(&program, dev->gpu_id, loc,
+ nir_types[T], ms);
+
+ assert(offset + program.compiled.size < total_size);
+ memcpy(dev->blit_shaders.bo->cpu + offset, program.compiled.data, program.compiled.size);
+
+ dev->blit_shaders.loads[loc][T][ms] = (dev->blit_shaders.bo->gpu + offset) | program.first_tag;
+ offset += ALIGN_POT(program.compiled.size, 64);
+ util_dynarray_fini(&program.compiled);
+ }
+ }
+ }
+}
+
+/* Add a shader-based load on Midgard (draw-time for GL). Shaders are
+ * precached */
+
+void
+panfrost_load_midg(
+ struct pan_pool *pool,
+ struct pan_scoreboard *scoreboard,
+ mali_ptr blend_shader,
+ mali_ptr fbd,
+ mali_ptr coordinates, unsigned vertex_count,
+ struct pan_image *image,
+ unsigned loc)
+{
+ unsigned width = u_minify(image->width0, image->first_level);
+ unsigned height = u_minify(image->height0, image->first_level);
+
+ struct mali_viewport viewport = {
+ .clip_minx = -INFINITY,
+ .clip_miny = -INFINITY,
+ .clip_maxx = INFINITY,
+ .clip_maxy = INFINITY,
+ .clip_minz = 0.0,
+ .clip_maxz = 1.0,
+
+ .viewport0 = { 0, 0 },
+ .viewport1 = { MALI_POSITIVE(width), MALI_POSITIVE(height) }
+ };
+
+ union mali_attr varying = {
+ .elements = coordinates | MALI_ATTR_LINEAR,
+ .stride = 4 * sizeof(float),
+ .size = 4 * sizeof(float) * vertex_count,
+ };
+
+ struct mali_attr_meta varying_meta = {
+ .index = 0,
+ .unknown1 = 2,
+ .swizzle = (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3),
+ .format = MALI_RGBA32F
+ };
+
+ struct mali_stencil_test stencil = {
+ .mask = 0xFF,
+ .func = MALI_FUNC_ALWAYS,
+ .sfail = MALI_STENCIL_REPLACE,
+ .dpfail = MALI_STENCIL_REPLACE,
+ .dppass = MALI_STENCIL_REPLACE,
+ };
+
+ union midgard_blend replace = {
+ .equation = {
+ .rgb_mode = 0x122,
+ .alpha_mode = 0x122,
+ .color_mask = MALI_MASK_R | MALI_MASK_G | MALI_MASK_B | MALI_MASK_A,
+ }
+ };
+
+ if (blend_shader)
+ replace.shader = blend_shader;
+
+ /* Determine the sampler type needed. Stencil is always sampled as
+ * UINT. Pure (U)INT is always (U)INT. Everything else is FLOAT. */
+
+ enum pan_blit_type T =
+ (loc == FRAG_RESULT_STENCIL) ? PAN_BLIT_UINT :
+ (util_format_is_pure_uint(image->format)) ? PAN_BLIT_UINT :
+ (util_format_is_pure_sint(image->format)) ? PAN_BLIT_INT :
+ PAN_BLIT_FLOAT;
+
+ bool ms = image->nr_samples > 1;
+
+ struct mali_shader_meta shader_meta = {
+ .shader = pool->dev->blit_shaders.loads[loc][T][ms],
+ .sampler_count = 1,
+ .texture_count = 1,
+ .varying_count = 1,
+ .midgard1 = {
+ .flags_lo = 0x20,
+ .work_count = 4,
+ },
+ .coverage_mask = 0xF,
+ .unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10,
+ .unknown2_4 = 0x4e0,
+ .stencil_mask_front = ~0,
+ .stencil_mask_back = ~0,
+ .stencil_front = stencil,
+ .stencil_back = stencil,
+ .blend = {
+ .shader = blend_shader
+ }
+ };
+
+ if (ms)
+ shader_meta.unknown2_3 |= MALI_HAS_MSAA | MALI_PER_SAMPLE;
+ else
+ shader_meta.unknown2_4 |= MALI_NO_MSAA;
+
+ assert(shader_meta.shader);
+
+ if (pool->dev->quirks & MIDGARD_SFBD) {
+ shader_meta.unknown2_4 |= (0x10 | MALI_NO_DITHER);
+ shader_meta.blend = replace;
+
+ if (loc < FRAG_RESULT_DATA0)
+ shader_meta.blend.equation.color_mask = 0x0;
+ }
+
+ if (loc == FRAG_RESULT_DEPTH) {
+ shader_meta.midgard1.flags_lo |= MALI_WRITES_Z;
+ shader_meta.unknown2_3 |= MALI_DEPTH_WRITEMASK;
+ } else if (loc == FRAG_RESULT_STENCIL) {
+ shader_meta.midgard1.flags_hi |= MALI_WRITES_S;
+ shader_meta.unknown2_4 |= MALI_STENCIL_TEST;
+ } else {
+ shader_meta.midgard1.flags_lo |= MALI_EARLY_Z;
+ }
+
+ /* Create the texture descriptor. We partially compute the base address
+ * ourselves to account for layer, such that the texture descriptor
+ * itself is for a 2D texture with array size 1 even for 3D/array
+ * textures, removing the need to separately key the blit shaders for
+ * 2D and 3D variants */
+
+ struct panfrost_transfer texture_t = panfrost_pool_alloc(pool, sizeof(struct mali_texture_descriptor) + sizeof(mali_ptr) * 2 * MAX2(image->nr_samples, 1));
+
+ panfrost_new_texture(texture_t.cpu,
+ image->width0, image->height0,
+ MAX2(image->nr_samples, 1), 1,
+ image->format, MALI_TEX_2D,
+ image->layout,
+ image->first_level, image->last_level,
+ 0, 0,
+ image->nr_samples,
+ 0,
+ (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) | (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9),
+ image->bo->gpu + image->first_layer *
+ panfrost_get_layer_stride(image->slices,
+ image->type == MALI_TEX_3D,
+ image->cubemap_stride, image->first_level),
+ image->slices);
+
+ struct mali_sampler_descriptor sampler = {
+ .filter_mode = MALI_SAMP_MAG_NEAREST | MALI_SAMP_MIN_NEAREST,
+ .wrap_s = MALI_WRAP_CLAMP_TO_EDGE,
+ .wrap_t = MALI_WRAP_CLAMP_TO_EDGE,
+ .wrap_r = MALI_WRAP_CLAMP_TO_EDGE,
+ };
+
+ struct panfrost_transfer shader_meta_t = panfrost_pool_alloc(pool, sizeof(shader_meta) + 8 * sizeof(struct midgard_blend_rt));
+ memcpy(shader_meta_t.cpu, &shader_meta, sizeof(shader_meta));
+
+ for (unsigned i = 0; i < 8; ++i) {
+ void *dest = shader_meta_t.cpu + sizeof(shader_meta) + sizeof(struct midgard_blend_rt) * i;
+
+ if (loc == (FRAG_RESULT_DATA0 + i)) {
+ struct midgard_blend_rt blend_rt = {
+ .flags = 0x200 | MALI_BLEND_NO_DITHER,
+ .blend = replace,
+ };
+
+ if (util_format_is_srgb(image->format))
+ blend_rt.flags |= MALI_BLEND_SRGB;
+
+ if (blend_shader) {
+ blend_rt.flags |= MALI_BLEND_MRT_SHADER;
+ blend_rt.blend.shader = blend_shader;
+ }
+
+ memcpy(dest, &blend_rt, sizeof(struct midgard_blend_rt));
+ } else {
+ memset(dest, 0x0, sizeof(struct midgard_blend_rt));
+ }
+ }
+
+ struct midgard_payload_vertex_tiler payload = {
+ .prefix = {
+ .draw_mode = MALI_TRIANGLES,
+ .unknown_draw = 0x3000,
+ .index_count = MALI_POSITIVE(vertex_count)
+ },
+ .postfix = {
+ .gl_enables = 0x7,
+ .position_varying = coordinates,
+ .textures = panfrost_pool_upload(pool, &texture_t.gpu, sizeof(texture_t.gpu)),
+ .sampler_descriptor = panfrost_pool_upload(pool, &sampler, sizeof(sampler)),
+ .shader = shader_meta_t.gpu,
+ .varyings = panfrost_pool_upload(pool, &varying, sizeof(varying)),
+ .varying_meta = panfrost_pool_upload(pool, &varying_meta, sizeof(varying_meta)),
+ .viewport = panfrost_pool_upload(pool, &viewport, sizeof(viewport)),
+ .shared_memory = fbd
+ }
+ };
+
+ panfrost_pack_work_groups_compute(&payload.prefix, 1, vertex_count, 1, 1, 1, 1, true);
+ payload.prefix.workgroups_x_shift_3 = 6;
+
+ panfrost_new_job(pool, scoreboard, JOB_TYPE_TILER, false, 0, &payload, sizeof(payload), true);
+}
--- /dev/null
+/*
+ * Copyright 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors (Collabora):
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <xf86drm.h>
+#include <pthread.h>
+#include "drm-uapi/panfrost_drm.h"
+
+#include "pan_bo.h"
+#include "pan_util.h"
+#include "wrap.h"
+
+#include "os/os_mman.h"
+
+#include "util/u_inlines.h"
+#include "util/u_math.h"
+
+/* This file implements a userspace BO cache. Allocating and freeing
+ * GPU-visible buffers is very expensive, and even the extra kernel roundtrips
+ * add more work than we would like at this point. So caching BOs in userspace
+ * solves both of these problems and does not require kernel updates.
+ *
+ * Cached BOs are sorted into a bucket based on rounding their size down to the
+ * nearest power-of-two. Each bucket contains a linked list of free panfrost_bo
+ * objects. Putting a BO into the cache is accomplished by adding it to the
+ * corresponding bucket. Getting a BO from the cache consists of finding the
+ * appropriate bucket and searching it. A cache eviction is a kernel-level free
+ * of a BO and removing it from the bucket. We special case evicting all BOs
+ * from the cache, since that's what's helpful in practice and avoids extra logic
+ * around the linked list.
+ */
+
+static struct panfrost_bo *
+panfrost_bo_alloc(struct panfrost_device *dev, size_t size,
+ uint32_t flags)
+{
+ struct drm_panfrost_create_bo create_bo = { .size = size };
+ struct panfrost_bo *bo;
+ int ret;
+
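+ /* Heap (growable) and non-executable BOs require a new enough kernel
+ * (driver version 1.1+); on older kernels we simply omit the flags. */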
+ if (dev->kernel_version->version_major > 1 ||
+ dev->kernel_version->version_minor >= 1) {
+ if (flags & PAN_BO_GROWABLE)
+ create_bo.flags |= PANFROST_BO_HEAP;
+ if (!(flags & PAN_BO_EXECUTE))
+ create_bo.flags |= PANFROST_BO_NOEXEC;
+ }
+
+ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_CREATE_BO, &create_bo);
+ if (ret) {
+ fprintf(stderr, "DRM_IOCTL_PANFROST_CREATE_BO failed: %m\n");
+ return NULL;
+ }
+
+ bo = pan_lookup_bo(dev, create_bo.handle);
+ assert(!memcmp(bo, &((struct panfrost_bo){}), sizeof(*bo)));
+
+ bo->size = create_bo.size;
+ bo->gpu = create_bo.offset;
+ bo->gem_handle = create_bo.handle;
+ bo->flags = flags;
+ bo->dev = dev;
+ return bo;
+}
+
+static void
+panfrost_bo_free(struct panfrost_bo *bo)
+{
+ struct drm_gem_close gem_close = { .handle = bo->gem_handle };
+ int ret;
+
+ ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+ if (ret) {
+ fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %m\n");
+ assert(0);
+ }
+
+ /* BO will be freed with the sparse array, but zero to indicate free */
+ memset(bo, 0, sizeof(*bo));
+}
+
+/* Returns true if the BO is ready, false otherwise.
+ * Waiting is always done for pending writers; if wait_readers is set, pending
+ * readers are waited for as well. timeout_ns bounds how long we may block
+ * (0 means poll without blocking).
+ */
+bool
+panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers)
+{
+ struct drm_panfrost_wait_bo req = {
+ .handle = bo->gem_handle,
+ .timeout_ns = timeout_ns,
+ };
+ int ret;
+
+ /* If the BO has been exported or imported we can't rely on the cached
+ * state, we need to call the WAIT_BO ioctl.
+ */
+ if (!(bo->flags & PAN_BO_SHARED)) {
+ /* If ->gpu_access is 0, the BO is idle, no need to wait. */
+ if (!bo->gpu_access)
+ return true;
+
+ /* If the caller only wants to wait for writers and no
+ * writes are pending, we don't have to wait.
+ */
+ if (!wait_readers && !(bo->gpu_access & PAN_BO_ACCESS_WRITE))
+ return true;
+ }
+
+ /* The ioctl returns a value >= 0 when the BO we are waiting for is ready,
+ * and -1 otherwise.
+ */
+ ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_WAIT_BO, &req);
+ if (ret != -1) {
+ /* Set gpu_access to 0 so that the next call to bo_wait()
+ * doesn't have to call the WAIT_BO ioctl.
+ */
+ bo->gpu_access = 0;
+ return true;
+ }
+
+ /* If errno is not ETIMEDOUT or EBUSY that means the handle we passed
+ * is invalid, which shouldn't happen here.
+ */
+ assert(errno == ETIMEDOUT || errno == EBUSY);
+ return false;
+}
+
+/* Helper to calculate the bucket index of a BO */
+
+static unsigned
+pan_bucket_index(unsigned size)
+{
+ /* Round down to POT to compute a bucket index */
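+ /* e.g. (illustrative) a 70 KiB size gives util_logbase2() == 16, i.e.
+ * the 64 KiB bucket */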
+
+ unsigned bucket_index = util_logbase2(size);
+
+ /* Clamp the bucket index; all huge allocations will be
+ * sorted into the largest bucket */
+
+ bucket_index = MIN2(bucket_index, MAX_BO_CACHE_BUCKET);
+
+ /* The minimum bucket size must equal the minimum allocation
+ * size; the maximum was clamped just above */
+
+ assert(bucket_index >= MIN_BO_CACHE_BUCKET);
+ assert(bucket_index <= MAX_BO_CACHE_BUCKET);
+
+ /* Reindex from 0 */
+ return (bucket_index - MIN_BO_CACHE_BUCKET);
+}
+
+static struct list_head *
+pan_bucket(struct panfrost_device *dev, unsigned size)
+{
+ return &dev->bo_cache.buckets[pan_bucket_index(size)];
+}
+
+/* Tries to fetch a BO of sufficient size with the appropriate flags from the
+ * BO cache. If it succeeds, it returns that BO and removes the BO from the
+ * cache. If it fails, it returns NULL signaling the caller to allocate a new
+ * BO. */
+
+static struct panfrost_bo *
+panfrost_bo_cache_fetch(struct panfrost_device *dev,
+ size_t size, uint32_t flags, bool dontwait)
+{
+ pthread_mutex_lock(&dev->bo_cache.lock);
+ struct list_head *bucket = pan_bucket(dev, size);
+ struct panfrost_bo *bo = NULL;
+
+ /* Iterate the bucket looking for something suitable */
+ list_for_each_entry_safe(struct panfrost_bo, entry, bucket,
+ bucket_link) {
+ if (entry->size < size || entry->flags != flags)
+ continue;
+
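+ /* Note: the third argument is the wait_readers bool; passing the
+ * (non-zero) PAN_BO_ACCESS_RW constant here means we wait for
+ * readers as well as writers */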
+ if (!panfrost_bo_wait(entry, dontwait ? 0 : INT64_MAX,
+ PAN_BO_ACCESS_RW))
+ continue;
+
+ struct drm_panfrost_madvise madv = {
+ .handle = entry->gem_handle,
+ .madv = PANFROST_MADV_WILLNEED,
+ };
+ int ret;
+
+ /* This one works, splice it out of the cache */
+ list_del(&entry->bucket_link);
+ list_del(&entry->lru_link);
+
+ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv);
+ if (!ret && !madv.retained) {
+ panfrost_bo_free(entry);
+ continue;
+ }
+ /* Let's go! */
+ bo = entry;
+ break;
+ }
+ pthread_mutex_unlock(&dev->bo_cache.lock);
+
+ return bo;
+}
+
+static void
+panfrost_bo_cache_evict_stale_bos(struct panfrost_device *dev)
+{
+ struct timespec time;
+
+ clock_gettime(CLOCK_MONOTONIC, &time);
+ list_for_each_entry_safe(struct panfrost_bo, entry,
+ &dev->bo_cache.lru, lru_link) {
+ /* We want all entries that have been used more than 1 sec
+ * ago to be dropped, others can be kept.
+ * Note the <= 2 check and not <= 1. It's here to account for
+ * the fact that we're only testing ->tv_sec, not ->tv_nsec.
+ * That means we might keep entries that are between 1 and 2
+ * seconds old, but we don't really care, as long as unused BOs
+ * are dropped at some point.
+ */
+ if (time.tv_sec - entry->last_used <= 2)
+ break;
+
+ list_del(&entry->bucket_link);
+ list_del(&entry->lru_link);
+ panfrost_bo_free(entry);
+ }
+}
+
+/* Tries to add a BO to the cache. Returns whether the BO was
+ * successfully added (shared BOs are never cached) */
+
+static bool
+panfrost_bo_cache_put(struct panfrost_bo *bo)
+{
+ struct panfrost_device *dev = bo->dev;
+
+ if (bo->flags & PAN_BO_SHARED)
+ return false;
+
+ pthread_mutex_lock(&dev->bo_cache.lock);
+ struct list_head *bucket = pan_bucket(dev, MAX2(bo->size, 4096));
+ struct drm_panfrost_madvise madv;
+ struct timespec time;
+
+ madv.handle = bo->gem_handle;
+ madv.madv = PANFROST_MADV_DONTNEED;
+ madv.retained = 0;
+
+ drmIoctl(dev->fd, DRM_IOCTL_PANFROST_MADVISE, &madv);
+
+ /* Add us to the bucket */
+ list_addtail(&bo->bucket_link, bucket);
+
+ /* Add us to the LRU list and update the last_used field. */
+ list_addtail(&bo->lru_link, &dev->bo_cache.lru);
+ clock_gettime(CLOCK_MONOTONIC, &time);
+ bo->last_used = time.tv_sec;
+
+ /* Let's do some cleanup in the BO cache while we hold the
+ * lock.
+ */
+ panfrost_bo_cache_evict_stale_bos(dev);
+ pthread_mutex_unlock(&dev->bo_cache.lock);
+
+ return true;
+}
+
+/* Evicts all BOs from the cache. Called during context
+ * destroy or during low-memory situations (to free up
+ * memory that may be unused by us just sitting in our
+ * cache, but still reserved from the perspective of the
+ * OS) */
+
+void
+panfrost_bo_cache_evict_all(
+ struct panfrost_device *dev)
+{
+ pthread_mutex_lock(&dev->bo_cache.lock);
+ for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i) {
+ struct list_head *bucket = &dev->bo_cache.buckets[i];
+
+ list_for_each_entry_safe(struct panfrost_bo, entry, bucket,
+ bucket_link) {
+ list_del(&entry->bucket_link);
+ list_del(&entry->lru_link);
+ panfrost_bo_free(entry);
+ }
+ }
+ pthread_mutex_unlock(&dev->bo_cache.lock);
+}
+
+void
+panfrost_bo_mmap(struct panfrost_bo *bo)
+{
+ struct drm_panfrost_mmap_bo mmap_bo = { .handle = bo->gem_handle };
+ int ret;
+
+ if (bo->cpu)
+ return;
+
+ ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PANFROST_MMAP_BO, &mmap_bo);
+ if (ret) {
+ fprintf(stderr, "DRM_IOCTL_PANFROST_MMAP_BO failed: %m\n");
+ assert(0);
+ }
+
+ bo->cpu = os_mmap(NULL, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ bo->dev->fd, mmap_bo.offset);
+ if (bo->cpu == MAP_FAILED) {
+ fprintf(stderr, "mmap failed: %p %m\n", bo->cpu);
+ assert(0);
+ }
+}
+
+static void
+panfrost_bo_munmap(struct panfrost_bo *bo)
+{
+ if (!bo->cpu)
+ return;
+
+ if (os_munmap((void *) (uintptr_t)bo->cpu, bo->size)) {
+ perror("munmap");
+ abort();
+ }
+
+ bo->cpu = NULL;
+}
+
+struct panfrost_bo *
+panfrost_bo_create(struct panfrost_device *dev, size_t size,
+ uint32_t flags)
+{
+ struct panfrost_bo *bo;
+
+ /* Kernel will fail (confusingly) with EPERM otherwise */
+ assert(size > 0);
+
+ /* To maximize BO cache usage, don't allocate tiny BOs */
+ size = MAX2(size, 4096);
+
+ /* GROWABLE BOs cannot be mmapped */
+ if (flags & PAN_BO_GROWABLE)
+ assert(flags & PAN_BO_INVISIBLE);
+
+ /* Before creating a BO, we first want to check the cache but without
+ * waiting for BO readiness (BOs in the cache can still be referenced
+ * by jobs that are not finished yet).
+ * If the cached allocation fails we fall back on fresh BO allocation,
+ * and if that fails too, we try one more time to allocate from the
+ * cache, but this time we are willing to wait.
+ */
+ bo = panfrost_bo_cache_fetch(dev, size, flags, true);
+ if (!bo)
+ bo = panfrost_bo_alloc(dev, size, flags);
+ if (!bo)
+ bo = panfrost_bo_cache_fetch(dev, size, flags, false);
+
+ if (!bo)
+ fprintf(stderr, "BO creation failed\n");
+
+ assert(bo);
+
+ /* Only mmap now if we know we need to. For CPU-invisible buffers, we
+ * never map since we don't care about their contents; they're purely
+ * for GPU-internal use. But we do trace them anyway. */
+
+ if (!(flags & (PAN_BO_INVISIBLE | PAN_BO_DELAY_MMAP)))
+ panfrost_bo_mmap(bo);
+
+ p_atomic_set(&bo->refcnt, 1);
+
+ if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
+ if (flags & PAN_BO_INVISIBLE)
+ pandecode_inject_mmap(bo->gpu, NULL, bo->size, NULL);
+ else if (!(flags & PAN_BO_DELAY_MMAP))
+ pandecode_inject_mmap(bo->gpu, bo->cpu, bo->size, NULL);
+ }
+
+ return bo;
+}
+
+void
+panfrost_bo_reference(struct panfrost_bo *bo)
+{
+ if (bo) {
+ ASSERTED int count = p_atomic_inc_return(&bo->refcnt);
+ assert(count != 1);
+ }
+}
+
+void
+panfrost_bo_unreference(struct panfrost_bo *bo)
+{
+ if (!bo)
+ return;
+
+ /* Don't return to cache if there are still references */
+ if (p_atomic_dec_return(&bo->refcnt))
+ return;
+
+ struct panfrost_device *dev = bo->dev;
+
+ pthread_mutex_lock(&dev->bo_map_lock);
+
+ /* Someone might have imported this BO while we were waiting for the
+ * lock, let's make sure it's still not referenced before freeing it.
+ */
+ if (p_atomic_read(&bo->refcnt) == 0) {
+ /* When the reference count goes to zero, we need to cleanup */
+ panfrost_bo_munmap(bo);
+
+ /* Rather than freeing the BO now, we'll cache the BO for later
+ * allocations if we're allowed to.
+ */
+ if (!panfrost_bo_cache_put(bo))
+ panfrost_bo_free(bo);
+
+ }
+ pthread_mutex_unlock(&dev->bo_map_lock);
+}
+
+struct panfrost_bo *
+panfrost_bo_import(struct panfrost_device *dev, int fd)
+{
+ struct panfrost_bo *bo;
+ struct drm_panfrost_get_bo_offset get_bo_offset = {0,};
+ ASSERTED int ret;
+ unsigned gem_handle;
+
+ ret = drmPrimeFDToHandle(dev->fd, fd, &gem_handle);
+ assert(!ret);
+
+ pthread_mutex_lock(&dev->bo_map_lock);
+ bo = pan_lookup_bo(dev, gem_handle);
+
+ if (!bo->dev) {
+ get_bo_offset.handle = gem_handle;
+ ret = drmIoctl(dev->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset);
+ assert(!ret);
+
+ bo->dev = dev;
+ bo->gpu = (mali_ptr) get_bo_offset.offset;
+ bo->size = lseek(fd, 0, SEEK_END);
+ bo->flags = PAN_BO_SHARED;
+ bo->gem_handle = gem_handle;
+ assert(bo->size > 0);
+ p_atomic_set(&bo->refcnt, 1);
+ // TODO map and unmap on demand?
+ panfrost_bo_mmap(bo);
+ } else {
+ /* bo->refcnt == 0 can happen if the BO
+ * was being released but panfrost_bo_import() acquired the
+ * lock before panfrost_bo_unreference(). In that case, refcnt
+ * is 0 and we can't use panfrost_bo_reference() directly, we
+ * have to re-initialize the refcnt.
+ * Note that panfrost_bo_unreference() checks
+ * refcnt value just after acquiring the lock to
+ * make sure the object is not freed if panfrost_bo_import()
+ * acquired it in the meantime.
+ */
+ if (p_atomic_read(&bo->refcnt) == 0)
+ p_atomic_set(&bo->refcnt, 1);
+ else
+ panfrost_bo_reference(bo);
+ assert(bo->cpu);
+ }
+ pthread_mutex_unlock(&dev->bo_map_lock);
+
+ return bo;
+}
+
+int
+panfrost_bo_export(struct panfrost_bo *bo)
+{
+ struct drm_prime_handle args = {
+ .handle = bo->gem_handle,
+ .flags = DRM_CLOEXEC,
+ };
+
+ int ret = drmIoctl(bo->dev->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args);
+ if (ret == -1)
+ return -1;
+
+ bo->flags |= PAN_BO_SHARED;
+ return args.fd;
+}
+
--- /dev/null
+/*
+ * © Copyright 2019 Alyssa Rosenzweig
+ * © Copyright 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __PAN_BO_H__
+#define __PAN_BO_H__
+
+#include "util/list.h"
+#include "pan_device.h"
+#include <time.h>
+
+/* Flags for allocated memory */
+
+/* This memory region is executable */
+#define PAN_BO_EXECUTE (1 << 0)
+
+/* This memory region should be lazily allocated and grow-on-page-fault. Must
+ * be used in conjunction with INVISIBLE */
+#define PAN_BO_GROWABLE (1 << 1)
+
+/* This memory region should not be mapped to the CPU */
+#define PAN_BO_INVISIBLE (1 << 2)
+
+/* This region may not be used immediately and will not be mmapped at
+ * allocation time (semantically distinct from INVISIBLE, which can never be
+ * mmapped) */
+#define PAN_BO_DELAY_MMAP (1 << 3)
+
+/* BO is shared across processes (imported or exported) and therefore cannot be
+ * cached locally */
+#define PAN_BO_SHARED (1 << 4)
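+
+/* Illustrative flag combinations: a lazily-grown heap BO would be created
+ * with (PAN_BO_GROWABLE | PAN_BO_INVISIBLE), since growable BOs must also be
+ * invisible, while a BO whose CPU mapping is deferred until first use would
+ * pass PAN_BO_DELAY_MMAP. */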
+
+/* GPU access flags */
+
+/* BO is either shared (can be accessed by more than one GPU batch) or private
+ * (reserved by a specific GPU job). */
+#define PAN_BO_ACCESS_PRIVATE (0 << 0)
+#define PAN_BO_ACCESS_SHARED (1 << 0)
+
+/* BO is being read/written by the GPU */
+#define PAN_BO_ACCESS_READ (1 << 1)
+#define PAN_BO_ACCESS_WRITE (1 << 2)
+#define PAN_BO_ACCESS_RW (PAN_BO_ACCESS_READ | PAN_BO_ACCESS_WRITE)
+
+/* BO is accessed by the vertex/tiler job. */
+#define PAN_BO_ACCESS_VERTEX_TILER (1 << 3)
+
+/* BO is accessed by the fragment job. */
+#define PAN_BO_ACCESS_FRAGMENT (1 << 4)
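+
+/* As an illustration, the transient pool (pan_pool.c) tags its BOs with
+ * (PAN_BO_ACCESS_PRIVATE | PAN_BO_ACCESS_RW | PAN_BO_ACCESS_VERTEX_TILER |
+ * PAN_BO_ACCESS_FRAGMENT), since their contents may be read or written by
+ * any job in the batch. */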
+
+struct panfrost_bo {
+ /* Must be first for casting */
+ struct list_head bucket_link;
+
+ /* Used to link the BO to the BO cache LRU list. */
+ struct list_head lru_link;
+
+ /* Stores the time this BO was last used, so the BO cache logic can evict
+ * stale BOs.
+ */
+ time_t last_used;
+
+ /* Atomic reference count */
+ int32_t refcnt;
+
+ struct panfrost_device *dev;
+
+ /* Mapping for the entire object (all levels) */
+ uint8_t *cpu;
+
+ /* GPU address for the object */
+ mali_ptr gpu;
+
+ /* Size of the entire BO */
+ size_t size;
+
+ int gem_handle;
+
+ uint32_t flags;
+
+ /* Combination of PAN_BO_ACCESS_{READ,WRITE} flags encoding pending
+ * GPU accesses to this BO. Useful to avoid calling the WAIT_BO ioctl
+ * when the BO is idle.
+ */
+ uint32_t gpu_access;
+};
+
+bool
+panfrost_bo_wait(struct panfrost_bo *bo, int64_t timeout_ns, bool wait_readers);
+void
+panfrost_bo_reference(struct panfrost_bo *bo);
+void
+panfrost_bo_unreference(struct panfrost_bo *bo);
+struct panfrost_bo *
+panfrost_bo_create(struct panfrost_device *dev, size_t size,
+ uint32_t flags);
+void
+panfrost_bo_mmap(struct panfrost_bo *bo);
+struct panfrost_bo *
+panfrost_bo_import(struct panfrost_device *dev, int fd);
+int
+panfrost_bo_export(struct panfrost_bo *bo);
+void
+panfrost_bo_cache_evict_all(struct panfrost_device *dev);
+
+#endif /* __PAN_BO_H__ */
--- /dev/null
+/**************************************************************************
+ *
+ * Copyright 2018-2019 Alyssa Rosenzweig
+ * Copyright 2018-2019 Collabora, Ltd.
+ * Copyright © 2015 Intel Corporation
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef PAN_DEVICE_H
+#define PAN_DEVICE_H
+
+#include <xf86drm.h>
+#include "renderonly/renderonly.h"
+#include "util/u_dynarray.h"
+#include "util/bitset.h"
+#include "util/list.h"
+#include "util/sparse_array.h"
+
+#include <panfrost-job.h>
+
+/* Driver limits */
+#define PAN_MAX_CONST_BUFFERS 16
+
+/* Transient slab size. This is a trade-off between fragmentation on one hand
+ * and cache locality and ease of bookkeeping on the other */
+
+#define TRANSIENT_SLAB_PAGES (32) /* 128kb */
+#define TRANSIENT_SLAB_SIZE (4096 * TRANSIENT_SLAB_PAGES)
+
+/* Maximum number of transient slabs so we don't need dynamic arrays. Most
+ * interesting Mali boards are 4GB RAM max, so if the entire RAM was filled
+ * with transient slabs, you could never exceed (4GB / TRANSIENT_SLAB_SIZE)
+ * allocations anyway. By capping, we can use a fixed-size bitset for tracking
+ * free slabs, eliminating quite a bit of complexity. We can pack the free
+ * state of 8 slabs into a single byte, so for 128kb transient slabs the bitset
+ * occupies a cheap 4kb of memory */
+
+#define MAX_TRANSIENT_SLABS (1024*1024 / TRANSIENT_SLAB_PAGES)
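+
+/* Spelling out the arithmetic above: 1024*1024 / 32 = 32768 slabs, and a
+ * 32768-bit bitset is exactly 4096 bytes (4 KiB) */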
+
+/* How many power-of-two levels in the BO cache do we want? 2^12
+ * minimum chosen as it is the page size that all allocations are
+ * rounded to */
+
+#define MIN_BO_CACHE_BUCKET (12) /* 2^12 = 4KB */
+#define MAX_BO_CACHE_BUCKET (22) /* 2^22 = 4MB */
+
+/* Fencepost problem, hence the off-by-one */
+#define NR_BO_CACHE_BUCKETS (MAX_BO_CACHE_BUCKET - MIN_BO_CACHE_BUCKET + 1)
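+
+/* With the values above: 22 - 12 + 1 = 11 buckets, covering sizes from 4 KiB
+ * up to 4 MiB */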
+
+/* Cache for blit shaders. Defined here so they can be cached with the device */
+
+enum pan_blit_type {
+ PAN_BLIT_FLOAT = 0,
+ PAN_BLIT_UINT,
+ PAN_BLIT_INT,
+ PAN_BLIT_NUM_TYPES,
+};
+
+#define PAN_BLIT_NUM_TARGETS (12)
+
+struct pan_blit_shaders {
+ struct panfrost_bo *bo;
+ mali_ptr loads[PAN_BLIT_NUM_TARGETS][PAN_BLIT_NUM_TYPES][2];
+};
+
+struct panfrost_device {
+ /* For ralloc */
+ void *memctx;
+
+ int fd;
+
+ /* Properties of the GPU in use */
+ unsigned gpu_id;
+ unsigned core_count;
+ unsigned thread_tls_alloc;
+ unsigned quirks;
+
+ /* Bitmask of supported compressed texture formats */
+ uint32_t compressed_formats;
+
+ /* debug flags, see pan_util.h how to interpret */
+ unsigned debug;
+
+ drmVersionPtr kernel_version;
+
+ struct renderonly *ro;
+
+ pthread_mutex_t bo_map_lock;
+ struct util_sparse_array bo_map;
+
+ struct {
+ pthread_mutex_t lock;
+
+ /* List containing all cached BOs sorted in LRU (Least
+ * Recently Used) order. This allows us to quickly evict BOs
+ * that are more than 1 second old.
+ */
+ struct list_head lru;
+
+ /* The BO cache is a set of buckets with power-of-two sizes
+ * ranging from 2^MIN_BO_CACHE_BUCKET (4096, the page size) to
+ * 2^MAX_BO_CACHE_BUCKET (4 MiB).
+ * Each bucket is a linked list of free panfrost_bo objects. */
+
+ struct list_head buckets[NR_BO_CACHE_BUCKETS];
+ } bo_cache;
+
+ struct pan_blit_shaders blit_shaders;
+};
+
+void
+panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev);
+
+void
+panfrost_close_device(struct panfrost_device *dev);
+
+bool
+panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt);
+
+static inline struct panfrost_bo *
+pan_lookup_bo(struct panfrost_device *dev, uint32_t gem_handle)
+{
+ return util_sparse_array_get(&dev->bo_map, gem_handle);
+}
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors (Collabora):
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ */
+
+#ifndef __PAN_ENCODER_H
+#define __PAN_ENCODER_H
+
+#include <stdbool.h>
+#include "panfrost-job.h"
+
+/* Invocation packing */
+
+void
+panfrost_pack_work_groups_compute(
+ struct mali_vertex_tiler_prefix *out,
+ unsigned num_x,
+ unsigned num_y,
+ unsigned num_z,
+ unsigned size_x,
+ unsigned size_y,
+ unsigned size_z,
+ bool quirk_graphics);
+
+void
+panfrost_pack_work_groups_fused(
+ struct mali_vertex_tiler_prefix *vertex,
+ struct mali_vertex_tiler_prefix *tiler,
+ unsigned num_x,
+ unsigned num_y,
+ unsigned num_z,
+ unsigned size_x,
+ unsigned size_y,
+ unsigned size_z);
+
+/* Tiler structure size computation */
+
+unsigned
+panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy);
+
+unsigned
+panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy);
+
+unsigned
+panfrost_choose_hierarchy_mask(
+ unsigned width, unsigned height,
+ unsigned vertex_count, bool hierarchy);
+
+/* Stack sizes */
+
+unsigned
+panfrost_get_stack_shift(unsigned stack_size);
+
+unsigned
+panfrost_get_total_stack_size(
+ unsigned stack_shift,
+ unsigned threads_per_core,
+ unsigned core_count);
+
+/* Property queries */
+
+
+unsigned panfrost_query_gpu_version(int fd);
+unsigned panfrost_query_core_count(int fd);
+unsigned panfrost_query_thread_tls_alloc(int fd);
+
+const char * panfrost_model_name(unsigned gpu_id);
+
+/* Attributes / instancing */
+
+unsigned
+panfrost_padded_vertex_count(unsigned vertex_count);
+
+unsigned
+panfrost_vertex_instanced(
+ unsigned padded_count,
+ unsigned instance_shift, unsigned instance_odd,
+ unsigned divisor,
+ union mali_attr *attrs);
+
+void panfrost_vertex_id(unsigned padded_count, union mali_attr *attr);
+void panfrost_instance_id(unsigned padded_count, union mali_attr *attr);
+
+/* Samplers */
+
+enum mali_func
+panfrost_flip_compare_func(enum mali_func f);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ */
+
+#include <stdio.h>
+#include "panfrost-job.h"
+#include "pan_texture.h"
+
+/* Convenience */
+
+#define _V PIPE_BIND_VERTEX_BUFFER
+#define _T PIPE_BIND_SAMPLER_VIEW
+#define _R PIPE_BIND_RENDER_TARGET
+#define _Z PIPE_BIND_DEPTH_STENCIL
+#define _VT (_V | _T)
+#define _VTR (_V | _T | _R)
+#define _TZ (_T | _Z)
+
+struct panfrost_format panfrost_pipe_format_table[PIPE_FORMAT_COUNT] = {
+ [PIPE_FORMAT_ETC1_RGB8] = { MALI_ETC2_RGB8, _T },
+ [PIPE_FORMAT_ETC2_RGB8] = { MALI_ETC2_RGB8, _T },
+ [PIPE_FORMAT_ETC2_SRGB8] = { MALI_ETC2_RGB8, _T },
+ [PIPE_FORMAT_ETC2_R11_UNORM] = { MALI_ETC2_R11_UNORM, _T },
+ [PIPE_FORMAT_ETC2_RGBA8] = { MALI_ETC2_RGBA8, _T },
+ [PIPE_FORMAT_ETC2_SRGBA8] = { MALI_ETC2_RGBA8, _T },
+ [PIPE_FORMAT_ETC2_RG11_UNORM] = { MALI_ETC2_RG11_UNORM, _T },
+ [PIPE_FORMAT_ETC2_R11_SNORM] = { MALI_ETC2_R11_SNORM, _T },
+ [PIPE_FORMAT_ETC2_RG11_SNORM] = { MALI_ETC2_RG11_SNORM, _T },
+ [PIPE_FORMAT_ETC2_RGB8A1] = { MALI_ETC2_RGB8A1, _T },
+ [PIPE_FORMAT_ETC2_SRGB8A1] = { MALI_ETC2_RGB8A1, _T },
+
+ [PIPE_FORMAT_DXT1_RGB] = { MALI_BC1_UNORM, _T },
+ [PIPE_FORMAT_DXT1_RGBA] = { MALI_BC1_UNORM, _T },
+ [PIPE_FORMAT_DXT1_SRGB] = { MALI_BC1_UNORM, _T },
+ [PIPE_FORMAT_DXT1_SRGBA] = { MALI_BC1_UNORM, _T },
+ [PIPE_FORMAT_DXT3_RGBA] = { MALI_BC2_UNORM, _T },
+ [PIPE_FORMAT_DXT3_SRGBA] = { MALI_BC2_UNORM, _T },
+ [PIPE_FORMAT_DXT5_RGBA] = { MALI_BC3_UNORM, _T },
+ [PIPE_FORMAT_DXT5_SRGBA] = { MALI_BC3_UNORM, _T },
+
+ [PIPE_FORMAT_RGTC1_UNORM] = { MALI_BC4_UNORM, _T },
+ [PIPE_FORMAT_RGTC1_SNORM] = { MALI_BC4_SNORM, _T },
+ [PIPE_FORMAT_RGTC2_UNORM] = { MALI_BC5_UNORM, _T },
+ [PIPE_FORMAT_RGTC2_SNORM] = { MALI_BC5_SNORM, _T },
+
+ [PIPE_FORMAT_BPTC_RGB_FLOAT] = { MALI_BC6H_SF16, _T },
+ [PIPE_FORMAT_BPTC_RGB_UFLOAT] = { MALI_BC6H_UF16, _T },
+ [PIPE_FORMAT_BPTC_RGBA_UNORM] = { MALI_BC7_UNORM, _T },
+ [PIPE_FORMAT_BPTC_SRGBA] = { MALI_BC7_UNORM, _T },
+
+ [PIPE_FORMAT_ASTC_4x4] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_5x4] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_5x5] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_6x5] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_6x6] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_8x5] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_8x6] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_8x8] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_10x5] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_10x6] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_10x8] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_10x10] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_12x10] = { MALI_ASTC_2D_HDR, _T },
+ [PIPE_FORMAT_ASTC_12x12] = { MALI_ASTC_2D_HDR, _T },
+
+ [PIPE_FORMAT_ASTC_4x4_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_5x4_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_5x5_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_6x5_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_6x6_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_8x5_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_8x6_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_8x8_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_10x5_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_10x6_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_10x8_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_10x10_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_12x10_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_ASTC_12x12_SRGB] = { MALI_ASTC_2D_LDR, _T },
+ [PIPE_FORMAT_B5G6R5_UNORM] = { MALI_RGB565, _VTR },
+ [PIPE_FORMAT_B5G5R5X1_UNORM] = { MALI_RGB5_X1_UNORM, _VT },
+ [PIPE_FORMAT_R5G5B5A1_UNORM] = { MALI_RGB5_A1_UNORM, _VTR },
+
+ [PIPE_FORMAT_R10G10B10X2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR },
+ [PIPE_FORMAT_B10G10R10X2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR },
+ [PIPE_FORMAT_R10G10B10A2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR },
+ [PIPE_FORMAT_B10G10R10A2_UNORM] = { MALI_RGB10_A2_UNORM, _VTR },
+ [PIPE_FORMAT_R10G10B10X2_SNORM] = { MALI_RGB10_A2_SNORM, _VT },
+ [PIPE_FORMAT_R10G10B10A2_SNORM] = { MALI_RGB10_A2_SNORM, _VT },
+ [PIPE_FORMAT_B10G10R10A2_SNORM] = { MALI_RGB10_A2_SNORM, _VT },
+ [PIPE_FORMAT_R10G10B10A2_UINT] = { MALI_RGB10_A2UI, _VTR },
+ [PIPE_FORMAT_B10G10R10A2_UINT] = { MALI_RGB10_A2UI, _VTR },
+ [PIPE_FORMAT_R10G10B10A2_USCALED] = { MALI_RGB10_A2UI, _VTR },
+ [PIPE_FORMAT_B10G10R10A2_USCALED] = { MALI_RGB10_A2UI, _VTR },
+ [PIPE_FORMAT_R10G10B10A2_SINT] = { MALI_RGB10_A2I, _VTR},
+ [PIPE_FORMAT_B10G10R10A2_SINT] = { MALI_RGB10_A2I, _VTR },
+ [PIPE_FORMAT_R10G10B10A2_SSCALED] = { MALI_RGB10_A2I, _VTR },
+ [PIPE_FORMAT_B10G10R10A2_SSCALED] = { MALI_RGB10_A2I, _VTR },
+
+ [PIPE_FORMAT_R8_SSCALED] = { MALI_R8I, _V },
+ [PIPE_FORMAT_R8G8_SSCALED] = { MALI_RG8I, _V },
+ [PIPE_FORMAT_R8G8B8_SSCALED] = { MALI_RGB8I, _V },
+ [PIPE_FORMAT_B8G8R8_SSCALED] = { MALI_RGB8I, _V },
+ [PIPE_FORMAT_R8G8B8A8_SSCALED] = { MALI_RGBA8I, _V },
+ [PIPE_FORMAT_B8G8R8A8_SSCALED] = { MALI_RGBA8I, _V },
+ [PIPE_FORMAT_A8B8G8R8_SSCALED] = { MALI_RGBA8I, _V },
+
+ [PIPE_FORMAT_R8_USCALED] = { MALI_R8UI, _V },
+ [PIPE_FORMAT_R8G8_USCALED] = { MALI_RG8UI, _V },
+ [PIPE_FORMAT_R8G8B8_USCALED] = { MALI_RGB8UI, _V },
+ [PIPE_FORMAT_B8G8R8_USCALED] = { MALI_RGB8UI, _V },
+ [PIPE_FORMAT_R8G8B8A8_USCALED] = { MALI_RGBA8UI, _V },
+ [PIPE_FORMAT_B8G8R8A8_USCALED] = { MALI_RGBA8UI, _V },
+ [PIPE_FORMAT_A8B8G8R8_USCALED] = { MALI_RGBA8UI, _V },
+
+ [PIPE_FORMAT_R16_USCALED] = { MALI_R16UI, _V },
+ [PIPE_FORMAT_R16G16_USCALED] = { MALI_RG16UI, _V },
+ [PIPE_FORMAT_R16G16B16_USCALED] = { MALI_RGB16UI, _V },
+ [PIPE_FORMAT_R16G16B16A16_USCALED] = { MALI_RGBA16UI, _V },
+ [PIPE_FORMAT_R16_SSCALED] = { MALI_R16I, _V },
+ [PIPE_FORMAT_R16G16_SSCALED] = { MALI_RG16I, _V },
+ [PIPE_FORMAT_R16G16B16_SSCALED] = { MALI_RGB16I, _V },
+ [PIPE_FORMAT_R16G16B16A16_SSCALED] = { MALI_RGBA16I, _V },
+
+ [PIPE_FORMAT_R32_USCALED] = { MALI_R32UI, _V },
+ [PIPE_FORMAT_R32G32_USCALED] = { MALI_RG32UI, _V },
+ [PIPE_FORMAT_R32G32B32_USCALED] = { MALI_RGB32UI, _V },
+ [PIPE_FORMAT_R32G32B32A32_USCALED] = { MALI_RGBA32UI, _V },
+ [PIPE_FORMAT_R32_SSCALED] = { MALI_R32I, _V },
+ [PIPE_FORMAT_R32G32_SSCALED] = { MALI_RG32I, _V },
+ [PIPE_FORMAT_R32G32B32_SSCALED] = { MALI_RGB32I, _V },
+ [PIPE_FORMAT_R32G32B32A32_SSCALED] = { MALI_RGBA32I, _V },
+
+ [PIPE_FORMAT_R3G3B2_UNORM] = { MALI_RGB332_UNORM, _VT },
+
+ [PIPE_FORMAT_Z24_UNORM_S8_UINT] = { MALI_Z24X8_UNORM, _TZ },
+ [PIPE_FORMAT_Z24X8_UNORM] = { MALI_Z24X8_UNORM, _TZ },
+ [PIPE_FORMAT_Z32_FLOAT] = { MALI_R32F, _TZ },
+ [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = { MALI_R32F, _TZ },
+ [PIPE_FORMAT_X32_S8X24_UINT] = { MALI_R8UI, _T },
+ [PIPE_FORMAT_X24S8_UINT] = { MALI_RGBA8UI, _TZ },
+ [PIPE_FORMAT_S8_UINT] = { MALI_R8UI, _T },
+
+ [PIPE_FORMAT_R32_FIXED] = { MALI_R32_FIXED, _V },
+ [PIPE_FORMAT_R32G32_FIXED] = { MALI_RG32_FIXED, _V },
+ [PIPE_FORMAT_R32G32B32_FIXED] = { MALI_RGB32_FIXED, _V },
+ [PIPE_FORMAT_R32G32B32A32_FIXED] = { MALI_RGBA32_FIXED, _V },
+
+ [PIPE_FORMAT_R11G11B10_FLOAT] = { MALI_R11F_G11F_B10F, _VTR},
+ [PIPE_FORMAT_R9G9B9E5_FLOAT] = { MALI_R9F_G9F_B9F_E5F, _VT },
+
+ [PIPE_FORMAT_R8_SNORM] = { MALI_R8_SNORM, _VT },
+ [PIPE_FORMAT_R16_SNORM] = { MALI_R16_SNORM, _VT },
+ [PIPE_FORMAT_R32_SNORM] = { MALI_R32_SNORM, _VT },
+ [PIPE_FORMAT_R8G8_SNORM] = { MALI_RG8_SNORM, _VT },
+ [PIPE_FORMAT_R16G16_SNORM] = { MALI_RG16_SNORM, _VT },
+ [PIPE_FORMAT_R32G32_SNORM] = { MALI_RG32_SNORM, _VT },
+ [PIPE_FORMAT_R8G8B8_SNORM] = { MALI_RGB8_SNORM, _VT },
+ [PIPE_FORMAT_R16G16B16_SNORM] = { MALI_RGB16_SNORM, _VT },
+ [PIPE_FORMAT_R32G32B32_SNORM] = { MALI_RGB32_SNORM, _VT },
+ [PIPE_FORMAT_R8G8B8A8_SNORM] = { MALI_RGBA8_SNORM, _VT },
+ [PIPE_FORMAT_R16G16B16A16_SNORM] = { MALI_RGBA16_SNORM, _VT },
+ [PIPE_FORMAT_R32G32B32A32_SNORM] = { MALI_RGBA32_SNORM, _VT },
+
+ [PIPE_FORMAT_A8_SINT] = { MALI_R8I, _VTR },
+ [PIPE_FORMAT_I8_SINT] = { MALI_R8I, _VTR },
+ [PIPE_FORMAT_L8_SINT] = { MALI_R8I, _VTR },
+ [PIPE_FORMAT_L8A8_SINT] = { MALI_RG8I, _VTR },
+ [PIPE_FORMAT_A8_UINT] = { MALI_R8UI, _VTR },
+ [PIPE_FORMAT_I8_UINT] = { MALI_R8UI, _VTR },
+ [PIPE_FORMAT_L8_UINT] = { MALI_R8UI, _VTR },
+ [PIPE_FORMAT_L8A8_UINT] = { MALI_RG8UI, _VTR },
+
+ [PIPE_FORMAT_A16_SINT] = { MALI_R16I, _VTR },
+ [PIPE_FORMAT_I16_SINT] = { MALI_R16I, _VTR },
+ [PIPE_FORMAT_L16_SINT] = { MALI_R16I, _VTR },
+ [PIPE_FORMAT_L16A16_SINT] = { MALI_RG16I, _VTR },
+ [PIPE_FORMAT_A16_UINT] = { MALI_R16UI, _VTR },
+ [PIPE_FORMAT_I16_UINT] = { MALI_R16UI, _VTR },
+ [PIPE_FORMAT_L16_UINT] = { MALI_R16UI, _VTR },
+ [PIPE_FORMAT_L16A16_UINT] = { MALI_RG16UI, _VTR },
+
+ [PIPE_FORMAT_A32_SINT] = { MALI_R32I, _VTR },
+ [PIPE_FORMAT_I32_SINT] = { MALI_R32I, _VTR },
+ [PIPE_FORMAT_L32_SINT] = { MALI_R32I, _VTR },
+ [PIPE_FORMAT_L32A32_SINT] = { MALI_RG32I, _VTR },
+ [PIPE_FORMAT_A32_UINT] = { MALI_R32UI, _VTR },
+ [PIPE_FORMAT_I32_UINT] = { MALI_R32UI, _VTR },
+ [PIPE_FORMAT_L32_UINT] = { MALI_R32UI, _VTR },
+ [PIPE_FORMAT_L32A32_UINT] = { MALI_RG32UI, _VTR },
+
+ [PIPE_FORMAT_B8G8R8_UINT] = { MALI_RGB8UI, _VTR },
+ [PIPE_FORMAT_B8G8R8A8_UINT] = { MALI_RGBA8UI, _VTR },
+ [PIPE_FORMAT_B8G8R8_SINT] = { MALI_RGB8I, _VTR },
+ [PIPE_FORMAT_B8G8R8A8_SINT] = { MALI_RGBA8I, _VTR },
+ [PIPE_FORMAT_A8R8G8B8_UINT] = { MALI_RGBA8UI, _VTR },
+ [PIPE_FORMAT_A8B8G8R8_UINT] = { MALI_RGBA8UI, _VTR },
+
+ [PIPE_FORMAT_R8_UINT] = { MALI_R8UI, _VTR },
+ [PIPE_FORMAT_R16_UINT] = { MALI_R16UI, _VTR },
+ [PIPE_FORMAT_R32_UINT] = { MALI_R32UI, _VTR },
+ [PIPE_FORMAT_R8G8_UINT] = { MALI_RG8UI, _VTR },
+ [PIPE_FORMAT_R16G16_UINT] = { MALI_RG16UI, _VTR },
+ [PIPE_FORMAT_R32G32_UINT] = { MALI_RG32UI, _VTR },
+ [PIPE_FORMAT_R8G8B8_UINT] = { MALI_RGB8UI, _VTR },
+ [PIPE_FORMAT_R16G16B16_UINT] = { MALI_RGB16UI, _VTR },
+ [PIPE_FORMAT_R32G32B32_UINT] = { MALI_RGB32UI, _VTR },
+ [PIPE_FORMAT_R8G8B8A8_UINT] = { MALI_RGBA8UI, _VTR },
+ [PIPE_FORMAT_R16G16B16A16_UINT] = { MALI_RGBA16UI, _VTR },
+ [PIPE_FORMAT_R32G32B32A32_UINT] = { MALI_RGBA32UI, _VTR },
+
+ [PIPE_FORMAT_R32_FLOAT] = { MALI_R32F, _VTR },
+ [PIPE_FORMAT_R32G32_FLOAT] = { MALI_RG32F, _VTR },
+ [PIPE_FORMAT_R32G32B32_FLOAT] = { MALI_RGB32F, _VTR },
+ [PIPE_FORMAT_R32G32B32A32_FLOAT] = { MALI_RGBA32F, _VTR },
+
+ [PIPE_FORMAT_R8_UNORM] = { MALI_R8_UNORM, _VTR },
+ [PIPE_FORMAT_R16_UNORM] = { MALI_R16_UNORM, _VTR },
+ [PIPE_FORMAT_R32_UNORM] = { MALI_R32_UNORM, _VTR },
+ [PIPE_FORMAT_R8G8_UNORM] = { MALI_RG8_UNORM, _VTR },
+ [PIPE_FORMAT_R16G16_UNORM] = { MALI_RG16_UNORM, _VTR },
+ [PIPE_FORMAT_R32G32_UNORM] = { MALI_RG32_UNORM, _VTR },
+ [PIPE_FORMAT_R8G8B8_UNORM] = { MALI_RGB8_UNORM, _VTR },
+ [PIPE_FORMAT_R16G16B16_UNORM] = { MALI_RGB16_UNORM, _VTR },
+ [PIPE_FORMAT_R32G32B32_UNORM] = { MALI_RGB32_UNORM, _VTR },
+ [PIPE_FORMAT_R4G4B4A4_UNORM] = { MALI_RGBA4_UNORM, _VTR },
+ [PIPE_FORMAT_R16G16B16A16_UNORM] = { MALI_RGBA16_UNORM, _VTR },
+ [PIPE_FORMAT_R32G32B32A32_UNORM] = { MALI_RGBA32_UNORM, _VTR },
+
+ [PIPE_FORMAT_B8G8R8A8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_B8G8R8X8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_A8R8G8B8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_X8R8G8B8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_A8B8G8R8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_X8B8G8R8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_R8G8B8X8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_R8G8B8A8_UNORM] = { MALI_RGBA8_UNORM, _VTR },
+
+ [PIPE_FORMAT_R8G8B8X8_SNORM] = { MALI_RGBA8_SNORM, _VT },
+ [PIPE_FORMAT_R8G8B8X8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_R8G8B8X8_UINT] = { MALI_RGBA8UI, _VTR },
+ [PIPE_FORMAT_R8G8B8X8_SINT] = { MALI_RGBA8I, _VTR },
+
+ [PIPE_FORMAT_L8_UNORM] = { MALI_R8_UNORM, _VTR },
+ [PIPE_FORMAT_A8_UNORM] = { MALI_R8_UNORM, _VTR },
+ [PIPE_FORMAT_I8_UNORM] = { MALI_R8_UNORM, _VTR },
+ [PIPE_FORMAT_L8A8_UNORM] = { MALI_RG8_UNORM, _VTR },
+ [PIPE_FORMAT_L16_UNORM] = { MALI_R16_UNORM, _VTR },
+ [PIPE_FORMAT_A16_UNORM] = { MALI_R16_UNORM, _VTR },
+ [PIPE_FORMAT_I16_UNORM] = { MALI_R16_UNORM, _VTR },
+ [PIPE_FORMAT_L16A16_UNORM] = { MALI_RG16_UNORM, _VTR },
+
+ [PIPE_FORMAT_L8_SNORM] = { MALI_R8_SNORM, _VT },
+ [PIPE_FORMAT_A8_SNORM] = { MALI_R8_SNORM, _VT },
+ [PIPE_FORMAT_I8_SNORM] = { MALI_R8_SNORM, _VT },
+ [PIPE_FORMAT_L8A8_SNORM] = { MALI_RG8_SNORM, _VT },
+ [PIPE_FORMAT_L16_SNORM] = { MALI_R16_SNORM, _VT },
+ [PIPE_FORMAT_A16_SNORM] = { MALI_R16_SNORM, _VT },
+ [PIPE_FORMAT_I16_SNORM] = { MALI_R16_SNORM, _VT },
+ [PIPE_FORMAT_L16A16_SNORM] = { MALI_RG16_SNORM, _VT },
+
+ [PIPE_FORMAT_L16_FLOAT] = { MALI_R16F, _VTR },
+ [PIPE_FORMAT_A16_FLOAT] = { MALI_R16F, _VTR },
+ [PIPE_FORMAT_I16_FLOAT] = { MALI_RG16F, _VTR },
+ [PIPE_FORMAT_L16A16_FLOAT] = { MALI_RG16F, _VTR },
+
+ [PIPE_FORMAT_L8_SRGB] = { MALI_R8_UNORM, _VTR },
+ [PIPE_FORMAT_R8_SRGB] = { MALI_R8_UNORM, _VTR },
+ [PIPE_FORMAT_L8A8_SRGB] = { MALI_RG8_UNORM, _VTR },
+ [PIPE_FORMAT_R8G8_SRGB] = { MALI_RG8_UNORM, _VTR },
+ [PIPE_FORMAT_R8G8B8_SRGB] = { MALI_RGB8_UNORM, _VTR },
+ [PIPE_FORMAT_B8G8R8_SRGB] = { MALI_RGB8_UNORM, _VTR },
+ [PIPE_FORMAT_R8G8B8A8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_A8B8G8R8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_X8B8G8R8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_B8G8R8A8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_B8G8R8X8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_A8R8G8B8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
+ [PIPE_FORMAT_X8R8G8B8_SRGB] = { MALI_RGBA8_UNORM, _VTR },
+
+ [PIPE_FORMAT_R8_SINT] = { MALI_R8I, _VTR },
+ [PIPE_FORMAT_R16_SINT] = { MALI_R16I, _VTR },
+ [PIPE_FORMAT_R32_SINT] = { MALI_R32I, _VTR },
+ [PIPE_FORMAT_R16_FLOAT] = { MALI_R16F, _VTR },
+ [PIPE_FORMAT_R8G8_SINT] = { MALI_RG8I, _VTR },
+ [PIPE_FORMAT_R16G16_SINT] = { MALI_RG16I, _VTR },
+ [PIPE_FORMAT_R32G32_SINT] = { MALI_RG32I, _VTR },
+ [PIPE_FORMAT_R16G16_FLOAT] = { MALI_RG16F, _VTR },
+ [PIPE_FORMAT_R8G8B8_SINT] = { MALI_RGB8I, _VTR },
+ [PIPE_FORMAT_R16G16B16_SINT] = { MALI_RGB16I, _VTR },
+ [PIPE_FORMAT_R32G32B32_SINT] = { MALI_RGB32I, _VTR },
+ [PIPE_FORMAT_R16G16B16_FLOAT] = { MALI_RGB16F, _VTR },
+ [PIPE_FORMAT_R8G8B8A8_SINT] = { MALI_RGBA8I, _VTR },
+ [PIPE_FORMAT_R16G16B16A16_SINT] = { MALI_RGBA16I, _VTR },
+ [PIPE_FORMAT_R32G32B32A32_SINT] = { MALI_RGBA32I, _VTR },
+ [PIPE_FORMAT_R16G16B16A16_FLOAT] = { MALI_RGBA16F, _VTR },
+
+ [PIPE_FORMAT_R16G16B16X16_UNORM] = { MALI_RGBA16_UNORM, _VTR },
+ [PIPE_FORMAT_R16G16B16X16_SNORM] = { MALI_RGBA16_SNORM, _VT },
+ [PIPE_FORMAT_R16G16B16X16_FLOAT] = { MALI_RGBA16F, _VTR },
+ [PIPE_FORMAT_R16G16B16X16_UINT] = { MALI_RGBA16UI, _VTR },
+ [PIPE_FORMAT_R16G16B16X16_SINT] = { MALI_RGBA16I, _VTR },
+
+ [PIPE_FORMAT_R32G32B32X32_FLOAT] = { MALI_RGBA32F, _VTR },
+ [PIPE_FORMAT_R32G32B32X32_UINT] = { MALI_RGBA32UI, _VTR },
+ [PIPE_FORMAT_R32G32B32X32_SINT] = { MALI_RGBA32I, _VTR },
+};
+
+#undef _VTR
+#undef _VT
+#undef _TZ
+#undef _V
+#undef _T
+#undef _R
+#undef _Z
+
+/* Is a format encoded like Z24S8 and therefore compatible for render? */
+
+bool
+panfrost_is_z24s8_variant(enum pipe_format fmt)
+{
+ switch (fmt) {
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ case PIPE_FORMAT_Z24X8_UNORM:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* Translate a PIPE swizzle quad to a 12-bit Mali swizzle code. PIPE
+ * swizzles line up with Mali swizzles for the XYZW01, but PIPE swizzles have
+ * an additional "NONE" field that we have to mask out to zero. Additionally,
+ * PIPE swizzles are sparse but Mali swizzles are packed */
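+/* For instance (illustrative), the identity swizzle (X, Y, Z, W) packs to
+ * 0 | (1 << 3) | (2 << 6) | (3 << 9) = 0x688 */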
+
+unsigned
+panfrost_translate_swizzle_4(const unsigned char swizzle[4])
+{
+ unsigned out = 0;
+
+ for (unsigned i = 0; i < 4; ++i) {
+ unsigned translated = (swizzle[i] > PIPE_SWIZZLE_1) ? PIPE_SWIZZLE_0 : swizzle[i];
+ out |= (translated << (3*i));
+ }
+
+ return out;
+}
+
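+/* Computes the inverse of a swizzle permutation: applying the returned
+ * swizzle after the input swizzle recovers the original component order for
+ * the representable components; anything that is not a plain X/Y/Z/W select
+ * decays to zero. For example (illustrative), the inverse of (Z, X, Y, W) is
+ * (Y, Z, X, W). */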
+void
+panfrost_invert_swizzle(const unsigned char *in, unsigned char *out)
+{
+ /* First, default to all zeroes to prevent uninitialized junk */
+
+ for (unsigned c = 0; c < 4; ++c)
+ out[c] = PIPE_SWIZZLE_0;
+
+ /* Now "do" what the swizzle says */
+
+ for (unsigned c = 0; c < 4; ++c) {
+ unsigned char i = in[c];
+
+ /* Who cares? */
+ assert(PIPE_SWIZZLE_X == 0);
+ if (i > PIPE_SWIZZLE_W)
+ continue;
+
+ /* Invert */
+ unsigned idx = i - PIPE_SWIZZLE_X;
+ out[idx] = PIPE_SWIZZLE_X + c;
+ }
+}
+
+enum mali_format
+panfrost_format_to_bifrost_blend(const struct util_format_description *desc)
+{
+ enum mali_format format = panfrost_pipe_format_table[desc->format].hw;
+ assert(format);
+
+ switch (format) {
+ case MALI_RGBA4_UNORM:
+ return MALI_RGBA4;
+ case MALI_RGBA8_UNORM:
+ case MALI_RGB8_UNORM:
+ return MALI_RGBA8_2;
+ case MALI_RGB10_A2_UNORM:
+ return MALI_RGB10_A2_2;
+ default:
+ return format;
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors (Collabora):
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ *
+ */
+
+#include <assert.h>
+#include "util/u_math.h"
+#include "pan_encoder.h"
+
+/* Compute shaders are invoked with a gl_NumWorkGroups X/Y/Z triplet. Vertex
+ * shaders, it turns out, are invoked with the same mechanism, with the triplet
+ * (1, vertex_count, instance_count).
+ *
+ * Alongside this triplet is the gl_WorkGroupSize X/Y/Z triplet.
+ *
+ * Unfortunately, the packing for these triplet into the
+ * mali_vertex_tiler_prefix is a little funky, using a dynamic bitfield. The
+ * routines here exist to pack this */
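+
+/* A worked example (illustrative): a non-instanced draw of 100 vertices is
+ * invoked as the triplet (1, 100, 1) with size (1, 1, 1). Each value is
+ * biased by one (MALI_POSITIVE) and stored in just enough bits: the first
+ * four fields are all zero and occupy zero bits, so 99 (= 100 - 1) lands at
+ * bit 0 and invocation_count ends up as 0x63, with the per-field offsets
+ * recorded in invocation_shifts. */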
+
+void
+panfrost_pack_work_groups_compute(
+ struct mali_vertex_tiler_prefix *out,
+ unsigned num_x,
+ unsigned num_y,
+ unsigned num_z,
+ unsigned size_x,
+ unsigned size_y,
+ unsigned size_z,
+ bool quirk_graphics)
+{
+ uint32_t packed = 0;
+
+ /* The values needing packing, in order, and the corresponding shifts.
+ * Indices into shifts are off-by-one to make the logic easier */
+
+ unsigned shifts[7] = { 0 };
+
+ unsigned values[6] = {
+ MALI_POSITIVE(size_x),
+ MALI_POSITIVE(size_y),
+ MALI_POSITIVE(size_z),
+ MALI_POSITIVE(num_x),
+ MALI_POSITIVE(num_y),
+ MALI_POSITIVE(num_z),
+ };
+
+ for (unsigned i = 0; i < 6; ++i) {
+ /* OR it in, shifting as required */
+ packed |= (values[i] << shifts[i]);
+
+ /* How many bits did we use? */
+ unsigned bit_count = util_logbase2_ceil(values[i] + 1);
+
+ /* Set the next shift accordingly */
+ shifts[i + 1] = shifts[i] + bit_count;
+ }
+
+ /* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift
+ * = 32. This doesn't appear to matter to the hardware, but it's good
+ * to be bit-identical. */
+
+ if (quirk_graphics && (num_z <= 1))
+ shifts[5] = 32;
+
+ /* Quirk: for graphics, workgroups_x_shift_2 must be at least 2,
+ * whereas for OpenCL it is simply equal to workgroups_x_shift. For GL
+ * compute, it is always 2 if no barriers are in use, but is equal to
+ * workgroups_x_shift if barriers are in use. */
+
+ unsigned shift_2 = shifts[3];
+
+ if (quirk_graphics)
+ shift_2 = MAX2(shift_2, 2);
+
+ /* Pack them in */
+ uint32_t packed_shifts =
+ (shifts[1] << 0) |
+ (shifts[2] << 5) |
+ (shifts[3] << 10) |
+ (shifts[4] << 16) |
+ (shifts[5] << 22) |
+ (shift_2 << 28);
+
+ /* Upload the packed bitfields */
+ out->invocation_count = packed;
+ out->invocation_shifts = packed_shifts;
+
+ /* TODO: Compute workgroups_x_shift_3 */
+ out->workgroups_x_shift_3 = shift_2;
+}
+
+/* Packs vertex/tiler descriptors simultaneously */
+void
+panfrost_pack_work_groups_fused(
+ struct mali_vertex_tiler_prefix *vertex,
+ struct mali_vertex_tiler_prefix *tiler,
+ unsigned num_x,
+ unsigned num_y,
+ unsigned num_z,
+ unsigned size_x,
+ unsigned size_y,
+ unsigned size_z)
+{
+ panfrost_pack_work_groups_compute(vertex, num_x, num_y, num_z, size_x, size_y, size_z, true);
+
+ /* Copy results over */
+ tiler->invocation_count = vertex->invocation_count;
+ tiler->invocation_shifts = vertex->invocation_shifts;
+
+ /* Set special fields for each */
+ vertex->workgroups_x_shift_3 = 5;
+ tiler->workgroups_x_shift_3 = 6;
+}
+
--- /dev/null
+/*
+ * © Copyright 2018 Alyssa Rosenzweig
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "util/hash_table.h"
+#include "pan_bo.h"
+#include "pan_pool.h"
+
+/* TODO: What does this actually have to be? */
+#define ALIGNMENT 128
+
+/* Transient command stream pooling: command stream uploads try to simply copy
+ * into wherever we left off. If there isn't space, we allocate a new entry
+ * into the pool and copy there */
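+
+/* Typical (illustrative) usage from a caller: copy a descriptor into the pool
+ * and use the returned GPU address directly in a job payload, e.g.
+ *
+ *    mali_ptr va = panfrost_pool_upload(pool, &desc, sizeof(desc));
+ */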
+
+struct pan_pool
+panfrost_create_pool(void *memctx, struct panfrost_device *dev)
+{
+ struct pan_pool pool = {
+ .dev = dev,
+ .transient_offset = 0,
+ .transient_bo = NULL
+ };
+
+ pool.bos = _mesa_hash_table_create(memctx, _mesa_hash_pointer,
+ _mesa_key_pointer_equal);
+
+
+ return pool;
+}
+
+struct panfrost_transfer
+panfrost_pool_alloc(struct pan_pool *pool, size_t sz)
+{
+ /* Pad the size */
+ sz = ALIGN_POT(sz, ALIGNMENT);
+
+ /* Find or create a suitable BO */
+ struct panfrost_bo *bo = NULL;
+
+ unsigned offset = 0;
+
+ bool fits_in_current = (pool->transient_offset + sz) < TRANSIENT_SLAB_SIZE;
+
+ if (likely(pool->transient_bo && fits_in_current)) {
+ /* We can reuse the current BO, so get it */
+ bo = pool->transient_bo;
+
+ /* Use the specified offset */
+ offset = pool->transient_offset;
+ pool->transient_offset = offset + sz;
+ } else {
+ size_t bo_sz = sz < TRANSIENT_SLAB_SIZE ?
+ TRANSIENT_SLAB_SIZE : ALIGN_POT(sz, 4096);
+
+ /* We can't reuse the current BO, but we can create a new one.
+ * We don't know what the BO will be used for, so let's flag it
+ * RW and attach it to both the fragment and vertex/tiler jobs.
+ * TODO: if we want fine grained BO assignment we should pass
+ * flags to this function and keep the read/write,
+ * fragment/vertex+tiler pools separate.
+ */
+ bo = panfrost_bo_create(pool->dev, bo_sz, 0);
+
+ uintptr_t flags = PAN_BO_ACCESS_PRIVATE |
+ PAN_BO_ACCESS_RW |
+ PAN_BO_ACCESS_VERTEX_TILER |
+ PAN_BO_ACCESS_FRAGMENT;
+
+ _mesa_hash_table_insert(pool->bos, bo, (void *) flags);
+
+ if (sz < TRANSIENT_SLAB_SIZE) {
+ pool->transient_bo = bo;
+ pool->transient_offset = offset + sz;
+ }
+ }
+
+ struct panfrost_transfer ret = {
+ .cpu = bo->cpu + offset,
+ .gpu = bo->gpu + offset,
+ };
+
+ return ret;
+
+}
+
+mali_ptr
+panfrost_pool_upload(struct pan_pool *pool, const void *data, size_t sz)
+{
+ struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sz);
+ memcpy(transfer.cpu, data, sz);
+ return transfer.gpu;
+}
--- /dev/null
+/*
+ * © Copyright 2017-2018 Alyssa Rosenzweig
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __PAN_POOL_H__
+#define __PAN_POOL_H__
+
+#include <stddef.h>
+#include <panfrost-job.h>
+
+/* Represents a pool of memory that can only grow, used to allocate objects
+ * with the same lifetime as the pool itself. In OpenGL, a pool is owned by the
+ * batch for transient structures. In Vulkan, it may be owned by e.g. the
+ * command pool */
+
+struct pan_pool {
+ /* Parent device for allocation */
+ struct panfrost_device *dev;
+
+ /* panfrost_bo -> access_flags owned by the pool */
+ struct hash_table *bos;
+
+ /* Current transient BO */
+ struct panfrost_bo *transient_bo;
+
+ /* Within the topmost transient BO, how much has been used? */
+ unsigned transient_offset;
+};
+
+struct pan_pool
+panfrost_create_pool(void *memctx, struct panfrost_device *dev);
+
+/* Represents a fat pointer for GPU-mapped memory, returned from the transient
+ * allocator and not used for much else */
+
+struct panfrost_transfer {
+ uint8_t *cpu;
+ mali_ptr gpu;
+};
+
+struct panfrost_transfer
+panfrost_pool_alloc(struct pan_pool *pool, size_t sz);
+
+mali_ptr
+panfrost_pool_upload(struct pan_pool *pool, const void *data, size_t sz);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ */
+
+#include <xf86drm.h>
+
+#include "util/u_math.h"
+#include "util/macros.h"
+#include "util/hash_table.h"
+#include "util/u_thread.h"
+#include "drm-uapi/panfrost_drm.h"
+#include "pan_encoder.h"
+#include "pan_device.h"
+#include "panfrost-quirks.h"
+#include "pan_bo.h"
+
+/* Abstraction over the raw drm_panfrost_get_param ioctl for fetching
+ * information about devices */
+
+static __u64
+panfrost_query_raw(
+ int fd,
+ enum drm_panfrost_param param,
+ bool required,
+ unsigned default_value)
+{
+ struct drm_panfrost_get_param get_param = {0,};
+ ASSERTED int ret;
+
+ get_param.param = param;
+ ret = drmIoctl(fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param);
+
+ if (ret) {
+ assert(!required);
+ return default_value;
+ }
+
+ return get_param.value;
+}
+
+unsigned
+panfrost_query_gpu_version(int fd)
+{
+ return panfrost_query_raw(fd, DRM_PANFROST_PARAM_GPU_PROD_ID, true, 0);
+}
+
+unsigned
+panfrost_query_core_count(int fd)
+{
+ /* On older kernels, worst-case to 16 cores */
+
+ unsigned mask = panfrost_query_raw(fd,
+ DRM_PANFROST_PARAM_SHADER_PRESENT, false, 0xffff);
+
+ return util_bitcount(mask);
+}
+
+unsigned
+panfrost_query_thread_tls_alloc(int fd)
+{
+ /* On older kernels, we worst-case to 256 threads, the architectural
+ * maximum for Midgard. On my current kernel/hardware, I'm seeing this
+ * readback as 0, so we'll worst-case there too */
+
+ unsigned tls = panfrost_query_raw(fd,
+ DRM_PANFROST_PARAM_THREAD_TLS_ALLOC, false, 256);
+
+ if (tls)
+ return tls;
+ else
+ return 256;
+}
+
+static uint32_t
+panfrost_query_compressed_formats(int fd)
+{
+ /* If unspecified, assume ASTC/ETC only. Factory default for Juno, and
+ * should exist on any Mali configuration. All hardware should report
+ * these texture formats but the kernel might not be new enough. */
+
+ uint32_t default_set =
+ (1 << MALI_ETC2_RGB8) |
+ (1 << MALI_ETC2_R11_UNORM) |
+ (1 << MALI_ETC2_RGBA8) |
+ (1 << MALI_ETC2_RG11_UNORM) |
+ (1 << MALI_ETC2_R11_SNORM) |
+ (1 << MALI_ETC2_RG11_SNORM) |
+ (1 << MALI_ETC2_RGB8A1) |
+ (1 << MALI_ASTC_3D_LDR) |
+ (1 << MALI_ASTC_3D_HDR) |
+ (1 << MALI_ASTC_2D_LDR) |
+ (1 << MALI_ASTC_2D_HDR);
+
+ return panfrost_query_raw(fd, DRM_PANFROST_PARAM_TEXTURE_FEATURES0,
+ false, default_set);
+}
+
+/* DRM_PANFROST_PARAM_TEXTURE_FEATURES0 will return a bitmask of supported
+ * compressed formats, so we offer a helper to test if a format is supported */
+
+bool
+panfrost_supports_compressed_format(struct panfrost_device *dev, unsigned fmt)
+{
+ if (MALI_EXTRACT_TYPE(fmt) != MALI_FORMAT_COMPRESSED)
+ return true;
+
+ unsigned idx = fmt & ~MALI_FORMAT_COMPRESSED;
+ assert(idx < 32);
+
+ return dev->compressed_formats & (1 << idx);
+}
+
+/* Given a GPU ID like 0x860, return a prettified model name */
+
+const char *
+panfrost_model_name(unsigned gpu_id)
+{
+ switch (gpu_id) {
+ case 0x600: return "Mali T600 (Panfrost)";
+ case 0x620: return "Mali T620 (Panfrost)";
+ case 0x720: return "Mali T720 (Panfrost)";
+ case 0x820: return "Mali T820 (Panfrost)";
+ case 0x830: return "Mali T830 (Panfrost)";
+ case 0x750: return "Mali T760 (Panfrost)";
+ case 0x860: return "Mali T860 (Panfrost)";
+ case 0x880: return "Mali T880 (Panfrost)";
+ case 0x7093: return "Mali G31 (Panfrost)";
+ case 0x7212: return "Mali G52 (Panfrost)";
+ default:
+ unreachable("Invalid GPU ID");
+ }
+}
+
+void
+panfrost_open_device(void *memctx, int fd, struct panfrost_device *dev)
+{
+ dev->fd = fd;
+ dev->memctx = memctx;
+ dev->gpu_id = panfrost_query_gpu_version(fd);
+ dev->core_count = panfrost_query_core_count(fd);
+ dev->thread_tls_alloc = panfrost_query_thread_tls_alloc(fd);
+ dev->kernel_version = drmGetVersion(fd);
+ dev->quirks = panfrost_get_quirks(dev->gpu_id);
+ dev->compressed_formats = panfrost_query_compressed_formats(fd);
+
+ util_sparse_array_init(&dev->bo_map, sizeof(struct panfrost_bo), 512);
+
+ pthread_mutex_init(&dev->bo_cache.lock, NULL);
+ list_inithead(&dev->bo_cache.lru);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(dev->bo_cache.buckets); ++i)
+ list_inithead(&dev->bo_cache.buckets[i]);
+}
+
+void
+panfrost_close_device(struct panfrost_device *dev)
+{
+ panfrost_bo_unreference(dev->blit_shaders.bo);
+ panfrost_bo_cache_evict_all(dev);
+ pthread_mutex_destroy(&dev->bo_cache.lock);
+ drmFreeVersion(dev->kernel_version);
+ util_sparse_array_finish(&dev->bo_map);
+}
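For illustration only, not part of the patch: a minimal probe sketch using the query helpers above. It assumes the helpers are declared in pan_device.h (as elsewhere in this series) and that a render node exists at the usual path; panfrost_model_name() aborts on unknown IDs, which is acceptable for a sketch.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#include "pan_device.h"

int
panfrost_probe_example(void)
{
        int fd = open("/dev/dri/renderD128", O_RDWR | O_CLOEXEC);
        if (fd < 0)
                return -1;

        /* Query the GPU ID and shader core count via the helpers above */
        unsigned gpu_id = panfrost_query_gpu_version(fd);
        unsigned cores = panfrost_query_core_count(fd);

        printf("%s, %u core(s)\n", panfrost_model_name(gpu_id), cores);

        close(fd);
        return 0;
}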
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "pan_encoder.h"
+
+/* Sampler comparison functions are flipped in OpenGL relative to the
+ * hardware, so we need to be able to flip them accordingly */
+
+enum mali_func
+panfrost_flip_compare_func(enum mali_func f)
+{
+ switch (f) {
+ case MALI_FUNC_LESS:
+ return MALI_FUNC_GREATER;
+ case MALI_FUNC_GREATER:
+ return MALI_FUNC_LESS;
+ case MALI_FUNC_LEQUAL:
+ return MALI_FUNC_GEQUAL;
+ case MALI_FUNC_GEQUAL:
+ return MALI_FUNC_LEQUAL;
+ default:
+ return f;
+ }
+}
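For illustration only, not part of the patch: a quick self-check of the flip helper above, showing that ordering comparisons swap direction while symmetric ones pass through. This assumes pan_encoder.h declares the helper and pulls in the mali_func enum, as it does for the definition above.

#include <assert.h>
#include "pan_encoder.h"

static void
panfrost_flip_compare_func_example(void)
{
        /* Ordering functions are mirrored */
        assert(panfrost_flip_compare_func(MALI_FUNC_LEQUAL) == MALI_FUNC_GEQUAL);
        assert(panfrost_flip_compare_func(MALI_FUNC_GREATER) == MALI_FUNC_LESS);

        /* Symmetric functions pass through unchanged */
        assert(panfrost_flip_compare_func(MALI_FUNC_EQUAL) == MALI_FUNC_EQUAL);
}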
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <string.h>
+#include "pan_scoreboard.h"
+#include "pan_device.h"
+#include "panfrost-quirks.h"
+
+/*
+ * There are various types of Mali jobs:
+ *
+ * - WRITE_VALUE: generic write primitive, used to zero tiler field
+ * - VERTEX: runs a vertex shader
+ * - TILER: runs tiling and sets up a fragment shader
+ * - FRAGMENT: runs fragment shaders and writes out
+ * - COMPUTE: runs a compute shader
+ * - FUSED: vertex+tiler fused together, implicit intradependency (Bifrost)
+ * - GEOMETRY: runs a geometry shader (unimplemented)
+ * - CACHE_FLUSH: unseen in the wild, theoretically cache flush
+ *
+ * In between a full batch and a single Mali job is the "job chain", a series
+ * of Mali jobs together forming a linked list. Within the job chain, each Mali
+ * job can set (up to) two dependencies on other earlier jobs in the chain.
+ * This dependency graph forms a scoreboard. The general idea of a scoreboard
+ * applies: when there is a data dependency of job B on job A, job B sets one
+ * of its dependency indices to job A, ensuring that job B won't start until
+ * job A finishes.
+ *
+ * More specifically, here are a set of rules:
+ *
+ * - A write value job must appear if and only if there is at least one tiler
+ * job, and tiler jobs must depend on it.
+ *
+ * - Vertex jobs and tiler jobs are independent.
+ *
+ * - A tiler job must have a dependency on its data source. If it's getting
+ * data from a vertex job, it depends on the vertex job. If it's getting data
+ * from software, this is null.
+ *
+ * - Tiler jobs must depend on the write value job (chained or otherwise).
+ *
+ * - Tiler jobs must be strictly ordered. So each tiler job must depend on the
+ * previous job in the chain.
+ *
+ * - Job linking via next_job has no bearing on the order of execution; it
+ *   just establishes the linked list of jobs, EXCEPT:
+ *
+ * - A job's dependencies must appear earlier in the linked list (job chain).
+ *
+ * Justification for each rule:
+ *
+ * - Write value jobs are used to write a zero into a magic tiling field, which
+ *   enables tiling to work. If tiling occurs, they are needed; if it does not,
+ *   we must not emit them, since tiling would then partially occur, which is
+ *   broken.
+ *
+ * - The hardware has no notion of a combined "vertex/tiler job" (at least not
+ *   on our hardware -- other revisions have fused jobs, which complicates
+ *   matters). They are independent units that take in data, process it, and
+ *   spit out data.
+ *
+ * - Any job must depend on its data source, in fact, or risk a
+ * read-before-write hazard. Tiler jobs get their data from vertex jobs, ergo
+ * tiler jobs depend on the corresponding vertex job (if it's there).
+ *
+ * - The tiler is not thread-safe; this dependency prevents race conditions
+ * between two different jobs trying to write to the tiler outputs at the
+ * same time.
+ *
+ * - Internally, jobs are scoreboarded; the next job fields just form a linked
+ * list to allow the jobs to be read in; the execution order is from
+ * resolving the dependency fields instead.
+ *
+ * - The hardware cannot set a dependency on a job it doesn't know about yet,
+ * and dependencies are processed in-order of the next job fields.
+ *
+ */
+
+/* Generates, uploads, and queues a new job. All fields are written in order
+ * except for next_job accounting (TODO: Should we be clever and defer the
+ * upload of the header here until the next job to keep the access pattern
+ * totally linear? Or is that just a micro-optimization at this point?).
+ * Returns the generated index for dependency management.
+ *
+ * Inject is used to inject a job at the front, for wallpapering. If you are
+ * not wallpapering and set this, dragons will eat you. */
+
+unsigned
+panfrost_new_job(
+ struct pan_pool *pool,
+ struct pan_scoreboard *scoreboard,
+ enum mali_job_type type,
+ bool barrier,
+ unsigned local_dep,
+ void *payload, size_t payload_size,
+ bool inject)
+{
+ unsigned global_dep = 0;
+
+ if (type == JOB_TYPE_TILER) {
+ /* Tiler jobs must be chained, and on Midgard, the first tiler
+ * job must depend on the write value job, whose index we
+ * reserve now */
+
+ if (scoreboard->tiler_dep)
+ global_dep = scoreboard->tiler_dep;
+ else if (!(pool->dev->quirks & IS_BIFROST)) {
+ scoreboard->write_value_index = ++scoreboard->job_index;
+ global_dep = scoreboard->write_value_index;
+ }
+ }
+
+ /* Assign the index */
+ unsigned index = ++scoreboard->job_index;
+
+ struct mali_job_descriptor_header job = {
+ .job_descriptor_size = 1,
+ .job_type = type,
+ .job_barrier = barrier,
+ .job_index = index,
+ .job_dependency_index_1 = local_dep,
+ .job_dependency_index_2 = global_dep,
+ };
+
+ if (inject)
+ job.next_job = scoreboard->first_job;
+
+ struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + payload_size);
+ memcpy(transfer.cpu, &job, sizeof(job));
+ memcpy(transfer.cpu + sizeof(job), payload, payload_size);
+
+ if (inject) {
+ scoreboard->first_job = transfer.gpu;
+ return index;
+ }
+
+ /* Form a chain */
+ if (type == JOB_TYPE_TILER)
+ scoreboard->tiler_dep = index;
+
+ if (scoreboard->prev_job)
+ scoreboard->prev_job->next_job = transfer.gpu;
+ else
+ scoreboard->first_job = transfer.gpu;
+
+ scoreboard->prev_job = (struct mali_job_descriptor_header *) transfer.cpu;
+ return index;
+}
+
+/* Generates a write value job, used to initialize the tiler structures. Note
+ * this is called right before frame submission. */
+
+void
+panfrost_scoreboard_initialize_tiler(struct pan_pool *pool,
+ struct pan_scoreboard *scoreboard,
+ mali_ptr polygon_list)
+{
+ /* Check if we even need tiling */
+ if (pool->dev->quirks & IS_BIFROST || !scoreboard->tiler_dep)
+ return;
+
+ /* Okay, we do. Let's generate it. We'll need the job's polygon list
+ * regardless of size. */
+
+ struct mali_job_descriptor_header job = {
+ .job_type = JOB_TYPE_WRITE_VALUE,
+ .job_index = scoreboard->write_value_index,
+ .job_descriptor_size = 1,
+ .next_job = scoreboard->first_job
+ };
+
+ struct mali_payload_write_value payload = {
+ .address = polygon_list,
+ .value_descriptor = MALI_WRITE_VALUE_ZERO,
+ };
+
+ struct panfrost_transfer transfer = panfrost_pool_alloc(pool, sizeof(job) + sizeof(payload));
+ memcpy(transfer.cpu, &job, sizeof(job));
+ memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));
+
+ scoreboard->first_job = transfer.gpu;
+}
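For illustration only, not part of the patch: a self-contained toy model of the dependency rules described above, using no driver headers. It shows how the first Midgard tiler job reserves an index for the WRITE_VALUE job, and how each tiler job depends on both its vertex job and the previous tiler job (or the reserved WRITE_VALUE index).

#include <stdio.h>

struct toy_scoreboard {
        unsigned job_index;         /* last assigned job index */
        unsigned tiler_dep;         /* previous tiler job, 0 if none yet */
        unsigned write_value_index; /* reserved for the WRITE_VALUE job */
};

static unsigned
toy_add_tiler(struct toy_scoreboard *sb, unsigned vertex_dep)
{
        unsigned global_dep = sb->tiler_dep;

        if (!global_dep) {
                /* First tiler job: reserve the WRITE_VALUE index (Midgard) */
                sb->write_value_index = ++sb->job_index;
                global_dep = sb->write_value_index;
        }

        unsigned index = ++sb->job_index;
        printf("tiler job %u: dep1=%u (vertex), dep2=%u\n",
               index, vertex_dep, global_dep);

        sb->tiler_dep = index;
        return index;
}

int
main(void)
{
        struct toy_scoreboard sb = { 0 };

        unsigned v1 = ++sb.job_index;   /* vertex job, no dependencies */
        toy_add_tiler(&sb, v1);         /* depends on v1 and WRITE_VALUE */

        unsigned v2 = ++sb.job_index;   /* second vertex job */
        toy_add_tiler(&sb, v2);         /* depends on v2 and the first tiler */

        return 0;
}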
--- /dev/null
+/*
+ * Copyright (C) 2019-2020 Collabora Ltd.
+ * Copyright (C) 2019 Alyssa Rosenzweig
+ * Copyright (C) 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __PAN_SCOREBOARD_H__
+#define __PAN_SCOREBOARD_H__
+
+#include "panfrost-job.h"
+#include "pan_pool.h"
+
+struct pan_scoreboard {
+ /* The first job in the batch */
+ mali_ptr first_job;
+
+ /* The number of jobs in the primary batch, essentially */
+ unsigned job_index;
+
+ /* A CPU-side pointer to the previous job for next_job linking */
+ struct mali_job_descriptor_header *prev_job;
+
+ /* The dependency for tiler jobs (i.e. the index of the last emitted
+ * tiler job, or zero if none have been emitted) */
+ unsigned tiler_dep;
+
+ /* The job index of the WRITE_VALUE job (before it has been created) */
+ unsigned write_value_index;
+};
+
+unsigned
+panfrost_new_job(
+ struct pan_pool *pool,
+ struct pan_scoreboard *scoreboard,
+ enum mali_job_type type,
+ bool barrier,
+ unsigned local_dep,
+ void *payload, size_t payload_size,
+ bool inject);
+
+void panfrost_scoreboard_initialize_tiler(
+ struct pan_pool *pool,
+ struct pan_scoreboard *scoreboard,
+ mali_ptr polygon_list);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ */
+
+#include "util/u_math.h"
+#include "pan_encoder.h"
+
+/* Midgard has a small register file, so shaders with high register pressure
+ * need to spill from the register file onto the stack. In addition to
+ * spilling, it is desirable to allocate temporary arrays on the stack (for
+ * instance because the register file does not support indirect access but the
+ * stack does).
+ *
+ * The stack is located in "Thread Local Storage", sometimes abbreviated TLS in
+ * the kernel source code. Thread local storage is allocated per-thread,
+ * per-core, so threads executing concurrently do not interfere with each
+ * other's stacks. On modern kernels, we may query
+ * DRM_PANFROST_PARAM_THREAD_TLS_ALLOC for the number of threads per core we
+ * must allocate for, and DRM_PANFROST_PARAM_SHADER_PRESENT for a bitmask of
+ * shader cores (so take a popcount of that mask for the number of shader
+ * cores). On older kernels that do not support querying these values,
+ * following kbase, we may use the worst-case value of 256 threads for
+ * THREAD_TLS_ALLOC, and the worst-case value of 16 cores for Midgard per the
+ * "shader core count" column of the implementations table in
+ * https://en.wikipedia.org/wiki/Mali_%28GPU%29 [citation needed]
+ *
+ * Within a particular thread, there is stack allocated. If it is present, its
+ * size is a power-of-two, and it is at least 16 bytes. Stack is allocated
+ * with the shared memory descriptor used for all shaders within a frame (note
+ * that they don't execute concurrently so it's fine). So, consider the maximum
+ * stack size used by any shader within a job, and then compute (where npot
+ * denotes the next power of two):
+ *
+ * bytes/thread = npot(max(size, 16))
+ * allocated = (# of bytes/thread) * (# of threads/core) * (# of cores)
+ *
+ * The size of Thread Local Storage is signaled to the GPU in a dedicated
+ * log_stack_size field. Since stack sizes are powers of two, it follows that
+ * log_stack_size is logarithmic in the stack size. Consider some sample values:
+ *
+ * stack size | log_stack_size
+ * ---------------------------
+ * 256 | 4
+ * 512 | 5
+ * 1024 | 6
+ *
+ * Noting that log2(256) = 8, we have the relation:
+ *
+ * stack_size <= 2^(log_stack_size + 4)
+ *
+ * Given the constraints about powers-of-two and the 16-byte minimum, we thus
+ * derive a formula for log_stack_size in terms of stack size (s), where s is
+ * positive:
+ *
+ * log_stack_size = ceil(log2(max(s, 16))) - 4
+ *
+ * There are other valid characterisations of this formula, of course, but this
+ * is computationally simple, so good enough for our purposes. If s=0, since
+ * there is no spilling used whatsoever, we may set log_stack_size to 0 to
+ * disable the stack.
+ */
+
+/* Computes log_stack_size = ceil(log2(max(s, 16))) - 4 */
+
+unsigned
+panfrost_get_stack_shift(unsigned stack_size)
+{
+ if (stack_size)
+ return util_logbase2_ceil(MAX2(stack_size, 16)) - 4;
+ else
+ return 0;
+}
+
+/* Computes the aligned stack size given the shift and thread count. The blob
+ * reserves an extra page, and since this is hardware-internal, we do too. */
+
+unsigned
+panfrost_get_total_stack_size(
+ unsigned stack_shift,
+ unsigned threads_per_core,
+ unsigned core_count)
+{
+ unsigned size_per_thread = MAX2(1 << (stack_shift + 4), 32);
+ unsigned size = size_per_thread * threads_per_core * core_count;
+
+ return size + 4096;
+}
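For illustration only, not part of the patch: a worked example of the two helpers above, assuming they are declared in pan_encoder.h. A shader spilling 300 bytes rounds up to a 512-byte power-of-two stack per thread; on a worst-case Midgard configuration (256 threads/core, 16 cores) the total TLS allocation is 512 * 256 * 16 bytes plus the extra page.

#include <assert.h>
#include "pan_encoder.h"

static void
panfrost_stack_size_example(void)
{
        unsigned shift = panfrost_get_stack_shift(300);
        assert(shift == 5);             /* ceil(log2(max(300, 16))) - 4 */

        unsigned total = panfrost_get_total_stack_size(shift, 256, 16);
        assert(total == 512 * 256 * 16 + 4096);   /* 2 MiB plus one page */
}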
--- /dev/null
+/*
+ * Copyright (C) 2008 VMware, Inc.
+ * Copyright (C) 2014 Broadcom
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig
+ * Copyright (C) 2019-2020 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include "util/macros.h"
+#include "util/u_math.h"
+#include "pan_texture.h"
+
+/* Generates a texture descriptor. Ideally, descriptors are immutable after the
+ * texture is created, so we can keep these hanging around in GPU memory in a
+ * dedicated BO and not have to worry. In practice there are some minor gotchas
+ * with this (the driver sometimes will change the format of a texture on the
+ * fly for compression) but it's fast enough to just regenerate the descriptor
+ * in those cases, rather than monkeypatching at drawtime.
+ *
+ * A texture descriptor consists of a 32-byte mali_texture_descriptor structure
+ * followed by a variable number of pointers. Due to this variance and
+ * potentially large size, we actually upload directly rather than returning
+ * the descriptor. Whether the user does a copy themselves or not is irrelevant
+ * to us here.
+ */
+
+/* Check if we need to set a custom stride by computing the "expected"
+ * stride and comparing it to what the user actually wants. Only applies
+ * to linear textures, since tiled/compressed textures have strict
+ * alignment requirements for their strides as it is */
+
+static bool
+panfrost_needs_explicit_stride(
+ struct panfrost_slice *slices,
+ uint16_t width,
+ unsigned first_level, unsigned last_level,
+ unsigned bytes_per_pixel)
+{
+ for (unsigned l = first_level; l <= last_level; ++l) {
+ unsigned actual = slices[l].stride;
+ unsigned expected = u_minify(width, l) * bytes_per_pixel;
+
+ if (actual != expected)
+ return true;
+ }
+
+ return false;
+}
+
+/* Adaptive Scalable Texture Compression (ASTC) corresponds to just a few
+ * texture types in the hardware, but in fact can be parametrized to have
+ * various widths and heights for the so-called "stretch factor". It turns out
+ * these parameters are stuffed in the bottom bits of the payload pointers.
+ * This function computes these magic stuffing constants based on the ASTC
+ * format in use. The constant in a given dimension is 3 bits, and two are
+ * stored side-by-side for each active dimension.
+ */
+
+static unsigned
+panfrost_astc_stretch(unsigned dim)
+{
+ assert(dim >= 4 && dim <= 12);
+ return MIN2(dim, 11) - 4;
+}
+
+/* Texture addresses are tagged with information about compressed formats.
+ * AFBC uses a bit for whether the colorspace transform is enabled (RGB and
+ * RGBA only).
+ * For ASTC, this is a "stretch factor" encoding the block size. */
+
+static unsigned
+panfrost_compression_tag(
+ const struct util_format_description *desc,
+ enum mali_format format, enum mali_texture_layout layout)
+{
+ if (layout == MALI_TEXTURE_AFBC)
+ return desc->nr_channels >= 3;
+ else if (format == MALI_ASTC_2D_LDR || format == MALI_ASTC_2D_HDR)
+ return (panfrost_astc_stretch(desc->block.height) << 3) |
+ panfrost_astc_stretch(desc->block.width);
+ else
+ return 0;
+}
+
+
+/* Cubemaps have 6 faces as "layers" in between each actual layer. We
+ * need to fix this up. TODO: logic wrong in the asserted out cases ...
+ * can they happen, perhaps from cubemap arrays? */
+
+static void
+panfrost_adjust_cube_dimensions(
+ unsigned *first_face, unsigned *last_face,
+ unsigned *first_layer, unsigned *last_layer)
+{
+ *first_face = *first_layer % 6;
+ *last_face = *last_layer % 6;
+ *first_layer /= 6;
+ *last_layer /= 6;
+
+ assert((*first_layer == *last_layer) || (*first_face == 0 && *last_face == 5));
+}
+
+/* Following the texture descriptor is a number of pointers. How many? */
+
+static unsigned
+panfrost_texture_num_elements(
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ unsigned nr_samples,
+ bool is_cube, bool manual_stride)
+{
+ unsigned first_face = 0, last_face = 0;
+
+ if (is_cube) {
+ panfrost_adjust_cube_dimensions(&first_face, &last_face,
+ &first_layer, &last_layer);
+ }
+
+ unsigned levels = 1 + last_level - first_level;
+ unsigned layers = 1 + last_layer - first_layer;
+ unsigned faces = 1 + last_face - first_face;
+ unsigned num_elements = levels * layers * faces * MAX2(nr_samples, 1);
+
+ if (manual_stride)
+ num_elements *= 2;
+
+ return num_elements;
+}
+
+/* Conservative estimate of the size of the texture payload a priori.
+ * Average case, size equal to the actual size. Worst case, off by 2x (if
+ * a manual stride is not needed on a linear texture). Returned value
+ * must be greater than or equal to the actual size, so it's safe to use
+ * as an allocation amount */
+
+unsigned
+panfrost_estimate_texture_payload_size(
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ unsigned nr_samples,
+ enum mali_texture_type type, enum mali_texture_layout layout)
+{
+ /* Assume worst case */
+ unsigned manual_stride = (layout == MALI_TEXTURE_LINEAR);
+
+ unsigned elements = panfrost_texture_num_elements(
+ first_level, last_level,
+ first_layer, last_layer,
+ nr_samples,
+ type == MALI_TEX_CUBE, manual_stride);
+
+ return sizeof(mali_ptr) * elements;
+}
+
+/* Bifrost requires a tile stride for tiled textures. This stride is computed
+ * as (16 * bpp * width) assuming there is at least one tile (width >= 16).
+ * Otherwise if height <= 16, the blob puts zero. Interactions with AFBC are
+ * currently unknown.
+ */
+
+static unsigned
+panfrost_nonlinear_stride(enum mali_texture_layout layout,
+ unsigned bytes_per_pixel,
+ unsigned width,
+ unsigned height)
+{
+ if (layout == MALI_TEXTURE_TILED) {
+ return (height <= 16) ? 0 : (16 * bytes_per_pixel * ALIGN_POT(width, 16));
+ } else {
+ unreachable("TODO: AFBC on Bifrost");
+ }
+}
+
+static void
+panfrost_emit_texture_payload(
+ mali_ptr *payload,
+ const struct util_format_description *desc,
+ enum mali_format mali_format,
+ enum mali_texture_type type,
+ enum mali_texture_layout layout,
+ unsigned width, unsigned height,
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ unsigned nr_samples,
+ unsigned cube_stride,
+ bool manual_stride,
+ mali_ptr base,
+ struct panfrost_slice *slices)
+{
+ base |= panfrost_compression_tag(desc, mali_format, layout);
+
+ /* Inject the addresses in, interleaving array indices, mip levels,
+ * cube faces, and strides in that order */
+
+ unsigned first_face = 0, last_face = 0, face_mult = 1;
+
+ if (type == MALI_TEX_CUBE) {
+ face_mult = 6;
+ panfrost_adjust_cube_dimensions(&first_face, &last_face, &first_layer, &last_layer);
+ }
+
+ nr_samples = MAX2(nr_samples, 1);
+
+ unsigned idx = 0;
+
+ for (unsigned w = first_layer; w <= last_layer; ++w) {
+ for (unsigned l = first_level; l <= last_level; ++l) {
+ for (unsigned f = first_face; f <= last_face; ++f) {
+ for (unsigned s = 0; s < nr_samples; ++s) {
+ payload[idx++] = base + panfrost_texture_offset(
+ slices, type == MALI_TEX_3D,
+ cube_stride, l, w * face_mult + f, s);
+
+ if (manual_stride) {
+ payload[idx++] = (layout == MALI_TEXTURE_LINEAR) ?
+ slices[l].stride :
+ panfrost_nonlinear_stride(layout,
+ MAX2(desc->block.bits / 8, 1),
+ u_minify(width, l),
+ u_minify(height, l));
+ }
+ }
+ }
+ }
+ }
+}
+
+#define MALI_SWIZZLE_R001 \
+ (MALI_CHANNEL_RED << 0) | \
+ (MALI_CHANNEL_ZERO << 3) | \
+ (MALI_CHANNEL_ZERO << 6) | \
+ (MALI_CHANNEL_ONE << 9)
+
+#define MALI_SWIZZLE_A001 \
+ (MALI_CHANNEL_ALPHA << 0) | \
+ (MALI_CHANNEL_ZERO << 3) | \
+ (MALI_CHANNEL_ZERO << 6) | \
+ (MALI_CHANNEL_ONE << 9)
+
+
+void
+panfrost_new_texture(
+ void *out,
+ uint16_t width, uint16_t height,
+ uint16_t depth, uint16_t array_size,
+ enum pipe_format format,
+ enum mali_texture_type type,
+ enum mali_texture_layout layout,
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ unsigned nr_samples,
+ unsigned cube_stride,
+ unsigned swizzle,
+ mali_ptr base,
+ struct panfrost_slice *slices)
+{
+ const struct util_format_description *desc =
+ util_format_description(format);
+
+ unsigned bytes_per_pixel = util_format_get_blocksize(format);
+
+ enum mali_format mali_format = panfrost_pipe_format_table[desc->format].hw;
+ assert(mali_format);
+
+ bool manual_stride = (layout == MALI_TEXTURE_LINEAR)
+ && panfrost_needs_explicit_stride(slices, width,
+ first_level, last_level, bytes_per_pixel);
+
+ struct mali_texture_descriptor descriptor = {
+ .width = MALI_POSITIVE(u_minify(width, first_level)),
+ .height = MALI_POSITIVE(u_minify(height, first_level)),
+ .depth = MALI_POSITIVE(u_minify(depth, first_level)),
+ .array_size = MALI_POSITIVE(array_size),
+ .format = {
+ .swizzle = (format == PIPE_FORMAT_X24S8_UINT) ?
+ MALI_SWIZZLE_A001 :
+ (format == PIPE_FORMAT_S8_UINT) ?
+ MALI_SWIZZLE_R001 :
+ panfrost_translate_swizzle_4(desc->swizzle),
+ .format = mali_format,
+ .srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB),
+ .type = type,
+ .layout = layout,
+ .manual_stride = manual_stride,
+ .unknown2 = 1,
+ },
+ .levels = last_level - first_level,
+ .swizzle = swizzle
+ };
+
+ memcpy(out, &descriptor, sizeof(descriptor));
+
+ mali_ptr *payload = (mali_ptr *) (out + sizeof(struct mali_texture_descriptor));
+ panfrost_emit_texture_payload(
+ payload,
+ desc,
+ mali_format,
+ type,
+ layout,
+ width, height,
+ first_level, last_level,
+ first_layer, last_layer,
+ nr_samples,
+ cube_stride,
+ manual_stride,
+ base,
+ slices);
+}
+
+void
+panfrost_new_texture_bifrost(
+ struct bifrost_texture_descriptor *descriptor,
+ uint16_t width, uint16_t height,
+ uint16_t depth, uint16_t array_size,
+ enum pipe_format format,
+ enum mali_texture_type type,
+ enum mali_texture_layout layout,
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ unsigned nr_samples,
+ unsigned cube_stride,
+ unsigned swizzle,
+ mali_ptr base,
+ struct panfrost_slice *slices,
+ struct panfrost_bo *payload)
+{
+ const struct util_format_description *desc =
+ util_format_description(format);
+
+ enum mali_format mali_format = panfrost_pipe_format_table[desc->format].hw;
+ assert(mali_format);
+
+ panfrost_emit_texture_payload(
+ (mali_ptr *) payload->cpu,
+ desc,
+ mali_format,
+ type,
+ layout,
+ width, height,
+ first_level, last_level,
+ first_layer, last_layer,
+ nr_samples,
+ cube_stride,
+ true, /* Stride explicit on Bifrost */
+ base,
+ slices);
+
+ descriptor->format_unk = 0x2;
+ descriptor->type = type;
+ descriptor->format = mali_format;
+ descriptor->srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
+ descriptor->format_unk3 = 0x0;
+ descriptor->width = MALI_POSITIVE(u_minify(width, first_level));
+ descriptor->height = MALI_POSITIVE(u_minify(height, first_level));
+ descriptor->swizzle = swizzle;
+ descriptor->layout = layout;
+ descriptor->levels = last_level - first_level;
+ descriptor->unk1 = 0x0;
+ descriptor->levels_unk = 0;
+ descriptor->level_2 = last_level - first_level;
+ descriptor->payload = payload->gpu;
+ descriptor->array_size = MALI_POSITIVE(array_size);
+ descriptor->unk4 = 0x0;
+ descriptor->depth = MALI_POSITIVE(u_minify(depth, first_level));
+ descriptor->unk5 = 0x0;
+}
+
+/* Computes sizes for checksumming, which is 8 bytes per 16x16 tile.
+ * Checksumming is believed to be a CRC variant (CRC64 based on the size?).
+ * This feature is also known as "transaction elimination". */
+
+#define CHECKSUM_TILE_WIDTH 16
+#define CHECKSUM_TILE_HEIGHT 16
+#define CHECKSUM_BYTES_PER_TILE 8
+
+unsigned
+panfrost_compute_checksum_size(
+ struct panfrost_slice *slice,
+ unsigned width,
+ unsigned height)
+{
+ unsigned aligned_width = ALIGN_POT(width, CHECKSUM_TILE_WIDTH);
+ unsigned aligned_height = ALIGN_POT(height, CHECKSUM_TILE_HEIGHT);
+
+ unsigned tile_count_x = aligned_width / CHECKSUM_TILE_WIDTH;
+ unsigned tile_count_y = aligned_height / CHECKSUM_TILE_HEIGHT;
+
+ slice->checksum_stride = tile_count_x * CHECKSUM_BYTES_PER_TILE;
+
+ return slice->checksum_stride * tile_count_y;
+}
+
+unsigned
+panfrost_get_layer_stride(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level)
+{
+ return is_3d ? slices[level].size0 : cube_stride;
+}
+
+/* Computes the offset into a texture at a particular level/face. Add to
+ * the base address of a texture to get the address to that level/face */
+
+unsigned
+panfrost_texture_offset(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level, unsigned face, unsigned sample)
+{
+ unsigned layer_stride = panfrost_get_layer_stride(slices, is_3d, cube_stride, level);
+ return slices[level].offset + (face * layer_stride) + (sample * slices[level].size0);
+}
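For illustration only, not part of the patch: a worked example of the payload estimate above for a 2D linear texture with mip levels 0..4, a single layer, and one sample. Five levels, each conservatively assumed to need an explicit stride entry, gives 5 * 2 pointers. MALI_TEX_2D is assumed to be the 2D value of enum mali_texture_type from panfrost-job.h.

#include <assert.h>
#include "pan_texture.h"

static void
panfrost_texture_payload_example(void)
{
        unsigned bytes = panfrost_estimate_texture_payload_size(
                0, 4,                   /* first_level, last_level */
                0, 0,                   /* first_layer, last_layer */
                1,                      /* nr_samples */
                MALI_TEX_2D, MALI_TEXTURE_LINEAR);

        /* 5 levels * (pointer + stride) = 10 mali_ptr entries */
        assert(bytes == 5 * 2 * sizeof(mali_ptr));
}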
--- /dev/null
+/*
+ * Copyright (C) 2008 VMware, Inc.
+ * Copyright (C) 2014 Broadcom
+ * Copyright (C) 2018-2019 Alyssa Rosenzweig
+ * Copyright (C) 2019-2020 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __PAN_TEXTURE_H
+#define __PAN_TEXTURE_H
+
+#include <stdbool.h>
+#include "util/format/u_format.h"
+#include "compiler/shader_enums.h"
+#include "panfrost-job.h"
+#include "pan_bo.h"
+
+struct panfrost_slice {
+ unsigned offset;
+ unsigned stride;
+ unsigned size0;
+
+ /* If there is a header preceding each slice, how big is
+ * that header? Used for AFBC */
+ unsigned header_size;
+
+ /* If checksumming is enabled following the slice, what
+ * is its offset/stride? */
+ unsigned checksum_offset;
+ unsigned checksum_stride;
+ struct panfrost_bo *checksum_bo;
+
+ /* Has anything been written to this slice? */
+ bool initialized;
+};
+
+struct pan_image {
+ /* Format and size */
+ uint16_t width0, height0, depth0, array_size;
+ enum pipe_format format;
+ enum mali_texture_type type;
+ unsigned first_level, last_level;
+ unsigned first_layer, last_layer;
+ unsigned nr_samples;
+ struct panfrost_bo *bo;
+ struct panfrost_slice *slices;
+ unsigned cubemap_stride;
+ enum mali_texture_layout layout;
+};
+
+unsigned
+panfrost_compute_checksum_size(
+ struct panfrost_slice *slice,
+ unsigned width,
+ unsigned height);
+
+/* AFBC */
+
+bool
+panfrost_format_supports_afbc(enum pipe_format format);
+
+unsigned
+panfrost_afbc_header_size(unsigned width, unsigned height);
+
+/* mali_texture_descriptor */
+
+unsigned
+panfrost_estimate_texture_payload_size(
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ unsigned nr_samples,
+ enum mali_texture_type type, enum mali_texture_layout layout);
+
+void
+panfrost_new_texture(
+ void *out,
+ uint16_t width, uint16_t height,
+ uint16_t depth, uint16_t array_size,
+ enum pipe_format format,
+ enum mali_texture_type type,
+ enum mali_texture_layout layout,
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ unsigned nr_samples,
+ unsigned cube_stride,
+ unsigned swizzle,
+ mali_ptr base,
+ struct panfrost_slice *slices);
+
+void
+panfrost_new_texture_bifrost(
+ struct bifrost_texture_descriptor *descriptor,
+ uint16_t width, uint16_t height,
+ uint16_t depth, uint16_t array_size,
+ enum pipe_format format,
+ enum mali_texture_type type,
+ enum mali_texture_layout layout,
+ unsigned first_level, unsigned last_level,
+ unsigned first_layer, unsigned last_layer,
+ unsigned nr_samples,
+ unsigned cube_stride,
+ unsigned swizzle,
+ mali_ptr base,
+ struct panfrost_slice *slices,
+ struct panfrost_bo *payload);
+
+
+unsigned
+panfrost_get_layer_stride(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level);
+
+unsigned
+panfrost_texture_offset(struct panfrost_slice *slices, bool is_3d, unsigned cube_stride, unsigned level, unsigned face, unsigned sample);
+
+/* Formats */
+
+struct panfrost_format {
+ enum mali_format hw;
+ unsigned bind;
+};
+
+extern struct panfrost_format panfrost_pipe_format_table[PIPE_FORMAT_COUNT];
+
+bool
+panfrost_is_z24s8_variant(enum pipe_format fmt);
+
+unsigned
+panfrost_translate_swizzle_4(const unsigned char swizzle[4]);
+
+void
+panfrost_invert_swizzle(const unsigned char *in, unsigned char *out);
+
+static inline unsigned
+panfrost_get_default_swizzle(unsigned components)
+{
+ switch (components) {
+ case 1:
+ return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_ZERO << 3) |
+ (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9);
+ case 2:
+ return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) |
+ (MALI_CHANNEL_ZERO << 6) | (MALI_CHANNEL_ONE << 9);
+ case 3:
+ return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) |
+ (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ONE << 9);
+ case 4:
+ return (MALI_CHANNEL_RED << 0) | (MALI_CHANNEL_GREEN << 3) |
+ (MALI_CHANNEL_BLUE << 6) | (MALI_CHANNEL_ALPHA << 9);
+ default:
+ unreachable("Invalid number of components");
+ }
+}
+
+static inline unsigned
+panfrost_bifrost_swizzle(unsigned components)
+{
+ /* Set all components to 0 and force w if needed */
+ return components < 4 ? 0x10 : 0x00;
+}
+
+enum mali_format
+panfrost_format_to_bifrost_blend(const struct util_format_description *desc);
+
+struct pan_pool;
+struct pan_scoreboard;
+
+void
+panfrost_init_blit_shaders(struct panfrost_device *dev);
+
+void
+panfrost_load_midg(
+ struct pan_pool *pool,
+ struct pan_scoreboard *scoreboard,
+ mali_ptr blend_shader,
+ mali_ptr fbd,
+ mali_ptr coordinates, unsigned vertex_count,
+ struct pan_image *image,
+ unsigned loc);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2019 Collabora, Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
+ */
+
+#include "util/u_math.h"
+#include "util/macros.h"
+#include "pan_encoder.h"
+
+/* Mali GPUs are tiled-mode renderers, rather than immediate-mode.
+ * Conceptually, the screen is divided into 16x16 tiles. Vertex shaders run.
+ * Then, a fixed-function hardware block (the tiler) consumes the gl_Position
+ * results. For each triangle specified, it marks each containing tile as
+ * containing that triangle. This set of "triangles per tile" form the "polygon
+ * list". Finally, the rasterization unit consumes the polygon list to invoke
+ * the fragment shader.
+ *
+ * In practice, it's a bit more complicated than this. On Midgard chips with an
+ * "advanced tiling unit" (all except T720/T820/T830), 16x16 is the logical
+ * tile size, but Midgard features "hierarchical tiling", where power-of-two
+ * multiples of the base tile size can be used: hierarchy level 0 (16x16),
+ * level 1 (32x32), level 2 (64x64), per public information about Midgard's
+ * tiling. In fact, tiling goes up to 4096x4096 (!), although in practice
+ * 128x128 is the largest usually used (though higher modes are enabled). The
+ * idea behind hierarchical tiling is to use low tiling levels for small
+ * triangles and high levels for large triangles, to minimize memory bandwidth
+ * and repeated fragment shader invocations (the former issue inherent to
+ * immediate-mode rendering and the latter common in traditional tilers).
+ *
+ * The tiler itself works by reading varyings in and writing a polygon list
+ * out. Unfortunately (for us), both of these buffers are managed in main
+ * memory; although they ideally will be cached, it is the drivers'
+ * responsibility to allocate these buffers. Varying buffer allocation is
+ * handled elsewhere, as it is not tiler specific; the real issue is allocating
+ * the polygon list.
+ *
+ * This is hard, because from the driver's perspective, we have no information
+ * about what geometry will actually look like on screen; that information is
+ * only gained from running the vertex shader. (Theoretically, we could run the
+ * vertex shaders in software as a prepass, or in hardware with transform
+ * feedback as a prepass, but either idea is ludicrous on so many levels).
+ *
+ * Instead, Mali uses a bit of a hybrid approach, splitting the polygon list
+ * into three distinct pieces. First, the driver statically determines which
+ * tile hierarchy levels to use (more on that later). At this point, we know the
+ * framebuffer dimensions and all the possible tilings of the framebuffer, so
+ * we know exactly how many tiles exist across all hierarchy levels. The first
+ * piece of the polygon list is the header, which is exactly 8 bytes per tile,
+ * plus padding and a small 64-byte prologue. (If that doesn't remind you of
+ * AFBC, it should. See pan_afbc.c for some fun parallels). The next part is
+ * the polygon list body, which seems to contain 512 bytes per tile, again
+ * across every level of the hierarchy. These two parts form the polygon list
+ * buffer. This buffer has a statically determinable size, approximately equal
+ * to the # of tiles across all hierarchy levels * (8 bytes + 512 bytes), plus
+ * alignment / minimum restrictions / etc.
+ *
+ * The third piece is the easy one (for us): the tiler heap. In essence, the
+ * tiler heap is a gigantic slab that's as big as could possibly be necessary
+ * in the worst case imaginable. Just... a gigantic allocation that we give a
+ * start and end pointer to. What's the catch? The tiler heap is lazily
+ * allocated; that is, a huge amount of memory is _reserved_, but only a tiny
+ * bit is actually allocated upfront. The GPU just keeps using the
+ * unallocated-but-reserved portions as it goes along, generating page faults
+ * if it goes beyond the allocation, and then the kernel is instructed to
+ * expand the allocation on page fault (known in the vendor kernel as growable
+ * memory). This is quite a bit of bookkeeping of its own, but that task is
+ * pushed to kernel space and we can mostly ignore it here, just remembering to
+ * set the GROWABLE flag so the kernel actually uses this path rather than
+ * allocating a gigantic amount up front and burning a hole in RAM.
+ *
+ * As far as determining which hierarchy levels to use, the simple answer is
+ * that right now, we don't. In the tiler configuration fields (consistent from
+ * the earliest Midgard's SFBD through the latest Bifrost traces we have),
+ * there is a hierarchy_mask field, controlling which levels (tile sizes) are
+ * enabled. Ideally, the hierarchical tiling dream -- mapping big polygons to
+ * big tiles and small polygons to small tiles -- would be realized here as
+ * well. As long as there are polygons at all needing tiling, we always have to
+ * have big tiles available, in case there are big polygons. But we don't
+ * necessarily need small tiles available. Ideally, when there are small
+ * polygons, small tiles are enabled (to avoid waste from putting small
+ * triangles in the big tiles); when there are not, small tiles are disabled to
+ * avoid enabling more levels than necessary, which potentially costs in memory
+ * bandwidth / power / tiler performance.
+ *
+ * Of course, the driver has to figure this out statically. When tile
+ * hierarchies are actually established, this is done by the tiler in
+ * fixed-function hardware, after the vertex shaders have run and there is
+ * sufficient information to figure out the size of triangles. The driver has
+ * no such luxury, again barring insane hacks like additionally running the
+ * vertex shaders in software or in hardware via transform feedback. Thus, for
+ * the driver, we need a heuristic approach.
+ *
+ * There are lots of heuristics to guess triangle size statically you could
+ * imagine, but one approach shines as particularly simple-stupid: assume all
+ * on-screen triangles are equal size and spread equidistantly throughout the
+ * screen. Let's be clear, this is NOT A VALID ASSUMPTION. But if we roll with
+ * it, then we see:
+ *
+ * Triangle Area = (Screen Area / # of triangles)
+ * = (Width * Height) / (# of triangles)
+ *
+ * Or if you prefer, we can also make a third CRAZY assumption that we only draw
+ * right triangles with edges parallel/perpendicular to the sides of the screen
+ * with no overdraw, forming a triangle grid across the screen:
+ *
+ * |--w--|
+ * _____ |
+ * | /| /| |
+ * |/_|/_| h
+ * | /| /| |
+ * |/_|/_| |
+ *
+ * Then you can use some middle school geometry and algebra to work out the
+ * triangle dimensions. I started working on this, but realised I didn't need
+ * to in order to make my point, and couldn't bear to erase that ASCII art.
+ * Anyway.
+ *
+ * POINT IS, by considering the ratio of screen area and triangle count, we can
+ * estimate the triangle size. For a small size, use small bins; for a large
+ * size, use large bins. Intuitively, this metric makes sense: when there are
+ * few triangles on a large screen, you're probably compositing a UI and
+ * therefore the triangles are large; when there are a lot of triangles on a
+ * small screen, you're probably rendering a 3D mesh and therefore the
+ * triangles are tiny. (Or better said -- there will be tiny triangles, even if
+ * there are also large triangles. There have to be unless you expect crazy
+ * overdraw. Generally, it's better to allow more small bin sizes than
+ * necessary than not allow enough.)
+ *
+ * From this heuristic (or whatever), we determine the minimum allowable tile
+ * size, and we use that to decide the hierarchy masking, selecting from the
+ * minimum "ideal" tile size to the maximum tile size (2048x2048 in practice).
+ *
+ * Once we have that mask and the framebuffer dimensions, we can compute the
+ * size of the statically-sized polygon list structures, allocate them, and go!
+ *
+ * -----
+ *
+ * On T720, T820, and T830, there is no support for hierarchical tiling.
+ * Instead, the hardware allows the driver to select the tile size dynamically
+ * on a per-framebuffer basis, including allowing rectangular/non-square tiles.
+ * Rules for tile size selection are as follows:
+ *
+ * - Dimensions must be powers-of-two.
+ * - The smallest tile is 16x16.
+ * - The tile width/height is at most the framebuffer w/h (clamp up to 16 pix)
+ * - There must be no more than 64 tiles in either dimension.
+ *
+ * Within these constraints, the driver is free to pick a tile size according
+ * to some heuristic, similar to units with an advanced tiling unit.
+ *
+ * To pick a size without any heuristics, we may satisfy the constraints by
+ * defaulting to 16x16 (a power-of-two). This fits the minimum. For the size
+ * constraint, consider:
+ *
+ * # of tiles < 64
+ * ceil (fb / tile) < 64
+ * (fb / tile) <= (64 - 1)
+ * tile >= fb / (64 - 1)
+ *
+ * Hence we clamp the tile size up to align_pot(fb / (64 - 1)), which satisfies
+ * the bound since rounding up to a power-of-two only makes the tile larger.
+ *
+ * Extending this to use a selection heuristic is left for future work.
+ *
+ * Once the tile size (w, h) is chosen, we compute the hierarchy "mask":
+ *
+ * hierarchy_mask = (log2(h / 16) << 6) | log2(w / 16)
+ *
+ * Of course with no hierarchical tiling, this is not a mask; it's just a field
+ * specifying the tile size. But I digress.
+ *
+ * We also compute the polygon list sizes (with framebuffer size W, H) as:
+ *
+ * full_size = 0x200 + 0x200 * ceil(W / w) * ceil(H / h)
+ * offset = 8 * ceil(W / w) * ceil(H / h)
+ *
+ * It further appears necessary to round down offset to the nearest 0x200.
+ * Possibly we would also round down full_size to the nearest 0x200 but
+ * full_size/0x200 = (1 + ceil(W / w) * ceil(H / h)) is an integer so there's
+ * nothing to do.
+ */
+
+/* Hierarchical tiling spans from 16x16 to 4096x4096 tiles */
+
+#define MIN_TILE_SIZE 16
+#define MAX_TILE_SIZE 4096
+
+/* Constants as shifts for easier power-of-two iteration */
+
+#define MIN_TILE_SHIFT util_logbase2(MIN_TILE_SIZE)
+#define MAX_TILE_SHIFT util_logbase2(MAX_TILE_SIZE)
+
+/* The hierarchy has a 64-byte prologue */
+#define PROLOGUE_SIZE 0x40
+
+/* For each tile (across all hierarchy levels), there is 8 bytes of header */
+#define HEADER_BYTES_PER_TILE 0x8
+
+/* Likewise, each tile per level has 512 bytes of body */
+#define FULL_BYTES_PER_TILE 0x200
+
+/* If the width-x-height framebuffer is divided into tile_size-x-tile_size
+ * tiles, how many tiles are there? Rounding up in each direction. For the
+ * special case of tile_size=16, this aligns with the usual Midgard count.
+ * tile_size must be a power-of-two. This is not really repeated code from
+ * AFBC/checksum, because those care about the stride (not just the overall
+ * count) and only at a fixed tile size (not any of a number of power-of-twos) */
+
+static unsigned
+pan_tile_count(unsigned width, unsigned height, unsigned tile_width, unsigned tile_height)
+{
+ unsigned aligned_width = ALIGN_POT(width, tile_width);
+ unsigned aligned_height = ALIGN_POT(height, tile_height);
+
+ unsigned tile_count_x = aligned_width / tile_width;
+ unsigned tile_count_y = aligned_height / tile_height;
+
+ return tile_count_x * tile_count_y;
+}
+
+/* Given the hierarchy mask, computes the size of the polygon list header. We
+ * iterate the tile sizes (16x16 through 2048x2048). For each enabled tile
+ * size, we figure out how many tiles there are at this hierarchy level and
+ * therefore how many bytes this level occupies, leaving us with a byte count
+ * for each level. We then just sum up the byte counts across the levels to
+ * find a byte count for all levels. */
+
+static unsigned
+panfrost_hierarchy_size(
+ unsigned width,
+ unsigned height,
+ unsigned mask,
+ unsigned bytes_per_tile)
+{
+ unsigned size = PROLOGUE_SIZE;
+
+ /* Iterate hierarchy levels */
+
+ for (unsigned b = 0; b < (MAX_TILE_SHIFT - MIN_TILE_SHIFT); ++b) {
+ /* Check if this level is enabled */
+ if (!(mask & (1 << b)))
+ continue;
+
+ /* Shift from a level to a tile size */
+ unsigned tile_size = (1 << b) * MIN_TILE_SIZE;
+
+ unsigned tile_count = pan_tile_count(width, height, tile_size, tile_size);
+ unsigned level_count = bytes_per_tile * tile_count;
+
+ size += level_count;
+ }
+
+ /* This size will be used as an offset, so ensure it's aligned */
+ return ALIGN_POT(size, 0x200);
+}
+
+/* Implement the formula:
+ *
+ * 0x200 + bytes_per_tile * ceil(W / w) * ceil(H / h)
+ *
+ * rounding down the answer to the nearest 0x200. This is used to compute both
+ * header and body sizes for GPUs without hierarchical tiling. Essentially,
+ * computing a single hierarchy level, since there isn't any hierarchy!
+ */
+
+static unsigned
+panfrost_flat_size(unsigned width, unsigned height, unsigned dim, unsigned bytes_per_tile)
+{
+ /* First, extract the tile dimensions */
+
+ unsigned tw = (1 << (dim & 0b111)) * 8;
+ unsigned th = (1 << ((dim & (0b111 << 6)) >> 6)) * 8;
+
+ /* tile_count is ceil(W/w) * ceil(H/h) */
+ unsigned raw = pan_tile_count(width, height, tw, th) * bytes_per_tile;
+
+ /* Round down and add offset */
+ return 0x200 + ((raw / 0x200) * 0x200);
+}
+
+/* Given a hierarchy mask and a framebuffer size, compute the header size */
+
+unsigned
+panfrost_tiler_header_size(unsigned width, unsigned height, unsigned mask, bool hierarchy)
+{
+ if (hierarchy)
+ return panfrost_hierarchy_size(width, height, mask, HEADER_BYTES_PER_TILE);
+ else
+ return panfrost_flat_size(width, height, mask, HEADER_BYTES_PER_TILE);
+}
+
+/* The combined header/body is sized similarly (but it is significantly
+ * larger), except that it can be empty when the tiler is disabled, rather than
+ * getting clamped to a minimum size.
+ */
+
+unsigned
+panfrost_tiler_full_size(unsigned width, unsigned height, unsigned mask, bool hierarchy)
+{
+ if (hierarchy)
+ return panfrost_hierarchy_size(width, height, mask, FULL_BYTES_PER_TILE);
+ else
+ return panfrost_flat_size(width, height, mask, FULL_BYTES_PER_TILE);
+}
+
+/* On GPUs without hierarchical tiling, we choose a tile size directly and
+ * stuff it into the field otherwise known as hierarchy mask (not a mask). */
+
+static unsigned
+panfrost_choose_tile_size(
+ unsigned width, unsigned height, unsigned vertex_count)
+{
+ /* Figure out the ideal tile size. Eventually a heuristic should be
+ * used for this */
+
+ unsigned best_w = 16;
+ unsigned best_h = 16;
+
+ /* Clamp so there are fewer than 64 tiles in each direction */
+
+ best_w = MAX2(best_w, util_next_power_of_two(width / 63));
+ best_h = MAX2(best_h, util_next_power_of_two(height / 63));
+
+ /* We have our ideal tile size, so encode */
+
+ unsigned exp_w = util_logbase2(best_w / 16);
+ unsigned exp_h = util_logbase2(best_h / 16);
+
+ return exp_w | (exp_h << 6);
+}
+
+/* In the future, a heuristic to choose a tiler hierarchy mask would go here.
+ * At the moment, we just default to 0xFF, which enables all possible hierarchy
+ * levels. Overall this yields good performance but presumably incurs a cost in
+ * memory bandwidth / power consumption / etc, at least on smaller scenes that
+ * don't really need all the smaller levels enabled */
+
+unsigned
+panfrost_choose_hierarchy_mask(
+ unsigned width, unsigned height,
+ unsigned vertex_count, bool hierarchy)
+{
+ /* If there is no geometry, we don't bother enabling anything */
+
+ if (!vertex_count)
+ return 0x00;
+
+ if (!hierarchy)
+ return panfrost_choose_tile_size(width, height, vertex_count);
+
+ /* Otherwise, default everything on. TODO: Proper tests */
+
+ return 0xFF;
+}
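For illustration only, not part of the patch: a worked example of the sizing helpers above, assuming they are declared in pan_encoder.h. For a 256x256 framebuffer with every hierarchy level enabled (mask 0xFF), the per-level tile counts are 256 + 64 + 16 + 4 + 1 + 1 + 1 + 1 = 344, so the header is 344 * 8 bytes plus the 64-byte prologue, aligned up to 0x200.

#include <assert.h>
#include "pan_encoder.h"

static void
panfrost_tiler_size_example(void)
{
        /* With no geometry, no hierarchy levels need to be enabled */
        assert(panfrost_choose_hierarchy_mask(256, 256, 0, true) == 0x00);

        /* With geometry, every level defaults on and the header is 3 KiB */
        assert(panfrost_tiler_header_size(256, 256, 0xFF, true) == 3072);
}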
--- /dev/null
+/**************************************************************************
+ *
+ * Copyright 2019 Collabora, Ltd.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef PAN_UTIL_H
+#define PAN_UTIL_H
+
+#define PAN_DBG_MSGS 0x0001
+#define PAN_DBG_TRACE 0x0002
+#define PAN_DBG_DEQP 0x0004
+#define PAN_DBG_AFBC 0x0008
+#define PAN_DBG_SYNC 0x0010
+#define PAN_DBG_PRECOMPILE 0x0020
+#define PAN_DBG_NOFP16 0x0040
+#define PAN_DBG_BIFROST 0x0080
+#define PAN_DBG_GL3 0x0100
+
+#endif /* PAN_UTIL_H */
])
inc_panfrost = include_directories([
- '.', 'include', 'shared', 'midgard', 'bifrost', 'encoder'
+ '.', 'include', 'shared', 'midgard', 'bifrost', 'lib'
])
subdir('shared')
subdir('midgard')
subdir('bifrost')
subdir('pandecode')
-subdir('encoder')
+subdir('lib')
files_bifrost = files(
'bifrost/cmdline.c',
libglsl_standalone,
libpanfrost_bifrost,
libpanfrost_decode,
- libpanfrost_encoder,
+ libpanfrost_lib,
libpanfrost_midgard, # references disassemble_midgard...
],
build_by_default : with_tools.contains('panfrost')