#define MALI_CULL_FACE_FRONT (1 << 6)
#define MALI_CULL_FACE_BACK (1 << 7)
-/* Flags apply to unknown2_3? */
-
-#define MALI_HAS_MSAA (1 << 0)
-
-/* Execute fragment shader per-sample if set (e.g. to implement gl_SampleID
- * reads) */
-#define MALI_PER_SAMPLE (1 << 2)
-#define MALI_CAN_DISCARD (1 << 5)
-
-/* Applies on SFBD systems, specifying that programmable blending is in use */
-#define MALI_HAS_BLEND_SHADER (1 << 6)
-
-/* func is mali_func */
-#define MALI_DEPTH_FUNC(func) (func << 8)
-#define MALI_GET_DEPTH_FUNC(flags) ((flags >> 8) & 0x7)
-#define MALI_DEPTH_FUNC_MASK MALI_DEPTH_FUNC(0x7)
-
-#define MALI_DEPTH_WRITEMASK (1 << 11)
-
-#define MALI_DEPTH_CLIP_NEAR (1 << 12)
-#define MALI_DEPTH_CLIP_FAR (1 << 13)
-
-/* Next flags to unknown2_4 */
-#define MALI_STENCIL_TEST (1 << 0)
-
-#define MALI_ALPHA_TO_COVERAGE (1 << 1)
-
-#define MALI_NO_DITHER (1 << 9)
-#define MALI_DEPTH_RANGE_A (1 << 12)
-#define MALI_DEPTH_RANGE_B (1 << 13)
-#define MALI_NO_MSAA (1 << 14)
-
-#define MALI_MASK_R (1 << 0)
-#define MALI_MASK_G (1 << 1)
-#define MALI_MASK_B (1 << 2)
-#define MALI_MASK_A (1 << 3)
-
enum mali_nondominant_mode {
MALI_BLEND_NON_MIRROR = 0,
MALI_BLEND_NON_ZERO = 1
#define MALI_CHANNEL_FLOAT 7
#define MALI_EXTRACT_BITS(fmt) (fmt & 0x7)
-/* Applies to midgard1.flags_lo */
-
-/* Should be set when the fragment shader updates the depth value. */
-#define MALI_WRITES_Z (1 << 4)
-
-/* Should the hardware perform early-Z testing? Set if the shader does not use
- * discard, alpha-to-coverage, shader depth writes, and if the shader has no
- * side effects (writes to global memory or images) unless early-z testing is
- * forced in the shader.
- */
-
-#define MALI_EARLY_Z (1 << 6)
-
-/* Should the hardware calculate derivatives (via helper invocations)? Set in a
- * fragment shader that uses texturing or derivative functions */
-
-#define MALI_HELPER_INVOCATIONS (1 << 7)
-
-/* Flags denoting the fragment shader's use of tilebuffer readback. If the
- * shader might read any part of the tilebuffer, set MALI_READS_TILEBUFFER. If
- * it might read depth/stencil in particular, also set MALI_READS_ZS */
-
-#define MALI_READS_ZS (1 << 8)
-
-/* The shader might write to global memory (via OpenCL, SSBOs, or images).
- * Reading is okay, as are ordinary writes to the tilebuffer/varyings. Setting
- * incurs a performance penalty. On a fragment shader, this bit implies there
- * are side effects, hence it interacts with early-z. */
-#define MALI_WRITES_GLOBAL (1 << 9)
-
-#define MALI_READS_TILEBUFFER (1 << 10)
-
-/* Applies to midgard1.flags_hi */
-
-/* Should be set when the fragment shader updates the stencil value. */
-#define MALI_WRITES_S (1 << 2)
-
-/* Mode to suppress generation of Infinity and NaN values by clamping inf
- * (-inf) to MAX_FLOAT (-MIN_FLOAT) and flushing NaN to 0.0
- *
- * Compare suppress_inf/suppress_nan flags on the Bifrost clause header for the
- * same functionality.
- *
- * This is not conformant on GLES3 or OpenCL, but is optional on GLES2, where
- * it works around app bugs (e.g. in glmark2-es2 -bterrain with FP16).
- */
-#define MALI_SUPPRESS_INF_NAN (1 << 3)
-
-/* Flags for bifrost1.unk1 */
-
-/* Shader uses less than 32 registers, partitioned as [R0, R15] U [R48, R63],
- * allowing for full thread count. If clear, the full [R0, R63] register set is
- * available at half thread count */
-#define MALI_BIFROST_FULL_THREAD (1 << 9)
-
-/* Enable early-z testing (presumably). This flag may not be set if the shader:
- *
- * - Uses blending
- * - Uses discard
- * - Writes gl_FragDepth
- *
- * This differs from Midgard which sets the MALI_EARLY_Z flag even with
- * blending, although I've begun to suspect that flag does not in fact enable
- * EARLY_Z alone. */
-#define MALI_BIFROST_EARLY_Z (1 << 15)
-
-/* First clause type is ATEST */
-#define MALI_BIFROST_FIRST_ATEST (1 << 26)
-
/* The raw Midgard blend payload can either be an equation or a shader
* address, depending on the context */
};
} __attribute__((packed));
-/* Descriptor for the shader. Following this is at least one, up to four blend
- * descriptors for each active render target */
-
-struct mali_shader_meta {
- mali_ptr shader;
- u16 sampler_count;
- u16 texture_count;
- u16 attribute_count;
- u16 varying_count;
-
- union {
- struct {
- u32 uniform_buffer_count : 4;
- u32 unk1 : 28; // = 0x800000 for vertex, 0x958020 for tiler
- } bifrost1;
- struct {
- unsigned uniform_buffer_count : 4;
- unsigned flags_lo : 12;
-
- /* vec4 units */
- unsigned work_count : 5;
- unsigned uniform_count : 5;
- unsigned flags_hi : 6;
- } midgard1;
- };
-
- /* Same as glPolygoOffset() arguments */
- float depth_units;
- float depth_factor;
-
- u32 unknown2_2;
-
- /* Generated from SAMPLE_COVERAGE_VALUE and SAMPLE_COVERAGE_INVERT. See
- * 13.8.3 ("Multisample Fragment Operations") in the OpenGL ES 3.2
- * specification. Only matters when multisampling is enabled. */
- u16 coverage_mask;
-
- u16 unknown2_3;
-
- u8 stencil_mask_front;
- u8 stencil_mask_back;
- u16 unknown2_4;
-
- struct mali_stencil_packed stencil_front;
- struct mali_stencil_packed stencil_back;
-
- union {
- struct {
- u32 unk3 : 7;
- /* On Bifrost, some system values are preloaded in
- * registers R55-R62 by the thread dispatcher prior to
- * the start of shader execution. This is a bitfield
- * with one entry for each register saying which
- * registers need to be preloaded. Right now, the known
- * values are:
- *
- * Vertex/compute:
- * - R55 : gl_LocalInvocationID.xy
- * - R56 : gl_LocalInvocationID.z + unknown in high 16 bits
- * - R57 : gl_WorkGroupID.x
- * - R58 : gl_WorkGroupID.y
- * - R59 : gl_WorkGroupID.z
- * - R60 : gl_GlobalInvocationID.x
- * - R61 : gl_GlobalInvocationID.y/gl_VertexID (without base)
- * - R62 : gl_GlobalInvocationID.z/gl_InstanceID (without base)
- *
- * Fragment:
- * - R55 : unknown, never seen (but the bit for this is
- * always set?)
- * - R56 : unknown (bit always unset)
- * - R57 : gl_PrimitiveID
- * - R58 : gl_FrontFacing in low bit, potentially other stuff
- * - R59 : u16 fragment coordinates (used to compute
- * gl_FragCoord.xy, together with sample positions)
- * - R60 : gl_SampleMask (used in epilog, so pretty
- * much always used, but the bit is always 0 -- is
- * this just always pushed?)
- * - R61 : gl_SampleMaskIn and gl_SampleID, used by
- * varying interpolation.
- * - R62 : unknown (bit always unset).
- *
- * Later GPUs (starting with Mali-G52?) support
- * preloading float varyings into r0-r7. This is
- * indicated by setting 0x40. There is no distinction
- * here between 1 varying and 2.
- */
- u32 preload_regs : 8;
- /* In units of 8 bytes or 64 bits, since the
- * uniform/const port loads 64 bits at a time.
- */
- u32 uniform_count : 7;
- u32 unk4 : 10; // = 2
- } bifrost2;
- struct {
- u32 unknown2_7;
- } midgard2;
- };
-
- u32 padding;
-
- /* Blending information for the older non-MRT Midgard HW. Check for
- * MALI_HAS_BLEND_SHADER to decide how to interpret.
- */
-
- union midgard_blend blend;
-} __attribute__((packed));
-
-/* This only concerns hardware jobs */
-
/* Possible values for job_descriptor_size */
#define MALI_JOB_32 0
* fused payloads.
*/
-/* Applies to unknown_draw */
-
-#define MALI_DRAW_INDEXED_UINT8 (0x10)
-#define MALI_DRAW_INDEXED_UINT16 (0x20)
-#define MALI_DRAW_INDEXED_UINT32 (0x30)
-#define MALI_DRAW_INDEXED_SIZE (0x30)
-#define MALI_DRAW_INDEXED_SHIFT (4)
-
-#define MALI_DRAW_VARYING_SIZE (0x100)
-
-/* Set to use first vertex as the provoking vertex for flatshading. Clear to
- * use the last vertex. This is the default in DX and VK, but not in GL. */
-
-#define MALI_DRAW_FLATSHADE_FIRST (0x800)
-
-#define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000)
-
struct mali_vertex_tiler_prefix {
- /* This is a dynamic bitfield containing the following things in this order:
- *
- * - gl_WorkGroupSize.x
- * - gl_WorkGroupSize.y
- * - gl_WorkGroupSize.z
- * - gl_NumWorkGroups.x
- * - gl_NumWorkGroups.y
- * - gl_NumWorkGroups.z
- *
- * The number of bits allocated for each number is based on the *_shift
- * fields below. For example, workgroups_y_shift gives the bit that
- * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit
- * that gl_NumWorkGroups.z starts at (and therefore one after the bit
- * that gl_NumWorkGroups.y ends at). The actual value for each gl_*
- * value is one more than the stored value, since if any of the values
- * are zero, then there would be no invocations (and hence no job). If
- * there were 0 bits allocated to a given field, then it must be zero,
- * and hence the real value is one.
- *
- * Vertex jobs reuse the same job dispatch mechanism as compute jobs,
- * effectively doing glDispatchCompute(1, vertex_count, instance_count)
- * where vertex count is the number of vertices.
- */
- u32 invocation_count;
-
- /* Bitfield for shifts:
- *
- * size_y_shift : 5
- * size_z_shift : 5
- * workgroups_x_shift : 6
- * workgroups_y_shift : 6
- * workgroups_z_shift : 6
- * workgroups_x_shift_2 : 4
- */
- u32 invocation_shifts;
-
- u32 draw_mode : 4;
- u32 unknown_draw : 22;
-
- /* This is the the same as workgroups_x_shift_2 in compute shaders, but
- * always 5 for vertex jobs and 6 for tiler jobs. I suspect this has
- * something to do with how many quads get put in the same execution
- * engine, which is a balance (you don't want to starve the engine, but
- * you also want to distribute work evenly).
- */
- u32 workgroups_x_shift_3 : 6;
-
-
- /* Negative of min_index. This is used to compute
- * the unbiased index in tiler/fragment shader runs.
- *
- * The hardware adds offset_bias_correction in each run,
- * so that absent an index bias, the first vertex processed is
- * genuinely the first vertex (0). But with an index bias,
- * the first vertex process is numbered the same as the bias.
- *
- * To represent this more conviniently:
- * unbiased_index = lower_bound_index +
- * index_bias +
- * offset_bias_correction
- *
- * This is done since the hardware doesn't accept a index_bias
- * and this allows it to recover the unbiased index.
- */
- int32_t offset_bias_correction;
- u32 zero1;
-
- /* Like many other strictly nonzero quantities, index_count is
- * subtracted by one. For an indexed cube, this is equal to 35 = 6
- * faces * 2 triangles/per face * 3 vertices/per triangle - 1. That is,
- * for an indexed draw, index_count is the number of actual vertices
- * rendered whereas invocation_count is the number of unique vertices
- * rendered (the number of times the vertex shader must be invoked).
- * For non-indexed draws, this is just equal to invocation_count. */
-
- u32 index_count;
-
- /* No hidden structure; literally just a pointer to an array of uint
- * indices (width depends on flags). Thanks, guys, for not making my
- * life insane for once! NULL for non-indexed draws. */
-
- u64 indices;
+ struct mali_invocation_packed invocation;
+ struct mali_primitive_packed primitive;
} __attribute__((packed));
/* Point size / line width can either be specified as a 32-bit float (for
u64 zeros[20];
} __attribute__((packed));
-struct bifrost_tiler_only {
- /* 0x20 */
- union midgard_primitive_size primitive_size;
-
- mali_ptr tiler_meta;
-
- u64 zero1, zero2, zero3, zero4, zero5, zero6;
-} __attribute__((packed));
-
struct mali_vertex_tiler_postfix {
u16 gl_enables; // 0x6 on Midgard, 0x2 on Bifrost
struct bifrost_payload_tiler {
struct mali_vertex_tiler_prefix prefix;
- struct bifrost_tiler_only tiler;
+ union midgard_primitive_size primitive_size;
+ mali_ptr tiler_meta;
+ u64 zero1, zero2, zero3, zero4, zero5, zero6;
struct mali_vertex_tiler_postfix postfix;
} __attribute__((packed));
-struct bifrost_payload_fused {
- struct mali_vertex_tiler_prefix prefix;
- struct bifrost_tiler_only tiler;
- struct mali_vertex_tiler_postfix tiler_postfix;
- u64 padding; /* zero */
- struct mali_vertex_tiler_postfix vertex_postfix;
-} __attribute__((packed));
-
/* Purposeful off-by-one in width, height fields. For example, a (64, 64)
* texture is stored as (63, 63) in these fields. This adjusts for that.
* There's an identical pattern in the framebuffer descriptor. Even vertex