From: Alyssa Rosenzweig
Date: Thu, 12 Dec 2019 16:28:08 +0000 (-0500)
Subject: panfrost: Pack invocation_shifts manually instead of a bit field
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=6378797a6d1ce3652d0394beeb1af33af7426ed2;p=mesa.git

panfrost: Pack invocation_shifts manually instead of a bit field

gcc generates exceptionally bad code for
panfrost_pack_work_groups_fused otherwise ... although that routine is
somehow still hot ...

Signed-off-by: Alyssa Rosenzweig
Part-of:
---

diff --git a/src/panfrost/encoder/pan_invocation.c b/src/panfrost/encoder/pan_invocation.c
index 8fb1669c9ed..ecde3da4648 100644
--- a/src/panfrost/encoder/pan_invocation.c
+++ b/src/panfrost/encoder/pan_invocation.c
@@ -91,33 +91,38 @@ panfrost_pack_work_groups_compute(
                 shifts[i + 1] = shifts[i] + bit_count;
         }
 
-        /* We're packed, so upload everything */
-        out->invocation_count = packed;
-        out->size_y_shift = shifts[1];
-        out->size_z_shift = shifts[2];
-        out->workgroups_x_shift = shifts[3];
-        out->workgroups_y_shift = shifts[4];
-        out->workgroups_z_shift = shifts[5];
-
         /* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift
          * = 32. This doesn't appear to matter to the hardware, but it's good
          * to be bit-identical. */
 
         if (quirk_graphics && (num_z <= 1))
-                out->workgroups_z_shift = 32;
+                shifts[5] = 32;
 
         /* Quirk: for graphics, workgroups_x_shift_2 must be at least 2,
          * whereas for OpenCL it is simply equal to workgroups_x_shift. For GL
          * compute, it seems it might *always* be 2, but this is suspicious and
          * needs further investigation. (I'm probably just using GL wrong). */
 
+        unsigned shift_2 = shifts[3];
+
         if (quirk_graphics)
-                out->workgroups_x_shift_2 = MAX2(out->workgroups_x_shift, 2);
-        else
-                out->workgroups_x_shift_2 = out->workgroups_x_shift;
+                shift_2 = MAX2(shift_2, 2);
+
+        /* Pack them in */
+        uint32_t packed_shifts =
+                (shifts[1] << 0) |
+                (shifts[2] << 5) |
+                (shifts[3] << 10) |
+                (shifts[4] << 16) |
+                (shifts[5] << 22) |
+                (shift_2 << 28);
+
+        /* Upload the packed bitfields */
+        out->invocation_count = packed;
+        out->invocation_shifts = packed_shifts;
 
         /* TODO: Compute workgroups_x_shift_3 */
-        out->workgroups_x_shift_3 = out->workgroups_x_shift_2;
+        out->workgroups_x_shift_3 = shift_2;
 }
 
 /* Packs vertex/tiler descriptors simultaneously */
@@ -136,12 +141,7 @@ panfrost_pack_work_groups_fused(
 
         /* Copy results over */
         tiler->invocation_count = vertex->invocation_count;
-        tiler->size_y_shift = vertex->size_y_shift;
-        tiler->size_z_shift = vertex->size_z_shift;
-        tiler->workgroups_x_shift = vertex->workgroups_x_shift;
-        tiler->workgroups_x_shift_2 = vertex->workgroups_x_shift_2;
-        tiler->workgroups_y_shift = vertex->workgroups_y_shift;
-        tiler->workgroups_z_shift = vertex->workgroups_z_shift;
+        tiler->invocation_shifts = vertex->invocation_shifts;
 
         /* Set special fields for each */
         vertex->workgroups_x_shift_3 = 5;
diff --git a/src/panfrost/include/panfrost-job.h b/src/panfrost/include/panfrost-job.h
index 15c22d8fa09..7bf23b09836 100644
--- a/src/panfrost/include/panfrost-job.h
+++ b/src/panfrost/include/panfrost-job.h
@@ -937,13 +937,16 @@ struct mali_vertex_tiler_prefix {
          */
         u32 invocation_count;
 
-        u32 size_y_shift : 5;
-        u32 size_z_shift : 5;
-        u32 workgroups_x_shift : 6;
-        u32 workgroups_y_shift : 6;
-        u32 workgroups_z_shift : 6;
-        /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */
-        u32 workgroups_x_shift_2 : 4;
+        /* Bitfield for shifts:
+         *
+         * size_y_shift : 5
+         * size_z_shift : 5
+         * workgroups_x_shift : 6
+         * workgroups_y_shift : 6
+         * workgroups_z_shift : 6
+         * workgroups_x_shift_2 : 4
+         */
+        u32 invocation_shifts;
 
         u32 draw_mode : 4;
         u32 unknown_draw : 22;
diff --git a/src/panfrost/pandecode/decode.c b/src/panfrost/pandecode/decode.c
index b24ce3f65d1..0b25e354735 100644
--- a/src/panfrost/pandecode/decode.c
+++ b/src/panfrost/pandecode/decode.c
@@ -1674,13 +1674,20 @@ pandecode_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no, bo
          * invocation_count for an explanation.
          */
 
-        unsigned size_x = bits(p->invocation_count, 0, p->size_y_shift) + 1;
-        unsigned size_y = bits(p->invocation_count, p->size_y_shift, p->size_z_shift) + 1;
-        unsigned size_z = bits(p->invocation_count, p->size_z_shift, p->workgroups_x_shift) + 1;
+        unsigned size_y_shift = bits(p->invocation_shifts, 0, 5);
+        unsigned size_z_shift = bits(p->invocation_shifts, 5, 10);
+        unsigned workgroups_x_shift = bits(p->invocation_shifts, 10, 16);
+        unsigned workgroups_y_shift = bits(p->invocation_shifts, 16, 22);
+        unsigned workgroups_z_shift = bits(p->invocation_shifts, 22, 28);
+        unsigned workgroups_x_shift_2 = bits(p->invocation_shifts, 28, 32);
 
-        unsigned groups_x = bits(p->invocation_count, p->workgroups_x_shift, p->workgroups_y_shift) + 1;
-        unsigned groups_y = bits(p->invocation_count, p->workgroups_y_shift, p->workgroups_z_shift) + 1;
-        unsigned groups_z = bits(p->invocation_count, p->workgroups_z_shift, 32) + 1;
+        unsigned size_x = bits(p->invocation_count, 0, size_y_shift) + 1;
+        unsigned size_y = bits(p->invocation_count, size_y_shift, size_z_shift) + 1;
+        unsigned size_z = bits(p->invocation_count, size_z_shift, workgroups_x_shift) + 1;
+
+        unsigned groups_x = bits(p->invocation_count, workgroups_x_shift, workgroups_y_shift) + 1;
+        unsigned groups_y = bits(p->invocation_count, workgroups_y_shift, workgroups_z_shift) + 1;
+        unsigned groups_z = bits(p->invocation_count, workgroups_z_shift, 32) + 1;
 
         /* Even though we have this decoded, we want to ensure that the
          * representation is "unique" so we don't lose anything by printing only
@@ -1695,31 +1702,21 @@ pandecode_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no, bo
 
         bool canonical =
                 (p->invocation_count == ref.invocation_count) &&
-                (p->size_y_shift == ref.size_y_shift) &&
-                (p->size_z_shift == ref.size_z_shift) &&
-                (p->workgroups_x_shift == ref.workgroups_x_shift) &&
-                (p->workgroups_y_shift == ref.workgroups_y_shift) &&
-                (p->workgroups_z_shift == ref.workgroups_z_shift) &&
-                (p->workgroups_x_shift_2 == ref.workgroups_x_shift_2);
+                (p->invocation_shifts == ref.invocation_shifts);
 
         if (!canonical) {
                 pandecode_msg("XXX: non-canonical workgroups packing\n");
-                pandecode_msg("expected: %X, %d, %d, %d, %d, %d, %d\n",
+                pandecode_msg("expected: %X, %X",
                                 ref.invocation_count,
-                                ref.size_y_shift,
-                                ref.size_z_shift,
-                                ref.workgroups_x_shift,
-                                ref.workgroups_y_shift,
-                                ref.workgroups_z_shift,
-                                ref.workgroups_x_shift_2);
+                                ref.invocation_shifts);
 
                 pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count);
-                pandecode_prop("size_y_shift = %d", p->size_y_shift);
-                pandecode_prop("size_z_shift = %d", p->size_z_shift);
-                pandecode_prop("workgroups_x_shift = %d", p->workgroups_x_shift);
-                pandecode_prop("workgroups_y_shift = %d", p->workgroups_y_shift);
-                pandecode_prop("workgroups_z_shift = %d", p->workgroups_z_shift);
-                pandecode_prop("workgroups_x_shift_2 = %d", p->workgroups_x_shift_2);
+                pandecode_prop("size_y_shift = %d", size_y_shift);
+                pandecode_prop("size_z_shift = %d", size_z_shift);
+                pandecode_prop("workgroups_x_shift = %d", workgroups_x_shift);
+                pandecode_prop("workgroups_y_shift = %d", workgroups_y_shift);
+                pandecode_prop("workgroups_z_shift = %d", workgroups_z_shift);
+                pandecode_prop("workgroups_x_shift_2 = %d", workgroups_x_shift_2);
         }
 
         /* Regardless, print the decode */