panfrost: Pack invocation_shifts manually instead of a bit field
author Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Thu, 12 Dec 2019 16:28:08 +0000 (11:28 -0500)
committer Marge Bot <eric+marge@anholt.net>
Mon, 16 Dec 2019 19:48:28 +0000 (19:48 +0000)
gcc generates exceptionally bad code for panfrost_pack_work_groups_fused
when the shifts are written through a bit field, so pack them manually
into a single word instead. (That routine is somehow still hot, though.)

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3067>

src/panfrost/encoder/pan_invocation.c
src/panfrost/include/panfrost-job.h
src/panfrost/pandecode/decode.c

index 8fb1669c9ed18213c63c51557a7c0f9f9cd89009..ecde3da4648259d5426c8c57008117ac7a7b8967 100644 (file)
@@ -91,33 +91,38 @@ panfrost_pack_work_groups_compute(
                 shifts[i + 1] = shifts[i] + bit_count;
         }
 
-        /* We're packed, so upload everything */
-        out->invocation_count = packed;
-        out->size_y_shift = shifts[1];
-        out->size_z_shift = shifts[2];
-        out->workgroups_x_shift = shifts[3];
-        out->workgroups_y_shift = shifts[4];
-        out->workgroups_z_shift = shifts[5];
-
         /* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift
          * = 32. This doesn't appear to matter to the hardware, but it's good
          * to be bit-identical. */
 
         if (quirk_graphics && (num_z <= 1))
-                out->workgroups_z_shift = 32;
+                shifts[5] = 32;
 
         /* Quirk: for graphics, workgroups_x_shift_2 must be at least 2,
          * whereas for OpenCL it is simply equal to workgroups_x_shift. For GL
          * compute, it seems it might *always* be 2, but this is suspicious and
          * needs further investigation. (I'm probably just using GL wrong). */
 
+        unsigned shift_2 = shifts[3];
+
         if (quirk_graphics)
-                out->workgroups_x_shift_2 = MAX2(out->workgroups_x_shift, 2);
-        else
-                out->workgroups_x_shift_2 = out->workgroups_x_shift;
+                shift_2 = MAX2(shift_2, 2);
+
+        /* Pack them in */
+        uint32_t packed_shifts =
+                (shifts[1] << 0) |
+                (shifts[2] << 5) |
+                (shifts[3] << 10) |
+                (shifts[4] << 16) |
+                (shifts[5] << 22) |
+                (shift_2 << 28);
+
+        /* Upload the packed bitfields */
+        out->invocation_count = packed;
+        out->invocation_shifts = packed_shifts;
 
         /* TODO: Compute workgroups_x_shift_3 */
-        out->workgroups_x_shift_3 = out->workgroups_x_shift_2;
+        out->workgroups_x_shift_3 = shift_2;
 }
 
 /* Packs vertex/tiler descriptors simultaneously */
@@ -136,12 +141,7 @@ panfrost_pack_work_groups_fused(
 
         /* Copy results over */
         tiler->invocation_count = vertex->invocation_count;
-        tiler->size_y_shift = vertex->size_y_shift;
-        tiler->size_z_shift = vertex->size_z_shift;
-        tiler->workgroups_x_shift = vertex->workgroups_x_shift;
-        tiler->workgroups_x_shift_2 = vertex->workgroups_x_shift_2;
-        tiler->workgroups_y_shift = vertex->workgroups_y_shift;
-        tiler->workgroups_z_shift = vertex->workgroups_z_shift;
+        tiler->invocation_shifts = vertex->invocation_shifts;
 
         /* Set special fields for each */
         vertex->workgroups_x_shift_3 = 5;
index 15c22d8fa092cc237c77dc162741cbbaaf8acbd9..7bf23b09836f750457e34fc74845598d39a844e3 100644 (file)
@@ -937,13 +937,16 @@ struct mali_vertex_tiler_prefix {
          */
         u32 invocation_count;
 
-        u32 size_y_shift : 5;
-        u32 size_z_shift : 5;
-        u32 workgroups_x_shift : 6;
-        u32 workgroups_y_shift : 6;
-        u32 workgroups_z_shift : 6;
-        /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */
-        u32 workgroups_x_shift_2 : 4;
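+        /* Packed shifts, listed from the least significant bit upwards:
+         *
+         * size_y_shift : 5          (bits 0-4)
+         * size_z_shift : 5          (bits 5-9)
+         * workgroups_x_shift : 6    (bits 10-15)
+         * workgroups_y_shift : 6    (bits 16-21)
+         * workgroups_z_shift : 6    (bits 22-27)
+         * workgroups_x_shift_2 : 4  (bits 28-31)
+         */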
+        u32 invocation_shifts;
 
         u32 draw_mode : 4;
         u32 unknown_draw : 22;
index b24ce3f65d1c48c69fe24fb1608459e17906f7b9..0b25e35473558f9c34848f664e87134ac96ced82 100644 (file)
@@ -1674,13 +1674,20 @@ pandecode_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no, bo
          * invocation_count for an explanation.
          */
 
-        unsigned size_x = bits(p->invocation_count, 0, p->size_y_shift) + 1;
-        unsigned size_y = bits(p->invocation_count, p->size_y_shift, p->size_z_shift) + 1;
-        unsigned size_z = bits(p->invocation_count, p->size_z_shift, p->workgroups_x_shift) + 1;
+        unsigned size_y_shift = bits(p->invocation_shifts, 0, 5);
+        unsigned size_z_shift = bits(p->invocation_shifts, 5, 10);
+        unsigned workgroups_x_shift = bits(p->invocation_shifts, 10, 16);
+        unsigned workgroups_y_shift = bits(p->invocation_shifts, 16, 22);
+        unsigned workgroups_z_shift = bits(p->invocation_shifts, 22, 28);
+        unsigned workgroups_x_shift_2 = bits(p->invocation_shifts, 28, 32);
 
-        unsigned groups_x = bits(p->invocation_count, p->workgroups_x_shift, p->workgroups_y_shift) + 1;
-        unsigned groups_y = bits(p->invocation_count, p->workgroups_y_shift, p->workgroups_z_shift) + 1;
-        unsigned groups_z = bits(p->invocation_count, p->workgroups_z_shift, 32) + 1;
+        unsigned size_x = bits(p->invocation_count, 0, size_y_shift) + 1;
+        unsigned size_y = bits(p->invocation_count, size_y_shift, size_z_shift) + 1;
+        unsigned size_z = bits(p->invocation_count, size_z_shift, workgroups_x_shift) + 1;
+
+        unsigned groups_x = bits(p->invocation_count, workgroups_x_shift, workgroups_y_shift) + 1;
+        unsigned groups_y = bits(p->invocation_count, workgroups_y_shift, workgroups_z_shift) + 1;
+        unsigned groups_z = bits(p->invocation_count, workgroups_z_shift, 32) + 1;
 
         /* Even though we have this decoded, we want to ensure that the
          * representation is "unique" so we don't lose anything by printing only
@@ -1695,31 +1702,21 @@ pandecode_vertex_tiler_prefix(struct mali_vertex_tiler_prefix *p, int job_no, bo
 
         bool canonical =
                 (p->invocation_count == ref.invocation_count) &&
-                (p->size_y_shift == ref.size_y_shift) &&
-                (p->size_z_shift == ref.size_z_shift) &&
-                (p->workgroups_x_shift == ref.workgroups_x_shift) &&
-                (p->workgroups_y_shift == ref.workgroups_y_shift) &&
-                (p->workgroups_z_shift == ref.workgroups_z_shift) &&
-                (p->workgroups_x_shift_2 == ref.workgroups_x_shift_2);
+                (p->invocation_shifts == ref.invocation_shifts);
 
         if (!canonical) {
                 pandecode_msg("XXX: non-canonical workgroups packing\n");
-                pandecode_msg("expected: %X, %d, %d, %d, %d, %d, %d\n",
+                pandecode_msg("expected: %X, %X",
                                 ref.invocation_count,
-                                ref.size_y_shift,
-                                ref.size_z_shift,
-                                ref.workgroups_x_shift,
-                                ref.workgroups_y_shift,
-                                ref.workgroups_z_shift,
-                                ref.workgroups_x_shift_2);
+                                ref.invocation_shifts);
 
                 pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count);
-                pandecode_prop("size_y_shift = %d", p->size_y_shift);
-                pandecode_prop("size_z_shift = %d", p->size_z_shift);
-                pandecode_prop("workgroups_x_shift = %d", p->workgroups_x_shift);
-                pandecode_prop("workgroups_y_shift = %d", p->workgroups_y_shift);
-                pandecode_prop("workgroups_z_shift = %d", p->workgroups_z_shift);
-                pandecode_prop("workgroups_x_shift_2 = %d", p->workgroups_x_shift_2);
+                pandecode_prop("size_y_shift = %d", size_y_shift);
+                pandecode_prop("size_z_shift = %d", size_z_shift);
+                pandecode_prop("workgroups_x_shift = %d", workgroups_x_shift);
+                pandecode_prop("workgroups_y_shift = %d", workgroups_y_shift);
+                pandecode_prop("workgroups_z_shift = %d", workgroups_z_shift);
+                pandecode_prop("workgroups_x_shift_2 = %d", workgroups_x_shift_2);
         }
 
         /* Regardless, print the decode */