+ if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
+ RADV_CMD_FLAG_FLUSH_AND_INV_DB |
+ RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
+ /* If we have to wait for idle, set all states first, so that
+ * all SET packets are processed in parallel with previous draw
+ * calls. Then upload descriptors, set shader pointers, and
+ * dispatch, and prefetch at the end. This ensures that the
+ * time the CUs are idle is very short. (there are only SET_SH
+ * packets between the wait and the draw)
+ */
+ radv_emit_compute_pipeline(cmd_buffer);
+ si_emit_cache_flush(cmd_buffer);
+ /* <-- CUs are idle here --> */
+
+ radv_upload_compute_shader_descriptors(cmd_buffer);
+
+ radv_emit_dispatch_packets(cmd_buffer, info);
+ /* <-- CUs are busy here --> */
+
+ /* Start prefetches after the dispatch has been started. Both
+ * will run in parallel, but starting the dispatch first is
+ * more important.
+ */
+ if (pipeline_is_dirty) {
+ radv_emit_shader_prefetch(cmd_buffer,
+ pipeline->shaders[MESA_SHADER_COMPUTE]);
+ }
+ } else {
+ /* If we don't wait for idle, start prefetches first, then set
+ * states, and dispatch at the end.
+ */
+ si_emit_cache_flush(cmd_buffer);
+
+ if (pipeline_is_dirty) {
+ radv_emit_shader_prefetch(cmd_buffer,
+ pipeline->shaders[MESA_SHADER_COMPUTE]);
+ }
+
+ radv_upload_compute_shader_descriptors(cmd_buffer);
+
+ radv_emit_compute_pipeline(cmd_buffer);
+ radv_emit_dispatch_packets(cmd_buffer, info);
+ }