batch->maxx = batch->maxy = 0;
batch->transient_offset = 0;
- util_dynarray_init(&batch->headers, batch);
- util_dynarray_init(&batch->gpu_headers, batch);
- util_dynarray_init(&batch->dependencies, batch);
batch->out_sync = panfrost_create_batch_fence(batch);
util_copy_framebuffer_state(&batch->key, key);
panfrost_batch_fence_unreference(*dep);
}
- util_dynarray_fini(&batch->headers);
- util_dynarray_fini(&batch->gpu_headers);
-
/* The out_sync fence lifetime is different from the batch one
 * since other batches might want to wait on a fence of an already
* submitted/signaled batch. All we need to do here is make sure the
* Note that it's perfectly fine to re-use a batch with an
* existing clear, we'll just update it with the new clear request.
*/
- if (!batch->last_job.gpu)
+ if (!batch->first_job)
return batch;
/* Otherwise, we need to freeze the existing one and instantiate a new
/* No draw calls, and no clear on the depth/stencil bufs.
* Drawing the wallpaper would be useless.
*/
- if (!batch->last_tiler.gpu &&
+ if (!batch->tiler_dep &&
!(batch->clear & PIPE_CLEAR_DEPTHSTENCIL))
return;
static int
panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
mali_ptr first_job_desc,
- uint32_t reqs,
- struct mali_job_descriptor_header *header)
+ uint32_t reqs)
{
struct panfrost_context *ctx = batch->ctx;
struct pipe_context *gallium = (struct pipe_context *) ctx;
bool is_fragment_shader;
int ret;
- is_fragment_shader = (reqs & PANFROST_JD_REQ_FS) && batch->first_job.gpu;
+ is_fragment_shader = (reqs & PANFROST_JD_REQ_FS) && batch->first_job;
if (is_fragment_shader)
submit.in_sync_count = 1;
else
static int
panfrost_batch_submit_jobs(struct panfrost_batch *batch)
{
- bool has_draws = batch->first_job.gpu;
- struct mali_job_descriptor_header *header;
+ bool has_draws = batch->first_job;
int ret = 0;
if (has_draws) {
- header = (struct mali_job_descriptor_header *)batch->first_job.cpu;
- ret = panfrost_batch_submit_ioctl(batch, batch->first_job.gpu, 0, header);
+ ret = panfrost_batch_submit_ioctl(batch, batch->first_job, 0);
assert(!ret);
}
- if (batch->first_tiler.gpu || batch->clear) {
- mali_ptr fragjob = panfrost_fragment_job(batch, has_draws, &header);
-
- ret = panfrost_batch_submit_ioctl(batch, fragjob, PANFROST_JD_REQ_FS, header);
+ if (batch->tiler_dep || batch->clear) {
+ mali_ptr fragjob = panfrost_fragment_job(batch, has_draws);
+ ret = panfrost_batch_submit_ioctl(batch, fragjob, PANFROST_JD_REQ_FS);
assert(!ret);
}
int ret;
/* Nothing to do! */
- if (!batch->last_job.gpu && !batch->clear) {
+ if (!batch->first_job && !batch->clear) {
/* Mark the fence as signaled so the fence logic does not try
* to wait on it.
*/
/* Now that all draws are in, we can finally prepare the
* FBD for the batch */
- if (batch->framebuffer.gpu && batch->first_job.gpu) {
+ if (batch->framebuffer.gpu && batch->first_job) {
struct panfrost_context *ctx = batch->ctx;
struct pipe_context *gallium = (struct pipe_context *) ctx;
struct panfrost_screen *screen = pan_screen(gallium->screen);
panfrost_attach_mfbd(batch, ~0);
}
- panfrost_scoreboard_link_batch(batch);
+ panfrost_scoreboard_initialize_tiler(batch);
ret = panfrost_batch_submit_jobs(batch);
unsigned minx, miny;
unsigned maxx, maxy;
- /* CPU pointers to the job descriptor headers. next_job is only
- * set at submit time (since only then are all the dependencies
- * known). The upshot is that this is append-only.
- *
- * These arrays contain the headers for the "primary batch", our jargon
- * referring to the part of the panfrost_job that actually contains
- * meaningful work. In an OpenGL ES setting, that means the
- * WRITE_VALUE/VERTEX/TILER jobs. Excluded is specifically the FRAGMENT
- * job, which is sent on as a secondary batch containing only a single
- * hardware job. Since there's one and only one FRAGMENT job issued per
- * panfrost_job, there is no need to do any scoreboarding / management;
- * it's easy enough to open-code it and it's not like we can get any
- * better anyway. */
- struct util_dynarray headers;
-
- /* (And the GPU versions; TODO maybe combine) */
- struct util_dynarray gpu_headers;
-
- /* The last job in the primary batch */
- struct panfrost_transfer last_job;
-
- /* The first/last tiler job */
- struct panfrost_transfer first_tiler;
- struct panfrost_transfer last_tiler;
-
- /* The first vertex job used as the input to a tiler job */
- struct panfrost_transfer first_vertex_for_tiler;
-
- /* The first job. Notice we've created a linked list */
- struct panfrost_transfer first_job;
+ /* The first job in the batch */
+ mali_ptr first_job;
/* The number of jobs in the primary batch, essentially */
unsigned job_index;
+ /* A CPU-side pointer to the previous job for next_job linking */
+ struct mali_job_descriptor_header *prev_job;
+
+ /* The dependency for tiler jobs (i.e. the index of the last emitted
+ * tiler job, or zero if none have been emitted) */
+ unsigned tiler_dep;
+
+ /* The job index reserved for the WRITE_VALUE job (the job itself is only
+ * created at submit time) */
+ unsigned write_value_index;
+
/* BOs referenced -- will be used for flushing logic */
struct hash_table *bos;
/* Scoreboarding */
-void
-panfrost_scoreboard_queue_compute_job(
- struct panfrost_batch *batch,
- struct panfrost_transfer job);
-
-void
-panfrost_scoreboard_queue_vertex_job(
- struct panfrost_batch *batch,
- struct panfrost_transfer vertex,
- bool requires_tiling);
-
-void
-panfrost_scoreboard_queue_tiler_job(
- struct panfrost_batch *batch,
- struct panfrost_transfer tiler);
-
-void
-panfrost_scoreboard_queue_fused_job(
- struct panfrost_batch *batch,
- struct panfrost_transfer vertex,
- struct panfrost_transfer tiler);
-void
-panfrost_scoreboard_queue_fused_job_prepend(
- struct panfrost_batch *batch,
- struct panfrost_transfer vertex,
- struct panfrost_transfer tiler);
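+/* Emits a new job of the given type with local_dep (0 for none) as its first
+ * dependency, returning the assigned job index so that later jobs can depend
+ * on this one. */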
+unsigned
+panfrost_new_job(
+ struct panfrost_batch *batch,
+ enum mali_job_type type,
+ bool barrier,
+ unsigned local_dep,
+ void *payload, size_t payload_size,
+ bool inject);
-void
-panfrost_scoreboard_link_batch(struct panfrost_batch *batch);
+void panfrost_scoreboard_initialize_tiler(struct panfrost_batch *batch);
bool
panfrost_batch_is_scanout(struct panfrost_batch *batch);
*
*/
-/* Coerce a panfrost_transfer to a header */
-
-static inline struct mali_job_descriptor_header *
-job_descriptor_header(struct panfrost_transfer t)
+/* Generates, uploads, and queues a new job. All fields are written in order
+ * except for next_job accounting (TODO: Should we be clever and defer the
+ * upload of the header here until the next job to keep the access pattern
+ * totally linear? Or is that just a micro-optimization at this point?).
+ * Returns the generated index for dep management.
+ *
+ * Inject is used to inject a job at the front of the chain, for wallpapering.
+ * If you set this without wallpapering, dragons will eat you. */
+
+unsigned
+panfrost_new_job(
+ struct panfrost_batch *batch,
+ enum mali_job_type type,
+ bool barrier,
+ unsigned local_dep,
+ void *payload, size_t payload_size,
+ bool inject)
{
- return (struct mali_job_descriptor_header *) t.cpu;
-}
+ unsigned global_dep = 0;
+
+ if (type == JOB_TYPE_TILER) {
+ /* Tiler jobs must be chained, and the first tiler job must
+ * depend on the write value job, whose index we reserve now */
+
+ if (batch->tiler_dep)
+ global_dep = batch->tiler_dep;
+ else {
+ batch->write_value_index = ++batch->job_index;
+ global_dep = batch->write_value_index;
+ }
+ }
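+
+ /* For a fresh batch (job_index starts at zero), the first tiler job thus
+ * reserves index 1 for the WRITE_VALUE job and takes index 2 itself below. */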
-static void
-panfrost_assign_index(
- struct panfrost_batch *batch,
- struct panfrost_transfer transfer)
-{
/* Assign the index */
unsigned index = ++batch->job_index;
- job_descriptor_header(transfer)->job_index = index;
-}
-/* Helper to add a dependency to a job */
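+ /* dependency_index_1 carries the caller-supplied dependency (e.g. a tiler
+ * job's paired vertex job); dependency_index_2 carries the implicit tiler
+ * chain / write value dependency computed above. */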
+ struct mali_job_descriptor_header job = {
+ .job_descriptor_size = 1,
+ .job_type = type,
+ .job_barrier = barrier,
+ .job_index = index,
+ .job_dependency_index_1 = local_dep,
+ .job_dependency_index_2 = global_dep,
+ };
-static void
-panfrost_add_dependency(
- struct panfrost_transfer depender,
- struct panfrost_transfer dependent)
-{
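+ /* An injected job is prepended: it points at the current first_job and
+ * becomes the new head of the chain once uploaded below. */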
+ if (inject)
+ job.next_job = batch->first_job;
- struct mali_job_descriptor_header *first =
- job_descriptor_header(dependent);
+ struct panfrost_transfer transfer = panfrost_allocate_transient(batch, sizeof(job) + payload_size);
+ memcpy(transfer.cpu, &job, sizeof(job));
+ memcpy(transfer.cpu + sizeof(job), payload, payload_size);
- struct mali_job_descriptor_header *second =
- job_descriptor_header(depender);
+ if (inject) {
+ batch->first_job = transfer.gpu;
+ return index;
+ }
- /* Look for an open slot */
+ /* Form a chain */
+ if (type == JOB_TYPE_TILER)
+ batch->tiler_dep = index;
- if (!second->job_dependency_index_1)
- second->job_dependency_index_1 = first->job_index;
- else if (!second->job_dependency_index_2)
- second->job_dependency_index_2 = first->job_index;
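+ /* Jobs are emitted in dependency order, so the chain can be linked eagerly:
+ * just point the previous job's next_job at this one. */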
+ if (batch->prev_job)
+ batch->prev_job->next_job = transfer.gpu;
else
- unreachable("No available slot for new dependency");
-}
-
-/* Queues a job WITHOUT updating pointers. Be careful. */
-
-static void
-panfrost_scoreboard_queue_job_internal(
- struct panfrost_batch *batch,
- struct panfrost_transfer job)
-{
- panfrost_assign_index(batch, job);
-
- /* Queue a pointer to the job */
- util_dynarray_append(&batch->headers, void*, job.cpu);
- util_dynarray_append(&batch->gpu_headers, mali_ptr, job.gpu);
-}
-
-
-/* Queues a compute job, with no special dependencies. This is a bit of a
- * misnomer -- internally, all job types are queued with this function, but
- * outside of this file, it's for pure compute jobs */
-
-void
-panfrost_scoreboard_queue_compute_job(
- struct panfrost_batch *batch,
- struct panfrost_transfer job)
-{
- panfrost_scoreboard_queue_job_internal(batch, job);
-
- /* Update the linked list metadata as appropriate */
- batch->last_job = job;
-
- if (!batch->first_job.gpu)
- batch->first_job = job;
-}
-
-/* Queues a vertex job. There are no special dependencies yet, but if
- * tiling is required (anytime 'rasterize discard' is disabled), we have
- * some extra bookkeeping for later */
-
-void
-panfrost_scoreboard_queue_vertex_job(
- struct panfrost_batch *batch,
- struct panfrost_transfer vertex,
- bool requires_tiling)
-{
- panfrost_scoreboard_queue_compute_job(batch, vertex);
+ batch->first_job = transfer.gpu;
- if (requires_tiling && !batch->first_vertex_for_tiler.gpu)
- batch->first_vertex_for_tiler = vertex;
+ batch->prev_job = (struct mali_job_descriptor_header *) transfer.cpu;
+ return index;
}
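+
+/* A minimal usage sketch (vertex_payload/tiler_payload are placeholders, not
+ * part of this patch): a vertex/tiler pair is now expressed by threading the
+ * returned index through local_dep rather than a dedicated fused-queue helper:
+ *
+ *    unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
+ *                                       &vertex_payload, sizeof(vertex_payload),
+ *                                       false);
+ *
+ *    panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex,
+ *                     &tiler_payload, sizeof(tiler_payload), false);
+ */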
-/* Queues a tiler job, respecting the dependency of each tiler job on the
- * previous */
+/* Generates a write value job, used to initialize the tiler structures. Note
+ * this is called right before frame submission. */
void
-panfrost_scoreboard_queue_tiler_job(
- struct panfrost_batch *batch,
- struct panfrost_transfer tiler)
-{
- panfrost_scoreboard_queue_compute_job(batch, tiler);
-
- if (!batch->first_tiler.gpu)
- batch->first_tiler = tiler;
-
- if (batch->last_tiler.gpu)
- panfrost_add_dependency(tiler, batch->last_tiler);
-
- batch->last_tiler = tiler;
-}
-
-/* Queues a fused (vertex/tiler) job, or a pair of vertex/tiler jobs if
- * fused jobs are not supported (the default until Bifrost rolls out) */
-
-void
-panfrost_scoreboard_queue_fused_job(
- struct panfrost_batch *batch,
- struct panfrost_transfer vertex,
- struct panfrost_transfer tiler)
-{
- panfrost_scoreboard_queue_vertex_job(batch, vertex, true);
- panfrost_scoreboard_queue_tiler_job(batch, tiler);
- panfrost_add_dependency(tiler, vertex);
-}
-
-/* Queues a fused (vertex/tiler) job prepended *before* the usual set, used for
- * wallpaper blits */
-
-void
-panfrost_scoreboard_queue_fused_job_prepend(
- struct panfrost_batch *batch,
- struct panfrost_transfer vertex,
- struct panfrost_transfer tiler)
+panfrost_scoreboard_initialize_tiler(struct panfrost_batch *batch)
{
- /* Sanity check */
- assert(batch->last_tiler.gpu);
- assert(batch->first_tiler.gpu);
-
- /* First, we add the vertex job directly to the queue, forcing it to
- * the front */
-
- panfrost_scoreboard_queue_job_internal(batch, vertex);
- batch->first_job = vertex;
- batch->first_vertex_for_tiler = vertex;
-
- /* Similarly, we add the tiler job directly to the queue, forcing it to
- * the front (second place), manually setting the tiler on vertex
- * dependency (since this is pseudofused) and forcing a dependency of
- * the now-second tiler on us (since all tiler jobs are linked in order
- * and we're injecting ourselves at the front) */
+ /* Check if we even need tiling */
+ if (!batch->tiler_dep)
+ return;
- panfrost_scoreboard_queue_job_internal(batch, tiler);
- panfrost_add_dependency(tiler, vertex);
- panfrost_add_dependency(batch->first_tiler, tiler);
- batch->first_tiler = tiler;
-}
+ /* Okay, we do. Let's generate it. We'll need the job's polygon list
+ * regardless of size. */
-/* Generates a write value job, used to initialize the tiler structures. */
+ mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch,
+ MALI_TILER_MINIMUM_HEADER_SIZE);
-static struct panfrost_transfer
-panfrost_write_value_job(struct panfrost_batch *batch, mali_ptr polygon_list)
-{
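+ /* The index for this job was reserved when the first tiler job was emitted,
+ * so the tiler chain's dependency on it already resolves. */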
struct mali_job_descriptor_header job = {
.job_type = JOB_TYPE_WRITE_VALUE,
+ .job_index = batch->write_value_index,
.job_descriptor_size = 1,
+ .next_job = batch->first_job
};
struct mali_payload_write_value payload = {
memcpy(transfer.cpu, &job, sizeof(job));
memcpy(transfer.cpu + sizeof(job), &payload, sizeof(payload));
- return transfer;
-}
-
-/* If there are any tiler jobs, we need to initialize the tiler by writing
- * zeroes to a magic tiler structure. We do so via a WRITE_VALUE job linked to
- * the first vertex job feeding into tiling. */
-
-static void
-panfrost_scoreboard_initialize_tiler(struct panfrost_batch *batch)
-{
- /* Check if we even need tiling */
- if (!batch->last_tiler.gpu)
- return;
-
- /* Okay, we do. Let's generate it. We'll need the job's polygon list
- * regardless of size. */
-
- mali_ptr polygon_list = panfrost_batch_get_polygon_list(batch,
- MALI_TILER_MINIMUM_HEADER_SIZE);
-
- struct panfrost_transfer job =
- panfrost_write_value_job(batch, polygon_list);
-
- /* Queue it */
- panfrost_scoreboard_queue_compute_job(batch, job);
-
- /* Tiler jobs need us */
- panfrost_add_dependency(batch->first_tiler, job);
-}
-
-/* Once all jobs have been added to a batch and we're ready to submit, we need
- * to order them to set each of the next_job fields, obeying the golden rule:
- * "A job's dependencies must appear earlier in the job chain than itself".
- * Fortunately, computing this job chain is a well-studied graph theory problem
- * known as "topological sorting", which has linear time algorithms. We let
- * each job represent a node, each dependency a directed edge, and the entire
- * set of jobs to be a dependency graph. This graph is inherently acyclic, as
- * otherwise there are unresolveable dependencies.
- *
- * We implement Kahn's algorithm here to compute the next_job chain:
- * https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
- *
- * A few implementation notes: we represent S explicitly with a bitset, L
- * implicitly in the next_job fields. The indices of the bitset are off-by-one:
- * nodes are numbered [0, node_count - 1], whereas in reality job_index in the
- * hardware and dependencies are [1, node_count].
- *
- * We represent edge removal implicitly with another pair of bitsets, rather
- * than explicitly removing the edges, since we need to keep the dependencies
- * there for the hardware.
- */
-
-#define DESCRIPTOR_FOR_NODE(count) \
- *(util_dynarray_element(&batch->headers, \
- struct mali_job_descriptor_header*, count))
-
-#define GPU_ADDRESS_FOR_NODE(count) \
- *(util_dynarray_element(&batch->gpu_headers, \
- mali_ptr, count))
-
-void
-panfrost_scoreboard_link_batch(struct panfrost_batch *batch)
-{
- /* Finalize the batch */
- panfrost_scoreboard_initialize_tiler(batch);
-
- /* Let no_incoming represent the set S described. */
-
- unsigned node_count = batch->job_index;
-
- size_t sz = BITSET_WORDS(node_count) * sizeof(BITSET_WORD);
- BITSET_WORD *no_incoming = calloc(sz, 1);
-
- /* Sets for edges being removed in dep 1 or 2 respectively */
-
- BITSET_WORD *edge_removal_1 = calloc(sz, 1);
- BITSET_WORD *edge_removal_2 = calloc(sz, 1);
-
- /* We compute no_incoming by traversing the batch. Simultaneously, we
- * would like to keep track of a parity-reversed version of the
- * dependency graph. Dependency indices are 16-bit and in practice (for
- * ES3.0, at least), we can guarantee a given node will be depended on
- * by no more than one other nodes. P.f:
- *
- * Proposition: Given a node N of type T, no more than one other node
- * depends on N.
- *
- * If type is WRITE_VALUE: The only dependency added against us is from
- * the first tiler job, so there is 1 dependent.
- *
- * If type is VERTEX: If there is a tiler node, that tiler node depends
- * on us; if there is not (transform feedback), nothing depends on us.
- * Therefore there is at most 1 dependent.
- *
- * If type is TILER: If there is another TILER job in succession, that
- * node depends on us. No other job type depends on us. Therefore there
- * is at most 1 dependent.
- *
- * If type is FRAGMENT: This type cannot be in a primary chain, so it
- * is irrelevant. Just for kicks, nobody would depend on us, so there
- * are zero dependents, so it holds anyway.
- *
- * TODO: Revise this logic for ES3.1 and above. This result may not
- * hold for COMPUTE/FUSED/GEOMETRY jobs; we might need to special case
- * those. Can FBO dependencies be expressed within a chain?
- * ---
- *
- * Point is, we only need to hold a single dependent, which is a pretty
- * helpful result.
- */
-
- unsigned *dependents = calloc(node_count, sizeof(unsigned));
-
- for (unsigned i = 0; i < node_count; ++i) {
- struct mali_job_descriptor_header *node = DESCRIPTOR_FOR_NODE(i);
-
- unsigned dep_1 = node->job_dependency_index_1;
- unsigned dep_2 = node->job_dependency_index_2;
-
- /* Record no_incoming info for this node */
-
- if (!(dep_1 || dep_2))
- BITSET_SET(no_incoming, i);
-
- /* Record this node as the dependent of each of its
- * dependencies */
-
- if (dep_1) {
- assert(!dependents[dep_1 - 1]);
- dependents[dep_1 - 1] = i + 1;
- }
-
- if (dep_2) {
- assert(!dependents[dep_2 - 1]);
- dependents[dep_2 - 1] = i + 1;
- }
- }
-
- /* No next_job fields are set at the beginning, so L is implciitly the
- * empty set. As next_job fields are filled, L is implicitly set. Tail
- * is the tail of L, however. */
-
- struct mali_job_descriptor_header *tail = NULL;
-
- /* We iterate, popping off elements of S. A simple foreach won't do,
- * since we mutate S as we go (even adding elements) */
-
- unsigned arr_size = BITSET_WORDS(node_count);
-
- for (unsigned node_n_1 = __bitset_ffs(no_incoming, arr_size);
- (node_n_1 != 0);
- node_n_1 = __bitset_ffs(no_incoming, arr_size)) {
-
- unsigned node_n = node_n_1 - 1;
-
- /* We've got a node n, pop it off */
- BITSET_CLEAR(no_incoming, node_n);
-
- /* Add it to the list */
- struct mali_job_descriptor_header *n =
- DESCRIPTOR_FOR_NODE(node_n);
-
- mali_ptr addr = GPU_ADDRESS_FOR_NODE(node_n);
-
- if (tail) {
- /* Link us to the last node */
- tail->next_job = addr;
- } else {
- /* We are the first/last node */
- batch->first_job.cpu = (uint8_t *) n;
- batch->first_job.gpu = addr;
- }
-
- tail = n;
-
- /* Grab the dependent, if there is one */
- unsigned node_m_1 = dependents[node_n];
-
- if (node_m_1) {
- unsigned node_m = node_m_1 - 1;
-
- struct mali_job_descriptor_header *m =
- DESCRIPTOR_FOR_NODE(node_m);
-
- /* Get the deps, accounting for removal */
- unsigned dep_1 = m->job_dependency_index_1;
- unsigned dep_2 = m->job_dependency_index_2;
-
- if (BITSET_TEST(edge_removal_1, node_m))
- dep_1 = 0;
-
- if (BITSET_TEST(edge_removal_2, node_m))
- dep_2 = 0;
-
- /* Pretend to remove edges */
- if (dep_1 == node_n_1) {
- BITSET_SET(edge_removal_1, node_m);
- dep_1 = 0;
- } else if (dep_2 == node_n_1) {
- BITSET_SET(edge_removal_2, node_m);
- dep_2 = 0;
- } else {
- /* This node has no relevant dependencies */
- assert(0);
- }
-
- /* Are there edges left? If not, add us to S */
- bool has_edges = dep_1 || dep_2;
-
- if (!has_edges)
- BITSET_SET(no_incoming, node_m);
- }
- }
-
- /* Cleanup */
- free(no_incoming);
- free(dependents);
- free(edge_removal_1);
- free(edge_removal_2);
-
+ batch->first_job = transfer.gpu;
}