Revert "i965/batch: avoid reverting batch buffer if saved state is an empty"
[mesa.git] / src / mesa / drivers / dri / i965 / intel_batchbuffer.c
1 /*
2 * Copyright 2006 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
15 * of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 #include "intel_batchbuffer.h"
27 #include "intel_buffer_objects.h"
28 #include "brw_bufmgr.h"
29 #include "intel_buffers.h"
30 #include "intel_fbo.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_state.h"
34 #include "common/gen_decoder.h"
35 #include "common/gen_gem.h"
36
37 #include "util/hash_table.h"
38
39 #include <xf86drm.h>
40 #include <i915_drm.h>
41
42 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
43
44 /**
45 * Target sizes of the batch and state buffers. We create the initial
46 * buffers at these sizes, and flush when they're nearly full. If we
47 * underestimate how close we are to the end, and suddenly need more space
48 * in the middle of a draw, we can grow the buffers, and finish the draw.
49 * At that point, we'll be over our target size, so the next operation
50 * should flush. Each time we flush the batch, we recreate both buffers
51 * at the original target size, so they don't grow without bound.
52 */
53 #define BATCH_SZ (20 * 1024)
54 #define STATE_SZ (16 * 1024)
55
56 static void
57 intel_batchbuffer_reset(struct brw_context *brw);
58 static void
59 brw_new_batch(struct brw_context *brw);
60
61 static void
62 dump_validation_list(struct intel_batchbuffer *batch)
63 {
64 fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
65
66 for (int i = 0; i < batch->exec_count; i++) {
67 uint64_t flags = batch->validation_list[i].flags;
68 assert(batch->validation_list[i].handle ==
69 batch->exec_bos[i]->gem_handle);
70 fprintf(stderr, "[%2d]: %2d %-14s %p %s%-7s @ 0x%016llx%s (%"PRIu64"B)\n",
71 i,
72 batch->validation_list[i].handle,
73 batch->exec_bos[i]->name,
74 batch->exec_bos[i],
75 (flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) ? "(48b" : "(32b",
76 (flags & EXEC_OBJECT_WRITE) ? " write)" : ")",
77 batch->validation_list[i].offset,
78 (flags & EXEC_OBJECT_PINNED) ? " (pinned)" : "",
79 batch->exec_bos[i]->size);
80 }
81 }
82
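/**
 * Batch decoder callback: find the BO containing the given virtual
 * address by scanning the validation list, and return a CPU mapping of
 * it (or an empty result if no BO matches).
 */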
83 static struct gen_batch_decode_bo
84 decode_get_bo(void *v_brw, uint64_t address)
85 {
86 struct brw_context *brw = v_brw;
87 struct intel_batchbuffer *batch = &brw->batch;
88
89 for (int i = 0; i < batch->exec_count; i++) {
90 struct brw_bo *bo = batch->exec_bos[i];
91 /* The decoder zeroes out the top 16 bits, so we need to as well */
92 uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);
93
94 if (address >= bo_address && address < bo_address + bo->size) {
95 return (struct gen_batch_decode_bo) {
96 .addr = address,
97 .size = bo->size,
98 .map = brw_bo_map(brw, bo, MAP_READ) + (address - bo_address),
99 };
100 }
101 }
102
103 return (struct gen_batch_decode_bo) { };
104 }
105
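/**
 * Batch decoder callback: return the size we recorded for the indirect
 * state at the given offset from Dynamic State Base Address, or 0 if
 * none was recorded.
 */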
106 static unsigned
107 decode_get_state_size(void *v_brw, uint32_t offset_from_dsba)
108 {
109 struct brw_context *brw = v_brw;
110 struct intel_batchbuffer *batch = &brw->batch;
111 struct hash_entry *entry =
112 _mesa_hash_table_search(batch->state_batch_sizes,
113 (void *) (uintptr_t) offset_from_dsba);
114 return entry ? (uintptr_t) entry->data : 0;
115 }
116
117 static bool
118 uint_key_compare(const void *a, const void *b)
119 {
120 return a == b;
121 }
122
123 static uint32_t
124 uint_key_hash(const void *key)
125 {
126 return (uintptr_t) key;
127 }
128
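/**
 * Allocate an empty relocation list with room for 'count' entries.
 */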
129 static void
130 init_reloc_list(struct brw_reloc_list *rlist, int count)
131 {
132 rlist->reloc_count = 0;
133 rlist->reloc_array_size = count;
134 rlist->relocs = malloc(rlist->reloc_array_size *
135 sizeof(struct drm_i915_gem_relocation_entry));
136 }
137
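/**
 * One-time batchbuffer setup: relocation and validation lists,
 * shadow-copy mode for non-LLC platforms, the optional batch decoder,
 * and the initial batch/state buffers.
 */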
138 void
139 intel_batchbuffer_init(struct brw_context *brw)
140 {
141 struct intel_screen *screen = brw->screen;
142 struct intel_batchbuffer *batch = &brw->batch;
143 const struct gen_device_info *devinfo = &screen->devinfo;
144
145 batch->use_shadow_copy = !devinfo->has_llc;
146
147 init_reloc_list(&batch->batch_relocs, 250);
148 init_reloc_list(&batch->state_relocs, 250);
149
150 batch->batch.map = NULL;
151 batch->state.map = NULL;
152 batch->exec_count = 0;
153 batch->exec_array_size = 100;
154 batch->exec_bos =
155 malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
156 batch->validation_list =
157 malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
158
159 if (INTEL_DEBUG & DEBUG_BATCH) {
160 batch->state_batch_sizes =
161 _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
162
163 const unsigned decode_flags =
164 GEN_BATCH_DECODE_FULL |
165 ((INTEL_DEBUG & DEBUG_COLOR) ? GEN_BATCH_DECODE_IN_COLOR : 0) |
166 GEN_BATCH_DECODE_OFFSETS |
167 GEN_BATCH_DECODE_FLOATS;
168
169 gen_batch_decode_ctx_init(&batch->decoder, devinfo, stderr,
170 decode_flags, NULL, decode_get_bo,
171 decode_get_state_size, brw);
172 batch->decoder.max_vbo_decoded_lines = 100;
173 }
174
175 batch->use_batch_first =
176 screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;
177
178 /* PIPE_CONTROL needs a w/a but only on gen6 */
179 batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
180 if (devinfo->gen == 6)
181 batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
182
183 intel_batchbuffer_reset(brw);
184 }
185
186 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
187
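/**
 * Add a BO to the execbuf validation list (if it isn't already there),
 * taking a reference and growing the arrays as needed. Returns the BO's
 * index in the list.
 */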
188 static unsigned
189 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
190 {
191 unsigned index = READ_ONCE(bo->index);
192
193 if (index < batch->exec_count && batch->exec_bos[index] == bo)
194 return index;
195
196 /* May have been shared between multiple active batches */
197 for (index = 0; index < batch->exec_count; index++) {
198 if (batch->exec_bos[index] == bo)
199 return index;
200 }
201
202 brw_bo_reference(bo);
203
204 if (batch->exec_count == batch->exec_array_size) {
205 batch->exec_array_size *= 2;
206 batch->exec_bos =
207 realloc(batch->exec_bos,
208 batch->exec_array_size * sizeof(batch->exec_bos[0]));
209 batch->validation_list =
210 realloc(batch->validation_list,
211 batch->exec_array_size * sizeof(batch->validation_list[0]));
212 }
213
214 batch->validation_list[batch->exec_count] =
215 (struct drm_i915_gem_exec_object2) {
216 .handle = bo->gem_handle,
217 .offset = bo->gtt_offset,
218 .flags = bo->kflags,
219 };
220
221 bo->index = batch->exec_count;
222 batch->exec_bos[batch->exec_count] = bo;
223 batch->aperture_space += bo->size;
224
225 return batch->exec_count++;
226 }
227
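/**
 * Allocate a fresh batch or state buffer at its target size, reset the
 * grow-tracking fields, and map it (or resize the shadow copy).
 */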
228 static void
229 recreate_growing_buffer(struct brw_context *brw,
230 struct brw_growing_bo *grow,
231 const char *name, unsigned size,
232 enum brw_memory_zone memzone)
233 {
234 struct intel_screen *screen = brw->screen;
235 struct intel_batchbuffer *batch = &brw->batch;
236 struct brw_bufmgr *bufmgr = screen->bufmgr;
237
238 /* We can't grow buffers when using softpin, so just overallocate them. */
239 if (brw_using_softpin(bufmgr))
240 size *= 2;
241
242 grow->bo = brw_bo_alloc(bufmgr, name, size, memzone);
243 grow->bo->kflags |= can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
244 grow->partial_bo = NULL;
245 grow->partial_bo_map = NULL;
246 grow->partial_bytes = 0;
247 grow->memzone = memzone;
248
249 if (batch->use_shadow_copy)
250 grow->map = realloc(grow->map, grow->bo->size);
251 else
252 grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
253 }
254
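/**
 * Start a fresh, empty batch: recreate the batch and state buffers at
 * their target sizes and reset the associated bookkeeping.
 */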
255 static void
256 intel_batchbuffer_reset(struct brw_context *brw)
257 {
258 struct intel_batchbuffer *batch = &brw->batch;
259
260 if (batch->last_bo != NULL) {
261 brw_bo_unreference(batch->last_bo);
262 batch->last_bo = NULL;
263 }
264 batch->last_bo = batch->batch.bo;
265
266 recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ,
267 BRW_MEMZONE_OTHER);
268 batch->map_next = batch->batch.map;
269
270 recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ,
271 BRW_MEMZONE_DYNAMIC);
272
273 /* Avoid making 0 a valid state offset - otherwise the decoder will try
274 * and decode data when we use offset 0 as a null pointer.
275 */
276 batch->state_used = 1;
277
278 add_exec_bo(batch, batch->batch.bo);
279 assert(batch->batch.bo->index == 0);
280
281 batch->needs_sol_reset = false;
282 batch->state_base_address_emitted = false;
283
284 if (batch->state_batch_sizes)
285 _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
286 }
287
288 static void
289 intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
290 {
291 intel_batchbuffer_reset(brw);
292 brw_cache_sets_clear(brw);
293 }
294
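/**
 * Record the current batch state (write pointer, relocation counts, and
 * exec list length) so a failed operation can roll back to this point.
 */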
295 void
296 intel_batchbuffer_save_state(struct brw_context *brw)
297 {
298 brw->batch.saved.map_next = brw->batch.map_next;
299 brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
300 brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
301 brw->batch.saved.exec_count = brw->batch.exec_count;
302 }
303
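/**
 * Roll the batch back to the last saved state, dropping any BOs and
 * relocations added since then. If that leaves the batch empty, start a
 * new one.
 */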
304 void
305 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
306 {
307 for (int i = brw->batch.saved.exec_count;
308 i < brw->batch.exec_count; i++) {
309 brw_bo_unreference(brw->batch.exec_bos[i]);
310 }
311 brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
312 brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
313 brw->batch.exec_count = brw->batch.saved.exec_count;
314
315 brw->batch.map_next = brw->batch.saved.map_next;
316 if (USED_BATCH(brw->batch) == 0)
317 brw_new_batch(brw);
318 }
319
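/**
 * Free all batchbuffer resources: shadow maps, relocation and exec
 * lists, BO references, and the decoder state.
 */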
320 void
321 intel_batchbuffer_free(struct intel_batchbuffer *batch)
322 {
323 if (batch->use_shadow_copy) {
324 free(batch->batch.map);
325 free(batch->state.map);
326 }
327
328 for (int i = 0; i < batch->exec_count; i++) {
329 brw_bo_unreference(batch->exec_bos[i]);
330 }
331 free(batch->batch_relocs.relocs);
332 free(batch->state_relocs.relocs);
333 free(batch->exec_bos);
334 free(batch->validation_list);
335
336 brw_bo_unreference(batch->last_bo);
337 brw_bo_unreference(batch->batch.bo);
338 brw_bo_unreference(batch->state.bo);
339 if (batch->state_batch_sizes) {
340 _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
341 gen_batch_decode_ctx_finish(&batch->decoder);
342 }
343 }
344
345 /**
346 * Finish copying the old batch/state buffer's contents to the new one
347 * after we tried to "grow" the buffer in an earlier operation.
348 */
349 static void
350 finish_growing_bos(struct brw_growing_bo *grow)
351 {
352 struct brw_bo *old_bo = grow->partial_bo;
353 if (!old_bo)
354 return;
355
356 memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
357
358 grow->partial_bo = NULL;
359 grow->partial_bo_map = NULL;
360 grow->partial_bytes = 0;
361
362 brw_bo_unreference(old_bo);
363 }
364
365 static void
366 replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
367 uint32_t old_handle, uint32_t new_handle)
368 {
369 for (int i = 0; i < rlist->reloc_count; i++) {
370 if (rlist->relocs[i].target_handle == old_handle)
371 rlist->relocs[i].target_handle = new_handle;
372 }
373 }
374
375 /**
376 * Grow either the batch or state buffer to a new larger size.
377 *
378 * We can't actually grow buffers, so we allocate a new one, copy over
379 * the existing contents, and update our lists to refer to the new one.
380 *
381 * Note that this is only temporary - each new batch recreates the buffers
382 * at their original target size (BATCH_SZ or STATE_SZ).
383 */
384 static void
385 grow_buffer(struct brw_context *brw,
386 struct brw_growing_bo *grow,
387 unsigned existing_bytes,
388 unsigned new_size)
389 {
390 struct intel_batchbuffer *batch = &brw->batch;
391 struct brw_bufmgr *bufmgr = brw->bufmgr;
392 struct brw_bo *bo = grow->bo;
393
394 /* We can't grow buffers that are softpinned, as the growing mechanism
395 * involves putting a larger buffer at the same gtt_offset...and we've
396 * only allocated the smaller amount of VMA. Without relocations, this
397 * simply won't work. This should never happen, however.
398 */
399 assert(!(bo->kflags & EXEC_OBJECT_PINNED));
400
401 perf_debug("Growing %s - ran out of space\n", bo->name);
402
403 if (grow->partial_bo) {
404 /* We've already grown once, and now we need to do it again.
405 * Finish our last grow operation so we can start a new one.
406 * This should basically never happen.
407 */
408 perf_debug("Had to grow multiple times");
409 finish_growing_bos(grow);
410 }
411
412 struct brw_bo *new_bo =
413 brw_bo_alloc(bufmgr, bo->name, new_size, grow->memzone);
414
415 /* Copy existing data to the new larger buffer */
416 grow->partial_bo_map = grow->map;
417
418 if (batch->use_shadow_copy) {
419 /* We can't safely use realloc, as it may move the existing buffer,
420 * breaking existing pointers the caller may still be using. Just
421 * malloc a new copy and memcpy it like the normal BO path.
422 *
423 * Use bo->size rather than new_size because the bufmgr may have
424 * rounded up the size, and we want the shadow size to match.
425 */
426 grow->map = malloc(new_bo->size);
427 } else {
428 grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
429 }
430
431 /* Try to put the new BO at the same GTT offset as the old BO (which
432 * we're throwing away, so it doesn't need to be there).
433 *
434 * This guarantees that our relocations continue to work: values we've
435 * already written into the buffer, values we're going to write into the
436 * buffer, and the validation/relocation lists all will match.
437 *
438 * Also preserve kflags for EXEC_OBJECT_CAPTURE.
439 */
440 new_bo->gtt_offset = bo->gtt_offset;
441 new_bo->index = bo->index;
442 new_bo->kflags = bo->kflags;
443
444 /* Batch/state buffers are per-context, and if we've run out of space,
445 * we must have actually used them before, so...they will be in the list.
446 */
447 assert(bo->index < batch->exec_count);
448 assert(batch->exec_bos[bo->index] == bo);
449
450 /* Update the validation list to use the new BO. */
451 batch->validation_list[bo->index].handle = new_bo->gem_handle;
452
453 if (!batch->use_batch_first) {
454 /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
455 * update the relocation list entries to point at the new BO as well.
456 * (With newer kernels, the "handle" is an offset into the validation
457 * list, which remains unchanged, so we can skip this.)
458 */
459 replace_bo_in_reloc_list(&batch->batch_relocs,
460 bo->gem_handle, new_bo->gem_handle);
461 replace_bo_in_reloc_list(&batch->state_relocs,
462 bo->gem_handle, new_bo->gem_handle);
463 }
464
465 /* Exchange the two BOs...without breaking pointers to the old BO.
466 *
467 * Consider this scenario:
468 *
469 * 1. Somebody calls brw_state_batch() to get a region of memory, and
470 * then creates a brw_address pointing to brw->batch.state.bo.
471 * 2. They then call brw_state_batch() a second time, which happens to
472 * grow and replace the state buffer. They then try to emit a
473 * relocation to their first section of memory.
474 *
475 * If we replace the brw->batch.state.bo pointer at step 2, we would
476 * break the address created in step 1. They'd have a pointer to the
477 * old destroyed BO. Emitting a relocation would add this dead BO to
478 * the validation list...causing /both/ statebuffers to be in the list,
479 * and all kinds of disasters.
480 *
481 * This is not a contrived case - BLORP vertex data upload hits this.
482 *
483 * There are worse scenarios too. Fences for GL sync objects reference
484 * brw->batch.batch.bo. If we replaced the batch pointer when growing,
485 * we'd need to chase down every fence and update it to point to the
486 * new BO. Otherwise, it would refer to a "batch" that never actually
487 * gets submitted, and would fail to trigger.
488 *
489 * To work around both of these issues, we transmutate the buffers in
490 * place, making the existing struct brw_bo represent the new buffer,
491 * and "new_bo" represent the old BO. This is highly unusual, but it
492 * seems like a necessary evil.
493 *
494 * We also defer the memcpy of the existing batch's contents. Callers
495 * may make multiple brw_state_batch calls, and retain pointers to the
496 * old BO's map. We'll perform the memcpy in finish_growing_bos() when
497 * we finally submit the batch, at which point we've finished uploading
498 * state, and nobody should have any old references anymore.
499 *
500 * To do that, we keep a reference to the old BO in grow->partial_bo,
501 * and store the number of bytes to copy in grow->partial_bytes. We
502 * can monkey with the refcounts directly without atomics because these
503 * are per-context BOs and they can only be touched by this thread.
504 */
505 assert(new_bo->refcount == 1);
506 new_bo->refcount = bo->refcount;
507 bo->refcount = 1;
508
509 struct brw_bo tmp;
510 memcpy(&tmp, bo, sizeof(struct brw_bo));
511 memcpy(bo, new_bo, sizeof(struct brw_bo));
512 memcpy(new_bo, &tmp, sizeof(struct brw_bo));
513
514 grow->partial_bo = new_bo; /* the one reference of the OLD bo */
515 grow->partial_bytes = existing_bytes;
516 }
517
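/**
 * Ensure there is room for 'sz' more bytes of commands: flush once we
 * pass the target batch size, or grow the batch buffer if we hit the
 * end of the current allocation in the middle of a draw.
 */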
518 void
519 intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz)
520 {
521 struct intel_batchbuffer *batch = &brw->batch;
522
523 const unsigned batch_used = USED_BATCH(*batch) * 4;
524 if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
525 intel_batchbuffer_flush(brw);
526 } else if (batch_used + sz >= batch->batch.bo->size) {
527 const unsigned new_size =
528 MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
529 MAX_BATCH_SIZE);
530 grow_buffer(brw, &batch->batch, batch_used, new_size);
531 batch->map_next = (void *) batch->batch.map + batch_used;
532 assert(batch_used + sz < batch->batch.bo->size);
533 }
534 }
535
536 /**
537 * Called when starting a new batch buffer.
538 */
539 static void
540 brw_new_batch(struct brw_context *brw)
541 {
542 /* Unreference any BOs held by the previous batch, and reset counts. */
543 for (int i = 0; i < brw->batch.exec_count; i++) {
544 brw_bo_unreference(brw->batch.exec_bos[i]);
545 brw->batch.exec_bos[i] = NULL;
546 }
547 brw->batch.batch_relocs.reloc_count = 0;
548 brw->batch.state_relocs.reloc_count = 0;
549 brw->batch.exec_count = 0;
550 brw->batch.aperture_space = 0;
551
552 brw_bo_unreference(brw->batch.state.bo);
553
554 /* Create a new batchbuffer and reset the associated state: */
555 intel_batchbuffer_reset_and_clear_render_cache(brw);
556
557 /* If the kernel supports hardware contexts, then most hardware state is
558 * preserved between batches; we only need to re-emit state that is required
559 * to be in every batch. Otherwise we need to re-emit all the state that
560 * would otherwise be stored in the context (which for all intents and
561 * purposes means everything).
562 */
563 if (brw->hw_ctx == 0) {
564 brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
565 brw_upload_invariant_state(brw);
566 }
567
568 brw->ctx.NewDriverState |= BRW_NEW_BATCH;
569
570 brw->ib.index_size = -1;
571
572 /* We need to periodically reap the shader time results, because rollover
573 * happens every few seconds. We also want to see results every once in a
574 * while, because many programs won't cleanly destroy our context, so the
575 * end-of-run printout may not happen.
576 */
577 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
578 brw_collect_and_report_shader_time(brw);
579 }
580
581 /**
582 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
583 * sending it off.
584 *
585 * This function can emit state (say, to preserve registers that aren't saved
586 * between batches).
587 */
588 static void
589 brw_finish_batch(struct brw_context *brw)
590 {
591 const struct gen_device_info *devinfo = &brw->screen->devinfo;
592
593 brw->batch.no_wrap = true;
594
595 /* Capture the closing pipeline statistics register values necessary to
596 * support query objects (in the non-hardware context world).
597 */
598 brw_emit_query_end(brw);
599
600 /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT
601 * and assume the L3 cache is configured according to the hardware
602 * defaults. On kernel 4.16+, we no longer need to do this.
603 */
604 if (devinfo->gen >= 7 &&
605 !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
606 gen7_restore_default_l3_config(brw);
607
608 if (devinfo->is_haswell) {
609 /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
610 * 3DSTATE_CC_STATE_POINTERS > "Note":
611 *
612 * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
613 * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
614 *
615 * From the example in the docs, it seems to expect a regular pipe control
616 * flush here as well. We may have done it already, but meh.
617 *
618 * See also WaAvoidRCZCounterRollover.
619 */
620 brw_emit_mi_flush(brw);
621 BEGIN_BATCH(2);
622 OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
623 OUT_BATCH(brw->cc.state_offset | 1);
624 ADVANCE_BATCH();
625 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
626 PIPE_CONTROL_CS_STALL);
627 }
628
629 /* Do not restore push constant packets during context restore. */
630 if (devinfo->gen >= 7)
631 gen10_emit_isp_disable(brw);
632
633 /* Emit MI_BATCH_BUFFER_END to finish our batch. Note that execbuf2
634 * requires our batch size to be QWord aligned, so we pad it out if
635 * necessary by emitting an extra MI_NOOP after the end.
636 */
637 intel_batchbuffer_require_space(brw, 8);
638 *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
639 if (USED_BATCH(brw->batch) & 1) {
640 *brw->batch.map_next++ = MI_NOOP;
641 }
642
643 brw->batch.no_wrap = false;
644 }
645
646 static void
647 throttle(struct brw_context *brw)
648 {
649 /* Wait for the swapbuffers before the one we just emitted, so we
650 * don't get too many swaps outstanding for apps that are GPU-heavy
651 * but not CPU-heavy.
652 *
653 * We're using intelDRI2Flush (called from the loader before
654 * swapbuffer) and glFlush (for front buffer rendering) as the
655 * indicator that a frame is done and then throttle when we get
656 * here as we prepare to render the next frame. At this point the
657 * round trips for swap/copy and getting new buffers are done and
658 * we'll spend less time waiting on the GPU.
659 *
660 * Unfortunately, we don't have a handle to the batch containing
661 * the swap, and getting our hands on that doesn't seem worth it,
662 * so we just use the first batch we emitted after the last swap.
663 */
664 if (brw->need_swap_throttle && brw->throttle_batch[0]) {
665 if (brw->throttle_batch[1]) {
666 if (!brw->disable_throttling) {
667 brw_bo_wait_rendering(brw->throttle_batch[1]);
668 }
669 brw_bo_unreference(brw->throttle_batch[1]);
670 }
671 brw->throttle_batch[1] = brw->throttle_batch[0];
672 brw->throttle_batch[0] = NULL;
673 brw->need_swap_throttle = false;
674 /* Throttling here is more precise than the throttle ioctl, so skip it */
675 brw->need_flush_throttle = false;
676 }
677
678 if (brw->need_flush_throttle) {
679 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
680 drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
681 brw->need_flush_throttle = false;
682 }
683 }
684
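/**
 * Thin wrapper around the execbuffer2 ioctl: submits the validation
 * list with optional in/out fences, and writes back any GTT offsets the
 * kernel assigned to our BOs.
 */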
685 static int
686 execbuffer(int fd,
687 struct intel_batchbuffer *batch,
688 uint32_t ctx_id,
689 int used,
690 int in_fence,
691 int *out_fence,
692 int flags)
693 {
694 struct drm_i915_gem_execbuffer2 execbuf = {
695 .buffers_ptr = (uintptr_t) batch->validation_list,
696 .buffer_count = batch->exec_count,
697 .batch_start_offset = 0,
698 .batch_len = used,
699 .flags = flags,
700 .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
701 };
702
703 unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
704
705 if (in_fence != -1) {
706 execbuf.rsvd2 = in_fence;
707 execbuf.flags |= I915_EXEC_FENCE_IN;
708 }
709
710 if (out_fence != NULL) {
711 cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
712 *out_fence = -1;
713 execbuf.flags |= I915_EXEC_FENCE_OUT;
714 }
715
716 int ret = drmIoctl(fd, cmd, &execbuf);
717 if (ret != 0)
718 ret = -errno;
719
720 for (int i = 0; i < batch->exec_count; i++) {
721 struct brw_bo *bo = batch->exec_bos[i];
722
723 bo->idle = false;
724 bo->index = -1;
725
726 /* Update brw_bo::gtt_offset */
727 if (batch->validation_list[i].offset != bo->gtt_offset) {
728 assert(!(bo->kflags & EXEC_OBJECT_PINNED));
729 DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
730 bo->gem_handle, bo->gtt_offset,
731 batch->validation_list[i].offset);
732 bo->gtt_offset = batch->validation_list[i].offset;
733 }
734 }
735
736 if (ret == 0 && out_fence != NULL)
737 *out_fence = execbuf.rsvd2 >> 32;
738
739 return ret;
740 }
741
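/**
 * Finalize the validation list (shadow-copy upload, relocation
 * pointers, batch placement) and submit it to the kernel, then throttle
 * and optionally decode the batch for debugging.
 */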
742 static int
743 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
744 {
745 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
746 struct intel_batchbuffer *batch = &brw->batch;
747 int ret = 0;
748
749 if (batch->use_shadow_copy) {
750 void *bo_map = brw_bo_map(brw, batch->batch.bo, MAP_WRITE);
751 memcpy(bo_map, batch->batch.map, 4 * USED_BATCH(*batch));
752
753 bo_map = brw_bo_map(brw, batch->state.bo, MAP_WRITE);
754 memcpy(bo_map, batch->state.map, batch->state_used);
755 }
756
757 brw_bo_unmap(batch->batch.bo);
758 brw_bo_unmap(batch->state.bo);
759
760 if (!brw->screen->no_hw) {
761 /* The requirements for using I915_EXEC_NO_RELOC are:
762 *
763 * The addresses written in the objects must match the corresponding
764 * reloc.gtt_offset which in turn must match the corresponding
765 * execobject.offset.
766 *
767 * Any render targets written to in the batch must be flagged with
768 * EXEC_OBJECT_WRITE.
769 *
770 * To avoid stalling, execobject.offset should match the current
771 * address of that object within the active context.
772 */
773 int flags = I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
774
775 if (batch->needs_sol_reset)
776 flags |= I915_EXEC_GEN7_SOL_RESET;
777
778 /* Set statebuffer relocations */
779 const unsigned state_index = batch->state.bo->index;
780 if (state_index < batch->exec_count &&
781 batch->exec_bos[state_index] == batch->state.bo) {
782 struct drm_i915_gem_exec_object2 *entry =
783 &batch->validation_list[state_index];
784 assert(entry->handle == batch->state.bo->gem_handle);
785 entry->relocation_count = batch->state_relocs.reloc_count;
786 entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
787 }
788
789 /* Set batchbuffer relocations */
790 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
791 assert(entry->handle == batch->batch.bo->gem_handle);
792 entry->relocation_count = batch->batch_relocs.reloc_count;
793 entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;
794
795 if (batch->use_batch_first) {
796 flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
797 } else {
798 /* Move the batch to the end of the validation list */
799 struct drm_i915_gem_exec_object2 tmp;
800 struct brw_bo *tmp_bo;
801 const unsigned index = batch->exec_count - 1;
802
803 tmp = *entry;
804 *entry = batch->validation_list[index];
805 batch->validation_list[index] = tmp;
806
807 tmp_bo = batch->exec_bos[0];
808 batch->exec_bos[0] = batch->exec_bos[index];
809 batch->exec_bos[index] = tmp_bo;
810 }
811
812 ret = execbuffer(dri_screen->fd, batch, brw->hw_ctx,
813 4 * USED_BATCH(*batch),
814 in_fence_fd, out_fence_fd, flags);
815
816 throttle(brw);
817 }
818
819 if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
820 gen_print_batch(&batch->decoder, batch->batch.map,
821 4 * USED_BATCH(*batch),
822 batch->batch.bo->gtt_offset);
823 }
824
825 if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
826 brw_check_for_reset(brw);
827
828 if (ret != 0) {
829 fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
830 strerror(-ret));
831 exit(1);
832 }
833
834 return ret;
835 }
836
837 /**
838 * The in_fence_fd is ignored if -1. Otherwise this function takes ownership
839 * of the fd.
840 *
841 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
842 * of the returned fd.
843 */
844 int
845 _intel_batchbuffer_flush_fence(struct brw_context *brw,
846 int in_fence_fd, int *out_fence_fd,
847 const char *file, int line)
848 {
849 int ret;
850
851 if (USED_BATCH(brw->batch) == 0)
852 return 0;
853
854 /* Check that we didn't just wrap our batchbuffer at a bad time. */
855 assert(!brw->batch.no_wrap);
856
857 brw_finish_batch(brw);
858 brw_upload_finish(&brw->upload);
859
860 finish_growing_bos(&brw->batch.batch);
861 finish_growing_bos(&brw->batch.state);
862
863 if (brw->throttle_batch[0] == NULL) {
864 brw->throttle_batch[0] = brw->batch.batch.bo;
865 brw_bo_reference(brw->throttle_batch[0]);
866 }
867
868 if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) {
869 int bytes_for_commands = 4 * USED_BATCH(brw->batch);
870 int bytes_for_state = brw->batch.state_used;
871 fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
872 " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
873 " %4d batch relocs, %4d state relocs\n", file, line,
874 bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
875 bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
876 brw->batch.exec_count,
877 (float) (brw->batch.aperture_space / (1024 * 1024)),
878 brw->batch.batch_relocs.reloc_count,
879 brw->batch.state_relocs.reloc_count);
880
881 dump_validation_list(&brw->batch);
882 }
883
884 ret = submit_batch(brw, in_fence_fd, out_fence_fd);
885
886 if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
887 fprintf(stderr, "waiting for idle\n");
888 brw_bo_wait_rendering(brw->batch.batch.bo);
889 }
890
891 /* Start a new batch buffer. */
892 brw_new_batch(brw);
893
894 return ret;
895 }
896
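/**
 * Return true if the given BO is referenced by the current batch's
 * validation list.
 */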
897 bool
898 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
899 {
900 unsigned index = READ_ONCE(bo->index);
901 if (index < batch->exec_count && batch->exec_bos[index] == bo)
902 return true;
903
904 for (int i = 0; i < batch->exec_count; i++) {
905 if (batch->exec_bos[i] == bo)
906 return true;
907 }
908 return false;
909 }
910
911 /* This is the only way buffers get added to the validate list.
912 */
913 static uint64_t
914 emit_reloc(struct intel_batchbuffer *batch,
915 struct brw_reloc_list *rlist, uint32_t offset,
916 struct brw_bo *target, int32_t target_offset,
917 unsigned int reloc_flags)
918 {
919 assert(target != NULL);
920
921 if (target->kflags & EXEC_OBJECT_PINNED) {
922 brw_use_pinned_bo(batch, target, reloc_flags & RELOC_WRITE);
923 return gen_canonical_address(target->gtt_offset + target_offset);
924 }
925
926 unsigned int index = add_exec_bo(batch, target);
927 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
928
929 if (rlist->reloc_count == rlist->reloc_array_size) {
930 rlist->reloc_array_size *= 2;
931 rlist->relocs = realloc(rlist->relocs,
932 rlist->reloc_array_size *
933 sizeof(struct drm_i915_gem_relocation_entry));
934 }
935
936 if (reloc_flags & RELOC_32BIT) {
937 /* Restrict this buffer to the low 32 bits of the address space.
938 *
939 * Altering the validation list flags restricts it for this batch,
940 * but we also alter the BO's kflags to restrict it permanently
941 * (until the BO is destroyed and put back in the cache). Buffers
942 * may stay bound across batches, and we want to keep it constrained.
943 */
944 target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
945 entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
946
947 /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
948 reloc_flags &= ~RELOC_32BIT;
949 }
950
951 if (reloc_flags)
952 entry->flags |= reloc_flags & batch->valid_reloc_flags;
953
954 rlist->relocs[rlist->reloc_count++] =
955 (struct drm_i915_gem_relocation_entry) {
956 .offset = offset,
957 .delta = target_offset,
958 .target_handle = batch->use_batch_first ? index : target->gem_handle,
959 .presumed_offset = entry->offset,
960 };
961
962 /* Using the old buffer offset, write in what the right data would be, in
963 * case the buffer doesn't move and we can short-circuit the relocation
964 * processing in the kernel
965 */
966 return entry->offset + target_offset;
967 }
968
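/**
 * Add a softpinned (EXEC_OBJECT_PINNED) BO to the validation list
 * without emitting a relocation, optionally marking it for write.
 */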
969 void
970 brw_use_pinned_bo(struct intel_batchbuffer *batch, struct brw_bo *bo,
971 unsigned writable_flag)
972 {
973 assert(bo->kflags & EXEC_OBJECT_PINNED);
974 assert((writable_flag & ~EXEC_OBJECT_WRITE) == 0);
975
976 unsigned int index = add_exec_bo(batch, bo);
977 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
978 assert(entry->offset == bo->gtt_offset);
979
980 if (writable_flag)
981 entry->flags |= EXEC_OBJECT_WRITE;
982 }
983
984 uint64_t
985 brw_batch_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
986 struct brw_bo *target, uint32_t target_offset,
987 unsigned int reloc_flags)
988 {
989 assert(batch_offset <= batch->batch.bo->size - sizeof(uint32_t));
990
991 return emit_reloc(batch, &batch->batch_relocs, batch_offset,
992 target, target_offset, reloc_flags);
993 }
994
995 uint64_t
996 brw_state_reloc(struct intel_batchbuffer *batch, uint32_t state_offset,
997 struct brw_bo *target, uint32_t target_offset,
998 unsigned int reloc_flags)
999 {
1000 assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
1001
1002 return emit_reloc(batch, &batch->state_relocs, state_offset,
1003 target, target_offset, reloc_flags);
1004 }
1005
1006 /**
1007 * Reserve some space in the statebuffer, or flush.
1008 *
1009 * This is used to estimate when we're near the end of the batch,
1010 * so we can flush early.
1011 */
1012 void
1013 brw_require_statebuffer_space(struct brw_context *brw, int size)
1014 {
1015 if (brw->batch.state_used + size >= STATE_SZ)
1016 intel_batchbuffer_flush(brw);
1017 }
1018
1019 /**
1020 * Allocates a block of space in the batchbuffer for indirect state.
1021 */
1022 void *
1023 brw_state_batch(struct brw_context *brw,
1024 int size,
1025 int alignment,
1026 uint32_t *out_offset)
1027 {
1028 struct intel_batchbuffer *batch = &brw->batch;
1029
1030 assert(size < batch->state.bo->size);
1031
1032 uint32_t offset = ALIGN(batch->state_used, alignment);
1033
1034 if (offset + size >= STATE_SZ && !batch->no_wrap) {
1035 intel_batchbuffer_flush(brw);
1036 offset = ALIGN(batch->state_used, alignment);
1037 } else if (offset + size >= batch->state.bo->size) {
1038 const unsigned new_size =
1039 MIN2(batch->state.bo->size + batch->state.bo->size / 2,
1040 MAX_STATE_SIZE);
1041 grow_buffer(brw, &batch->state, batch->state_used, new_size);
1042 assert(offset + size < batch->state.bo->size);
1043 }
1044
1045 if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
1046 _mesa_hash_table_insert(batch->state_batch_sizes,
1047 (void *) (uintptr_t) offset,
1048 (void *) (uintptr_t) size);
1049 }
1050
1051 batch->state_used = offset + size;
1052
1053 *out_offset = offset;
1054 return batch->state.map + (offset >> 2);
1055 }
1056
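/**
 * Copy a block of dword-aligned data into the command stream, making
 * room first if needed.
 */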
1057 void
1058 intel_batchbuffer_data(struct brw_context *brw,
1059 const void *data, GLuint bytes)
1060 {
1061 assert((bytes & 3) == 0);
1062 intel_batchbuffer_require_space(brw, bytes);
1063 memcpy(brw->batch.map_next, data, bytes);
1064 brw->batch.map_next += bytes >> 2;
1065 }
1066
1067 static void
1068 load_sized_register_mem(struct brw_context *brw,
1069 uint32_t reg,
1070 struct brw_bo *bo,
1071 uint32_t offset,
1072 int size)
1073 {
1074 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1075 int i;
1076
1077 /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
1078 assert(devinfo->gen >= 7);
1079
1080 if (devinfo->gen >= 8) {
1081 BEGIN_BATCH(4 * size);
1082 for (i = 0; i < size; i++) {
1083 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
1084 OUT_BATCH(reg + i * 4);
1085 OUT_RELOC64(bo, 0, offset + i * 4);
1086 }
1087 ADVANCE_BATCH();
1088 } else {
1089 BEGIN_BATCH(3 * size);
1090 for (i = 0; i < size; i++) {
1091 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
1092 OUT_BATCH(reg + i * 4);
1093 OUT_RELOC(bo, 0, offset + i * 4);
1094 }
1095 ADVANCE_BATCH();
1096 }
1097 }
1098
1099 void
1100 brw_load_register_mem(struct brw_context *brw,
1101 uint32_t reg,
1102 struct brw_bo *bo,
1103 uint32_t offset)
1104 {
1105 load_sized_register_mem(brw, reg, bo, offset, 1);
1106 }
1107
1108 void
1109 brw_load_register_mem64(struct brw_context *brw,
1110 uint32_t reg,
1111 struct brw_bo *bo,
1112 uint32_t offset)
1113 {
1114 load_sized_register_mem(brw, reg, bo, offset, 2);
1115 }
1116
1117 /*
1118 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
1119 */
1120 void
1121 brw_store_register_mem32(struct brw_context *brw,
1122 struct brw_bo *bo, uint32_t reg, uint32_t offset)
1123 {
1124 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1125
1126 assert(devinfo->gen >= 6);
1127
1128 if (devinfo->gen >= 8) {
1129 BEGIN_BATCH(4);
1130 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1131 OUT_BATCH(reg);
1132 OUT_RELOC64(bo, RELOC_WRITE, offset);
1133 ADVANCE_BATCH();
1134 } else {
1135 BEGIN_BATCH(3);
1136 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1137 OUT_BATCH(reg);
1138 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1139 ADVANCE_BATCH();
1140 }
1141 }
1142
1143 /*
1144 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
1145 */
1146 void
1147 brw_store_register_mem64(struct brw_context *brw,
1148 struct brw_bo *bo, uint32_t reg, uint32_t offset)
1149 {
1150 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1151
1152 assert(devinfo->gen >= 6);
1153
1154 /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
1155 * read a full 64-bit register, we need to do two of them.
1156 */
1157 if (devinfo->gen >= 8) {
1158 BEGIN_BATCH(8);
1159 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1160 OUT_BATCH(reg);
1161 OUT_RELOC64(bo, RELOC_WRITE, offset);
1162 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1163 OUT_BATCH(reg + sizeof(uint32_t));
1164 OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
1165 ADVANCE_BATCH();
1166 } else {
1167 BEGIN_BATCH(6);
1168 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1169 OUT_BATCH(reg);
1170 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1171 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1172 OUT_BATCH(reg + sizeof(uint32_t));
1173 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
1174 ADVANCE_BATCH();
1175 }
1176 }
1177
1178 /*
1179 * Write a 32-bit register using immediate data.
1180 */
1181 void
1182 brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
1183 {
1184 assert(brw->screen->devinfo.gen >= 6);
1185
1186 BEGIN_BATCH(3);
1187 OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
1188 OUT_BATCH(reg);
1189 OUT_BATCH(imm);
1190 ADVANCE_BATCH();
1191 }
1192
1193 /*
1194 * Write a 64-bit register using immediate data.
1195 */
1196 void
1197 brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
1198 {
1199 assert(brw->screen->devinfo.gen >= 6);
1200
1201 BEGIN_BATCH(5);
1202 OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
1203 OUT_BATCH(reg);
1204 OUT_BATCH(imm & 0xffffffff);
1205 OUT_BATCH(reg + 4);
1206 OUT_BATCH(imm >> 32);
1207 ADVANCE_BATCH();
1208 }
1209
1210 /*
1211 * Copies a 32-bit register.
1212 */
1213 void
1214 brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
1215 {
1216 assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
1217
1218 BEGIN_BATCH(3);
1219 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1220 OUT_BATCH(src);
1221 OUT_BATCH(dest);
1222 ADVANCE_BATCH();
1223 }
1224
1225 /*
1226 * Copies a 64-bit register.
1227 */
1228 void
1229 brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
1230 {
1231 assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
1232
1233 BEGIN_BATCH(6);
1234 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1235 OUT_BATCH(src);
1236 OUT_BATCH(dest);
1237 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1238 OUT_BATCH(src + sizeof(uint32_t));
1239 OUT_BATCH(dest + sizeof(uint32_t));
1240 ADVANCE_BATCH();
1241 }
1242
1243 /*
1244 * Write 32-bits of immediate data to a GPU memory buffer.
1245 */
1246 void
1247 brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
1248 uint32_t offset, uint32_t imm)
1249 {
1250 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1251
1252 assert(devinfo->gen >= 6);
1253
1254 BEGIN_BATCH(4);
1255 OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
1256 if (devinfo->gen >= 8)
1257 OUT_RELOC64(bo, RELOC_WRITE, offset);
1258 else {
1259 OUT_BATCH(0); /* MBZ */
1260 OUT_RELOC(bo, RELOC_WRITE, offset);
1261 }
1262 OUT_BATCH(imm);
1263 ADVANCE_BATCH();
1264 }
1265
1266 /*
1267 * Write 64-bits of immediate data to a GPU memory buffer.
1268 */
1269 void
1270 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1271 uint32_t offset, uint64_t imm)
1272 {
1273 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1274
1275 assert(devinfo->gen >= 6);
1276
1277 BEGIN_BATCH(5);
1278 OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1279 if (devinfo->gen >= 8)
1280 OUT_RELOC64(bo, RELOC_WRITE, offset);
1281 else {
1282 OUT_BATCH(0); /* MBZ */
1283 OUT_RELOC(bo, RELOC_WRITE, offset);
1284 }
1285 OUT_BATCH(imm & 0xffffffffu);
1286 OUT_BATCH(imm >> 32);
1287 ADVANCE_BATCH();
1288 }