i965: Prepare batchbuffer module for softpin support.
[mesa.git] / src / mesa / drivers / dri / i965 / intel_batchbuffer.c
1 /*
2 * Copyright 2006 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
15 * of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 #include "intel_batchbuffer.h"
27 #include "intel_buffer_objects.h"
28 #include "brw_bufmgr.h"
29 #include "intel_buffers.h"
30 #include "intel_fbo.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_state.h"
34 #include "common/gen_decoder.h"
35
36 #include "util/hash_table.h"
37
38 #include <xf86drm.h>
39 #include <i915_drm.h>
40
41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
42
43 /**
44 * Target sizes of the batch and state buffers. We create the initial
45 * buffers at these sizes, and flush when they're nearly full. If we
46 * underestimate how close we are to the end, and suddenly need more space
47 * in the middle of a draw, we can grow the buffers, and finish the draw.
48 * At that point, we'll be over our target size, so the next operation
49 * should flush. Each time we flush the batch, we recreate both buffers
50 * at the original target size, so it doesn't grow without bound.
51 */
52 #define BATCH_SZ (20 * 1024)
53 #define STATE_SZ (16 * 1024)
54
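/* Illustrative sketch (not part of the driver): how emit paths are expected
 * to reserve space against these targets before writing commands.  The
 * BEGIN_BATCH()/OUT_BATCH() macros in intel_batchbuffer.h are assumed to
 * expand to roughly this pattern:
 *
 *    intel_batchbuffer_require_space(brw, n_dwords * 4);
 *    *brw->batch.map_next++ = dword0;   // command header
 *    *brw->batch.map_next++ = dword1;   // payload
 *
 * intel_batchbuffer_require_space() flushes once the batch crosses BATCH_SZ,
 * and grows the batch BO only when wrapping is disallowed (batch->no_wrap).
 */
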
55 static void
56 intel_batchbuffer_reset(struct brw_context *brw);
57
58 static void
59 dump_validation_list(struct intel_batchbuffer *batch)
60 {
61 fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
62
63 for (int i = 0; i < batch->exec_count; i++) {
64 uint64_t flags = batch->validation_list[i].flags;
65 assert(batch->validation_list[i].handle ==
66 batch->exec_bos[i]->gem_handle);
67 fprintf(stderr, "[%2d]: %2d %-14s %p %s%-7s @ 0x%016llx%s (%"PRIu64"B)\n",
68 i,
69 batch->validation_list[i].handle,
70 batch->exec_bos[i]->name,
71 batch->exec_bos[i],
72 (flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) ? "(48b" : "(32b",
73 (flags & EXEC_OBJECT_WRITE) ? " write)" : ")",
74 batch->validation_list[i].offset,
75 (flags & EXEC_OBJECT_PINNED) ? " (pinned)" : "",
76 batch->exec_bos[i]->size);
77 }
78 }
79
80 static struct gen_batch_decode_bo
81 decode_get_bo(void *v_brw, uint64_t address)
82 {
83 struct brw_context *brw = v_brw;
84 struct intel_batchbuffer *batch = &brw->batch;
85
86 for (int i = 0; i < batch->exec_count; i++) {
87 struct brw_bo *bo = batch->exec_bos[i];
88 /* The decoder zeroes out the top 16 bits, so we need to as well */
89 uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);
90
91 if (address >= bo_address && address < bo_address + bo->size) {
92 return (struct gen_batch_decode_bo) {
93 .addr = address,
94 .size = bo->size,
95 .map = brw_bo_map(brw, bo, MAP_READ) + (address - bo_address),
96 };
97 }
98 }
99
100 return (struct gen_batch_decode_bo) { };
101 }
102
103 static unsigned
104 decode_get_state_size(void *v_brw, uint32_t offset_from_dsba)
105 {
106 struct brw_context *brw = v_brw;
107 struct intel_batchbuffer *batch = &brw->batch;
108 struct hash_entry *entry =
109 _mesa_hash_table_search(batch->state_batch_sizes,
110 (void *) (uintptr_t) offset_from_dsba);
111 return entry ? (uintptr_t) entry->data : 0;
112 }
113
114 static bool
115 uint_key_compare(const void *a, const void *b)
116 {
117 return a == b;
118 }
119
120 static uint32_t
121 uint_key_hash(const void *key)
122 {
123 return (uintptr_t) key;
124 }
125
126 static void
127 init_reloc_list(struct brw_reloc_list *rlist, int count)
128 {
129 rlist->reloc_count = 0;
130 rlist->reloc_array_size = count;
131 rlist->relocs = malloc(rlist->reloc_array_size *
132 sizeof(struct drm_i915_gem_relocation_entry));
133 }
134
135 void
136 intel_batchbuffer_init(struct brw_context *brw)
137 {
138 struct intel_screen *screen = brw->screen;
139 struct intel_batchbuffer *batch = &brw->batch;
140 const struct gen_device_info *devinfo = &screen->devinfo;
141
142 batch->use_shadow_copy = !devinfo->has_llc;
143
144 init_reloc_list(&batch->batch_relocs, 250);
145 init_reloc_list(&batch->state_relocs, 250);
146
147 batch->batch.map = NULL;
148 batch->state.map = NULL;
149 batch->exec_count = 0;
150 batch->exec_array_size = 100;
151 batch->exec_bos =
152 malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
153 batch->validation_list =
154 malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
155
156 if (INTEL_DEBUG & DEBUG_BATCH) {
157 batch->state_batch_sizes =
158 _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
159
160 const unsigned decode_flags =
161 GEN_BATCH_DECODE_FULL |
162 ((INTEL_DEBUG & DEBUG_COLOR) ? GEN_BATCH_DECODE_IN_COLOR : 0) |
163 GEN_BATCH_DECODE_OFFSETS |
164 GEN_BATCH_DECODE_FLOATS;
165
166 gen_batch_decode_ctx_init(&batch->decoder, devinfo, stderr,
167 decode_flags, NULL, decode_get_bo,
168 decode_get_state_size, brw);
169 batch->decoder.max_vbo_decoded_lines = 100;
170 }
171
172 batch->use_batch_first =
173 screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;
174
175 /* PIPE_CONTROL needs a workaround, but only on gen6 */
176 batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
177 if (devinfo->gen == 6)
178 batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
179
180 intel_batchbuffer_reset(brw);
181 }
182
183 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
184
185 static unsigned
186 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
187 {
188 unsigned index = READ_ONCE(bo->index);
189
190 if (index < batch->exec_count && batch->exec_bos[index] == bo)
191 return index;
192
193 /* May have been shared between multiple active batches */
194 for (index = 0; index < batch->exec_count; index++) {
195 if (batch->exec_bos[index] == bo)
196 return index;
197 }
198
199 brw_bo_reference(bo);
200
201 if (batch->exec_count == batch->exec_array_size) {
202 batch->exec_array_size *= 2;
203 batch->exec_bos =
204 realloc(batch->exec_bos,
205 batch->exec_array_size * sizeof(batch->exec_bos[0]));
206 batch->validation_list =
207 realloc(batch->validation_list,
208 batch->exec_array_size * sizeof(batch->validation_list[0]));
209 }
210
211 batch->validation_list[batch->exec_count] =
212 (struct drm_i915_gem_exec_object2) {
213 .handle = bo->gem_handle,
214 .offset = bo->gtt_offset,
215 .flags = bo->kflags,
216 };
217
218 bo->index = batch->exec_count;
219 batch->exec_bos[batch->exec_count] = bo;
220 batch->aperture_space += bo->size;
221
222 return batch->exec_count++;
223 }
224
225 static void
226 recreate_growing_buffer(struct brw_context *brw,
227 struct brw_growing_bo *grow,
228 const char *name, unsigned size,
229 enum brw_memory_zone memzone)
230 {
231 struct intel_screen *screen = brw->screen;
232 struct intel_batchbuffer *batch = &brw->batch;
233 struct brw_bufmgr *bufmgr = screen->bufmgr;
234
235 /* We can't grow buffers when using softpin, so just overallocate them. */
236 if (brw_using_softpin(bufmgr))
237 size *= 2;
238
239 grow->bo = brw_bo_alloc(bufmgr, name, size, memzone);
240 grow->bo->kflags |= can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
241 grow->partial_bo = NULL;
242 grow->partial_bo_map = NULL;
243 grow->partial_bytes = 0;
244 grow->memzone = memzone;
245
246 if (batch->use_shadow_copy)
247 grow->map = realloc(grow->map, grow->bo->size);
248 else
249 grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
250 }
251
252 static void
253 intel_batchbuffer_reset(struct brw_context *brw)
254 {
255 struct intel_batchbuffer *batch = &brw->batch;
256
257 if (batch->last_bo != NULL) {
258 brw_bo_unreference(batch->last_bo);
259 batch->last_bo = NULL;
260 }
261 batch->last_bo = batch->batch.bo;
262
263 recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ,
264 BRW_MEMZONE_OTHER);
265 batch->map_next = batch->batch.map;
266
267 recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ,
268 BRW_MEMZONE_DYNAMIC);
269
270 /* Avoid making 0 a valid state offset - otherwise the decoder will try
271 * to decode data when we use offset 0 as a null pointer.
272 */
273 batch->state_used = 1;
274
275 add_exec_bo(batch, batch->batch.bo);
276 assert(batch->batch.bo->index == 0);
277
278 batch->needs_sol_reset = false;
279 batch->state_base_address_emitted = false;
280
281 if (batch->state_batch_sizes)
282 _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
283 }
284
285 static void
286 intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
287 {
288 intel_batchbuffer_reset(brw);
289 brw_cache_sets_clear(brw);
290 }
291
292 void
293 intel_batchbuffer_save_state(struct brw_context *brw)
294 {
295 brw->batch.saved.map_next = brw->batch.map_next;
296 brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
297 brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
298 brw->batch.saved.exec_count = brw->batch.exec_count;
299 }
300
301 void
302 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
303 {
304 for (int i = brw->batch.saved.exec_count;
305 i < brw->batch.exec_count; i++) {
306 brw_bo_unreference(brw->batch.exec_bos[i]);
307 }
308 brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
309 brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
310 brw->batch.exec_count = brw->batch.saved.exec_count;
311
312 brw->batch.map_next = brw->batch.saved.map_next;
313 }
314
315 void
316 intel_batchbuffer_free(struct intel_batchbuffer *batch)
317 {
318 if (batch->use_shadow_copy) {
319 free(batch->batch.map);
320 free(batch->state.map);
321 }
322
323 for (int i = 0; i < batch->exec_count; i++) {
324 brw_bo_unreference(batch->exec_bos[i]);
325 }
326 free(batch->batch_relocs.relocs);
327 free(batch->state_relocs.relocs);
328 free(batch->exec_bos);
329 free(batch->validation_list);
330
331 brw_bo_unreference(batch->last_bo);
332 brw_bo_unreference(batch->batch.bo);
333 brw_bo_unreference(batch->state.bo);
334 if (batch->state_batch_sizes) {
335 _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
336 gen_batch_decode_ctx_finish(&batch->decoder);
337 }
338 }
339
340 /**
341 * Finish copying the old batch/state buffer's contents to the new one
342 * after we tried to "grow" the buffer in an earlier operation.
343 */
344 static void
345 finish_growing_bos(struct brw_growing_bo *grow)
346 {
347 struct brw_bo *old_bo = grow->partial_bo;
348 if (!old_bo)
349 return;
350
351 memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
352
353 grow->partial_bo = NULL;
354 grow->partial_bo_map = NULL;
355 grow->partial_bytes = 0;
356
357 brw_bo_unreference(old_bo);
358 }
359
360 static void
361 replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
362 uint32_t old_handle, uint32_t new_handle)
363 {
364 for (int i = 0; i < rlist->reloc_count; i++) {
365 if (rlist->relocs[i].target_handle == old_handle)
366 rlist->relocs[i].target_handle = new_handle;
367 }
368 }
369
370 /**
371 * Grow either the batch or state buffer to a new larger size.
372 *
373 * We can't actually grow buffers, so we allocate a new one, copy over
374 * the existing contents, and update our lists to refer to the new one.
375 *
376 * Note that this is only temporary - each new batch recreates the buffers
377 * at their original target size (BATCH_SZ or STATE_SZ).
378 */
379 static void
380 grow_buffer(struct brw_context *brw,
381 struct brw_growing_bo *grow,
382 unsigned existing_bytes,
383 unsigned new_size)
384 {
385 struct intel_batchbuffer *batch = &brw->batch;
386 struct brw_bufmgr *bufmgr = brw->bufmgr;
387 struct brw_bo *bo = grow->bo;
388
389 /* We can't grow buffers that are softpinned, as the growing mechanism
390 * involves putting a larger buffer at the same gtt_offset...and we've
391 * only allocated the smaller amount of VMA. Without relocations, this
392 * simply won't work. This should never happen, however.
393 */
394 assert(!(bo->kflags & EXEC_OBJECT_PINNED));
395
396 perf_debug("Growing %s - ran out of space\n", bo->name);
397
398 if (grow->partial_bo) {
399 /* We've already grown once, and now we need to do it again.
400 * Finish our last grow operation so we can start a new one.
401 * This should basically never happen.
402 */
403 perf_debug("Had to grow multiple times");
404 finish_growing_bos(grow);
405 }
406
407 struct brw_bo *new_bo =
408 brw_bo_alloc(bufmgr, bo->name, new_size, grow->memzone);
409
410 /* Copy existing data to the new larger buffer */
411 grow->partial_bo_map = grow->map;
412
413 if (batch->use_shadow_copy) {
414 /* We can't safely use realloc, as it may move the existing buffer,
415 * breaking existing pointers the caller may still be using. Just
416 * malloc a new copy and memcpy it like the normal BO path.
417 *
418 * Use new_bo->size rather than new_size because the bufmgr may have
419 * rounded up the size, and we want the shadow size to match.
420 */
421 grow->map = malloc(new_bo->size);
422 } else {
423 grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
424 }
425
426 /* Try to put the new BO at the same GTT offset as the old BO (which
427 * we're throwing away, so it doesn't need to be there).
428 *
429 * This guarantees that our relocations continue to work: values we've
430 * already written into the buffer, values we're going to write into the
431 * buffer, and the validation/relocation lists all will match.
432 *
433 * Also preserve kflags for EXEC_OBJECT_CAPTURE.
434 */
435 new_bo->gtt_offset = bo->gtt_offset;
436 new_bo->index = bo->index;
437 new_bo->kflags = bo->kflags;
438
439 /* Batch/state buffers are per-context, and if we've run out of space,
440 * we must have actually used them before, so...they will be in the list.
441 */
442 assert(bo->index < batch->exec_count);
443 assert(batch->exec_bos[bo->index] == bo);
444
445 /* Update the validation list to use the new BO. */
446 batch->validation_list[bo->index].handle = new_bo->gem_handle;
447
448 if (!batch->use_batch_first) {
449 /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
450 * update the relocation list entries to point at the new BO as well.
451 * (With newer kernels, the "handle" is an index into the validation
452 * list, which remains unchanged, so we can skip this.)
453 */
454 replace_bo_in_reloc_list(&batch->batch_relocs,
455 bo->gem_handle, new_bo->gem_handle);
456 replace_bo_in_reloc_list(&batch->state_relocs,
457 bo->gem_handle, new_bo->gem_handle);
458 }
459
460 /* Exchange the two BOs...without breaking pointers to the old BO.
461 *
462 * Consider this scenario:
463 *
464 * 1. Somebody calls brw_state_batch() to get a region of memory, and
465 * then creates a brw_address pointing to brw->batch.state.bo.
466 * 2. They then call brw_state_batch() a second time, which happens to
467 * grow and replace the state buffer. They then try to emit a
468 * relocation to their first section of memory.
469 *
470 * If we replace the brw->batch.state.bo pointer at step 2, we would
471 * break the address created in step 1. They'd have a pointer to the
472 * old destroyed BO. Emitting a relocation would add this dead BO to
473 * the validation list...causing /both/ statebuffers to be in the list,
474 * and all kinds of disasters.
475 *
476 * This is not a contrived case - BLORP vertex data upload hits this.
477 *
478 * There are worse scenarios too. Fences for GL sync objects reference
479 * brw->batch.batch.bo. If we replaced the batch pointer when growing,
480 * we'd need to chase down every fence and update it to point to the
481 * new BO. Otherwise, it would refer to a "batch" that never actually
482 * gets submitted, and would fail to trigger.
483 *
484 * To work around both of these issues, we transmute the buffers in
485 * place, making the existing struct brw_bo represent the new buffer,
486 * and "new_bo" represent the old BO. This is highly unusual, but it
487 * seems like a necessary evil.
488 *
489 * We also defer the memcpy of the existing batch's contents. Callers
490 * may make multiple brw_state_batch calls, and retain pointers to the
491 * old BO's map. We'll perform the memcpy in finish_growing_bos() when
492 * we finally submit the batch, at which point we've finished uploading
493 * state, and nobody should have any old references anymore.
494 *
495 * To do that, we keep a reference to the old BO in grow->partial_bo,
496 * and store the number of bytes to copy in grow->partial_bytes. We
497 * can monkey with the refcounts directly without atomics because these
498 * are per-context BOs and they can only be touched by this thread.
499 */
500 assert(new_bo->refcount == 1);
501 new_bo->refcount = bo->refcount;
502 bo->refcount = 1;
503
504 struct brw_bo tmp;
505 memcpy(&tmp, bo, sizeof(struct brw_bo));
506 memcpy(bo, new_bo, sizeof(struct brw_bo));
507 memcpy(new_bo, &tmp, sizeof(struct brw_bo));
508
509 grow->partial_bo = new_bo; /* the one reference of the OLD bo */
510 grow->partial_bytes = existing_bytes;
511 }
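
/* The struct memcpy dance above is the general "transmute in place" idiom:
 * exchange the *contents* of two objects so that every existing pointer to
 * the old one now observes the new backing storage.  A minimal standalone
 * sketch of the same trick (illustrative only, not used by the driver):
 *
 *    static void
 *    swap_bo_contents(struct brw_bo *a, struct brw_bo *b)
 *    {
 *       struct brw_bo tmp = *a;   // struct copy, not a pointer swap
 *       *a = *b;
 *       *b = tmp;
 *    }
 *
 * Callers holding a pointer to "a" keep a valid pointer; it simply describes
 * the other buffer afterwards.  This is only safe here because these BOs are
 * per-context and never touched concurrently by another thread.
 */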
512
513 void
514 intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz)
515 {
516 struct intel_batchbuffer *batch = &brw->batch;
517
518 const unsigned batch_used = USED_BATCH(*batch) * 4;
519 if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
520 intel_batchbuffer_flush(brw);
521 } else if (batch_used + sz >= batch->batch.bo->size) {
522 const unsigned new_size =
523 MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
524 MAX_BATCH_SIZE);
525 grow_buffer(brw, &batch->batch, batch_used, new_size);
526 batch->map_next = (void *) batch->batch.map + batch_used;
527 assert(batch_used + sz < batch->batch.bo->size);
528 }
529 }
530
531 /**
532 * Called when starting a new batch buffer.
533 */
534 static void
535 brw_new_batch(struct brw_context *brw)
536 {
537 /* Unreference any BOs held by the previous batch, and reset counts. */
538 for (int i = 0; i < brw->batch.exec_count; i++) {
539 brw_bo_unreference(brw->batch.exec_bos[i]);
540 brw->batch.exec_bos[i] = NULL;
541 }
542 brw->batch.batch_relocs.reloc_count = 0;
543 brw->batch.state_relocs.reloc_count = 0;
544 brw->batch.exec_count = 0;
545 brw->batch.aperture_space = 0;
546
547 brw_bo_unreference(brw->batch.state.bo);
548
549 /* Create a new batchbuffer and reset the associated state: */
550 intel_batchbuffer_reset_and_clear_render_cache(brw);
551
552 /* If the kernel supports hardware contexts, then most hardware state is
553 * preserved between batches; we only need to re-emit state that is required
554 * to be in every batch. Otherwise we need to re-emit all the state that
555 * would otherwise be stored in the context (which for all intents and
556 * purposes means everything).
557 */
558 if (brw->hw_ctx == 0) {
559 brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
560 brw_upload_invariant_state(brw);
561 }
562
563 brw->ctx.NewDriverState |= BRW_NEW_BATCH;
564
565 brw->ib.index_size = -1;
566
567 /* We need to periodically reap the shader time results, because rollover
568 * happens every few seconds. We also want to see results every once in a
569 * while, because many programs won't cleanly destroy our context, so the
570 * end-of-run printout may not happen.
571 */
572 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
573 brw_collect_and_report_shader_time(brw);
574 }
575
576 /**
577 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
578 * sending it off.
579 *
580 * This function can emit state (say, to preserve registers that aren't saved
581 * between batches).
582 */
583 static void
584 brw_finish_batch(struct brw_context *brw)
585 {
586 const struct gen_device_info *devinfo = &brw->screen->devinfo;
587
588 brw->batch.no_wrap = true;
589
590 /* Capture the closing pipeline statistics register values necessary to
591 * support query objects (in the non-hardware context world).
592 */
593 brw_emit_query_end(brw);
594
595 /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT and
596 * assume the L3 cache is configured according to the hardware
597 * defaults. On kernel 4.16+, we no longer need to do this.
598 */
599 if (devinfo->gen >= 7 &&
600 !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
601 gen7_restore_default_l3_config(brw);
602
603 if (devinfo->is_haswell) {
604 /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
605 * 3DSTATE_CC_STATE_POINTERS > "Note":
606 *
607 * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
608 * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
609 *
610 * From the example in the docs, it seems to expect a regular pipe control
611 * flush here as well. We may have done it already, but meh.
612 *
613 * See also WaAvoidRCZCounterRollover.
614 */
615 brw_emit_mi_flush(brw);
616 BEGIN_BATCH(2);
617 OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
618 OUT_BATCH(brw->cc.state_offset | 1);
619 ADVANCE_BATCH();
620 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
621 PIPE_CONTROL_CS_STALL);
622 }
623
624 /* Do not restore push constant packets during context restore. */
625 if (devinfo->gen >= 7)
626 gen10_emit_isp_disable(brw);
627
628 /* Emit MI_BATCH_BUFFER_END to finish our batch. Note that execbuf2
629 * requires our batch size to be QWord aligned, so we pad it out if
630 * necessary by emitting an extra MI_NOOP after the end.
631 */
632 intel_batchbuffer_require_space(brw, 8);
633 *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
634 if (USED_BATCH(brw->batch) & 1) {
635 *brw->batch.map_next++ = MI_NOOP;
636 }
637
638 brw->batch.no_wrap = false;
639 }
640
641 static void
642 throttle(struct brw_context *brw)
643 {
644 /* Wait for the swapbuffers before the one we just emitted, so we
645 * don't get too many swaps outstanding for apps that are GPU-heavy
646 * but not CPU-heavy.
647 *
648 * We're using intelDRI2Flush (called from the loader before
649 * swapbuffer) and glFlush (for front buffer rendering) as the
650 * indicator that a frame is done and then throttle when we get
651 * here as we prepare to render the next frame. At this point the
652 * round trips for swap/copy and getting new buffers are done and
653 * we'll spend less time waiting on the GPU.
654 *
655 * Unfortunately, we don't have a handle to the batch containing
656 * the swap, and getting our hands on that doesn't seem worth it,
657 * so we just use the first batch we emitted after the last swap.
658 */
659 if (brw->need_swap_throttle && brw->throttle_batch[0]) {
660 if (brw->throttle_batch[1]) {
661 if (!brw->disable_throttling) {
662 brw_bo_wait_rendering(brw->throttle_batch[1]);
663 }
664 brw_bo_unreference(brw->throttle_batch[1]);
665 }
666 brw->throttle_batch[1] = brw->throttle_batch[0];
667 brw->throttle_batch[0] = NULL;
668 brw->need_swap_throttle = false;
669 /* Throttling here is more precise than the throttle ioctl, so skip it */
670 brw->need_flush_throttle = false;
671 }
672
673 if (brw->need_flush_throttle) {
674 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
675 drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
676 brw->need_flush_throttle = false;
677 }
678 }
679
680 static int
681 execbuffer(int fd,
682 struct intel_batchbuffer *batch,
683 uint32_t ctx_id,
684 int used,
685 int in_fence,
686 int *out_fence,
687 int flags)
688 {
689 struct drm_i915_gem_execbuffer2 execbuf = {
690 .buffers_ptr = (uintptr_t) batch->validation_list,
691 .buffer_count = batch->exec_count,
692 .batch_start_offset = 0,
693 .batch_len = used,
694 .flags = flags,
695 .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
696 };
697
698 unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
699
700 if (in_fence != -1) {
701 execbuf.rsvd2 = in_fence;
702 execbuf.flags |= I915_EXEC_FENCE_IN;
703 }
704
705 if (out_fence != NULL) {
706 cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
707 *out_fence = -1;
708 execbuf.flags |= I915_EXEC_FENCE_OUT;
709 }
710
711 int ret = drmIoctl(fd, cmd, &execbuf);
712 if (ret != 0)
713 ret = -errno;
714
715 for (int i = 0; i < batch->exec_count; i++) {
716 struct brw_bo *bo = batch->exec_bos[i];
717
718 bo->idle = false;
719 bo->index = -1;
720
721 /* Update brw_bo::gtt_offset */
722 if (batch->validation_list[i].offset != bo->gtt_offset) {
723 assert(!(bo->kflags & EXEC_OBJECT_PINNED));
724 DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
725 bo->gem_handle, bo->gtt_offset,
726 batch->validation_list[i].offset);
727 bo->gtt_offset = batch->validation_list[i].offset;
728 }
729 }
730
731 if (ret == 0 && out_fence != NULL)
732 *out_fence = execbuf.rsvd2 >> 32;
733
734 return ret;
735 }
736
737 static int
738 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
739 {
740 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
741 struct intel_batchbuffer *batch = &brw->batch;
742 int ret = 0;
743
744 if (batch->use_shadow_copy) {
745 void *bo_map = brw_bo_map(brw, batch->batch.bo, MAP_WRITE);
746 memcpy(bo_map, batch->batch.map, 4 * USED_BATCH(*batch));
747
748 bo_map = brw_bo_map(brw, batch->state.bo, MAP_WRITE);
749 memcpy(bo_map, batch->state.map, batch->state_used);
750 }
751
752 brw_bo_unmap(batch->batch.bo);
753 brw_bo_unmap(batch->state.bo);
754
755 if (!brw->screen->no_hw) {
756 /* The requirements for using I915_EXEC_NO_RELOC are:
757 *
758 * The addresses written in the objects must match the corresponding
759 * reloc.gtt_offset which in turn must match the corresponding
760 * execobject.offset.
761 *
762 * Any render targets written to in the batch must be flagged with
763 * EXEC_OBJECT_WRITE.
764 *
765 * To avoid stalling, execobject.offset should match the current
766 * address of that object within the active context.
767 */
768 int flags = I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
769
770 if (batch->needs_sol_reset)
771 flags |= I915_EXEC_GEN7_SOL_RESET;
772
773 /* Set statebuffer relocations */
774 const unsigned state_index = batch->state.bo->index;
775 if (state_index < batch->exec_count &&
776 batch->exec_bos[state_index] == batch->state.bo) {
777 struct drm_i915_gem_exec_object2 *entry =
778 &batch->validation_list[state_index];
779 assert(entry->handle == batch->state.bo->gem_handle);
780 entry->relocation_count = batch->state_relocs.reloc_count;
781 entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
782 }
783
784 /* Set batchbuffer relocations */
785 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
786 assert(entry->handle == batch->batch.bo->gem_handle);
787 entry->relocation_count = batch->batch_relocs.reloc_count;
788 entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;
789
790 if (batch->use_batch_first) {
791 flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
792 } else {
793 /* Move the batch to the end of the validation list */
794 struct drm_i915_gem_exec_object2 tmp;
795 struct brw_bo *tmp_bo;
796 const unsigned index = batch->exec_count - 1;
797
798 tmp = *entry;
799 *entry = batch->validation_list[index];
800 batch->validation_list[index] = tmp;
801
802 tmp_bo = batch->exec_bos[0];
803 batch->exec_bos[0] = batch->exec_bos[index];
804 batch->exec_bos[index] = tmp_bo;
805 }
806
807 ret = execbuffer(dri_screen->fd, batch, brw->hw_ctx,
808 4 * USED_BATCH(*batch),
809 in_fence_fd, out_fence_fd, flags);
810
811 throttle(brw);
812 }
813
814 if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
815 gen_print_batch(&batch->decoder, batch->batch.map,
816 4 * USED_BATCH(*batch),
817 batch->batch.bo->gtt_offset);
818 }
819
820 if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
821 brw_check_for_reset(brw);
822
823 if (ret != 0) {
824 fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
825 strerror(-ret));
826 exit(1);
827 }
828
829 return ret;
830 }
831
832 /**
833 * The in_fence_fd is ignored if -1. Otherwise this function takes ownership
834 * of the fd.
835 *
836 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
837 * of the returned fd.
838 */
839 int
840 _intel_batchbuffer_flush_fence(struct brw_context *brw,
841 int in_fence_fd, int *out_fence_fd,
842 const char *file, int line)
843 {
844 int ret;
845
846 if (USED_BATCH(brw->batch) == 0)
847 return 0;
848
849 /* Check that we didn't just wrap our batchbuffer at a bad time. */
850 assert(!brw->batch.no_wrap);
851
852 brw_finish_batch(brw);
853 brw_upload_finish(&brw->upload);
854
855 finish_growing_bos(&brw->batch.batch);
856 finish_growing_bos(&brw->batch.state);
857
858 if (brw->throttle_batch[0] == NULL) {
859 brw->throttle_batch[0] = brw->batch.batch.bo;
860 brw_bo_reference(brw->throttle_batch[0]);
861 }
862
863 if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) {
864 int bytes_for_commands = 4 * USED_BATCH(brw->batch);
865 int bytes_for_state = brw->batch.state_used;
866 fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
867 " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
868 " %4d batch relocs, %4d state relocs\n", file, line,
869 bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
870 bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
871 brw->batch.exec_count,
872 (float) brw->batch.aperture_space / (1024 * 1024),
873 brw->batch.batch_relocs.reloc_count,
874 brw->batch.state_relocs.reloc_count);
875
876 dump_validation_list(&brw->batch);
877 }
878
879 ret = submit_batch(brw, in_fence_fd, out_fence_fd);
880
881 if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
882 fprintf(stderr, "waiting for idle\n");
883 brw_bo_wait_rendering(brw->batch.batch.bo);
884 }
885
886 /* Start a new batch buffer. */
887 brw_new_batch(brw);
888
889 return ret;
890 }
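
/* Hedged caller-side sketch (assumed usage, not part of this file): sync
 * object code can submit the pending batch and collect a sync-file fd for
 * its completion.
 *
 *    int fence_fd = -1;
 *    if (_intel_batchbuffer_flush_fence(brw, -1, &fence_fd,
 *                                       __FILE__, __LINE__) == 0) {
 *       // fence_fd now belongs to the caller: poll it, hand it to another
 *       // driver, and eventually close() it.
 *    }
 *
 * Passing an in_fence_fd instead makes the kernel wait on that fence before
 * executing the batch (I915_EXEC_FENCE_IN in execbuffer() above).
 */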
891
892 bool
893 brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
894 {
895 return brw->batch.aperture_space + extra_space <=
896 brw->screen->aperture_threshold;
897 }
898
899 bool
900 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
901 {
902 unsigned index = READ_ONCE(bo->index);
903 if (index < batch->exec_count && batch->exec_bos[index] == bo)
904 return true;
905
906 for (int i = 0; i < batch->exec_count; i++) {
907 if (batch->exec_bos[i] == bo)
908 return true;
909 }
910 return false;
911 }
912
913 /* This is the only way buffers get added to the validation list.
914 */
915 static uint64_t
916 emit_reloc(struct intel_batchbuffer *batch,
917 struct brw_reloc_list *rlist, uint32_t offset,
918 struct brw_bo *target, int32_t target_offset,
919 unsigned int reloc_flags)
920 {
921 assert(target != NULL);
922
923 if (target->kflags & EXEC_OBJECT_PINNED) {
924 brw_use_pinned_bo(batch, target, reloc_flags & RELOC_WRITE);
925 return target->gtt_offset + target_offset;
926 }
927
928 unsigned int index = add_exec_bo(batch, target);
929 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
930
931 if (rlist->reloc_count == rlist->reloc_array_size) {
932 rlist->reloc_array_size *= 2;
933 rlist->relocs = realloc(rlist->relocs,
934 rlist->reloc_array_size *
935 sizeof(struct drm_i915_gem_relocation_entry));
936 }
937
938 if (reloc_flags & RELOC_32BIT) {
939 /* Restrict this buffer to the low 32 bits of the address space.
940 *
941 * Altering the validation list flags restricts it for this batch,
942 * but we also alter the BO's kflags to restrict it permanently
943 * (until the BO is destroyed and put back in the cache). Buffers
944 * may stay bound across batches, and we want to keep it constrained.
945 */
946 target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
947 entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
948
949 /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
950 reloc_flags &= ~RELOC_32BIT;
951 }
952
953 if (reloc_flags)
954 entry->flags |= reloc_flags & batch->valid_reloc_flags;
955
956 rlist->relocs[rlist->reloc_count++] =
957 (struct drm_i915_gem_relocation_entry) {
958 .offset = offset,
959 .delta = target_offset,
960 .target_handle = batch->use_batch_first ? index : target->gem_handle,
961 .presumed_offset = entry->offset,
962 };
963
964 /* Using the old buffer offset, write in what the right data would be, in
965 * case the buffer doesn't move and we can short-circuit the relocation
966 * processing in the kernel
967 */
968 return entry->offset + target_offset;
969 }
970
971 void
972 brw_use_pinned_bo(struct intel_batchbuffer *batch, struct brw_bo *bo,
973 unsigned writable_flag)
974 {
975 assert(bo->kflags & EXEC_OBJECT_PINNED);
976 assert((writable_flag & ~EXEC_OBJECT_WRITE) == 0);
977
978 unsigned int index = add_exec_bo(batch, bo);
979 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
980 assert(entry->offset == bo->gtt_offset);
981
982 if (writable_flag)
983 entry->flags |= EXEC_OBJECT_WRITE;
984 }
985
986 uint64_t
987 brw_batch_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
988 struct brw_bo *target, uint32_t target_offset,
989 unsigned int reloc_flags)
990 {
991 assert(batch_offset <= batch->batch.bo->size - sizeof(uint32_t));
992
993 return emit_reloc(batch, &batch->batch_relocs, batch_offset,
994 target, target_offset, reloc_flags);
995 }
996
997 uint64_t
998 brw_state_reloc(struct intel_batchbuffer *batch, uint32_t state_offset,
999 struct brw_bo *target, uint32_t target_offset,
1000 unsigned int reloc_flags)
1001 {
1002 assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
1003
1004 return emit_reloc(batch, &batch->state_relocs, state_offset,
1005 target, target_offset, reloc_flags);
1006 }
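
/* Hedged sketch of how the returned presumed address is meant to be used
 * (the OUT_RELOC64-style macros are assumed to expand to roughly this):
 *
 *    uint64_t addr = brw_batch_reloc(&brw->batch,
 *                                    4 * USED_BATCH(brw->batch),
 *                                    target_bo, delta, RELOC_WRITE);
 *    *brw->batch.map_next++ = addr;         // low 32 bits
 *    *brw->batch.map_next++ = addr >> 32;   // high 32 bits
 *
 * If the kernel leaves target_bo at the presumed offset recorded in the
 * validation list, the address we wrote is already correct and relocation
 * processing is skipped (the I915_EXEC_NO_RELOC contract in submit_batch()).
 */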
1007
1008 /**
1009 * Reserve some space in the statebuffer, or flush.
1010 *
1011 * This is used to estimate when we're near the end of the batch,
1012 * so we can flush early.
1013 */
1014 void
1015 brw_require_statebuffer_space(struct brw_context *brw, int size)
1016 {
1017 if (brw->batch.state_used + size >= STATE_SZ)
1018 intel_batchbuffer_flush(brw);
1019 }
1020
1021 /**
1022 * Allocates a block of space in the batchbuffer for indirect state.
1023 */
1024 void *
1025 brw_state_batch(struct brw_context *brw,
1026 int size,
1027 int alignment,
1028 uint32_t *out_offset)
1029 {
1030 struct intel_batchbuffer *batch = &brw->batch;
1031
1032 assert(size < batch->state.bo->size);
1033
1034 uint32_t offset = ALIGN(batch->state_used, alignment);
1035
1036 if (offset + size >= STATE_SZ && !batch->no_wrap) {
1037 intel_batchbuffer_flush(brw);
1038 offset = ALIGN(batch->state_used, alignment);
1039 } else if (offset + size >= batch->state.bo->size) {
1040 const unsigned new_size =
1041 MIN2(batch->state.bo->size + batch->state.bo->size / 2,
1042 MAX_STATE_SIZE);
1043 grow_buffer(brw, &batch->state, batch->state_used, new_size);
1044 assert(offset + size < batch->state.bo->size);
1045 }
1046
1047 if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
1048 _mesa_hash_table_insert(batch->state_batch_sizes,
1049 (void *) (uintptr_t) offset,
1050 (void *) (uintptr_t) size);
1051 }
1052
1053 batch->state_used = offset + size;
1054
1055 *out_offset = offset;
1056 return batch->state.map + (offset >> 2);
1057 }
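
/* Hedged usage sketch (packet and field contents are placeholders): callers
 * allocate a chunk of indirect state, fill it through the returned map, and
 * reference it by its offset from STATE_BASE_ADDRESS:
 *
 *    uint32_t blend_offset;
 *    uint32_t *blend = brw_state_batch(brw, 2 * 4, 64, &blend_offset);
 *    blend[0] = 0;   // hypothetical packed state dwords
 *    blend[1] = 0;
 *
 *    BEGIN_BATCH(2);
 *    OUT_BATCH(_3DSTATE_SOME_POINTERS << 16 | (2 - 2));   // hypothetical packet
 *    OUT_BATCH(blend_offset);
 *    ADVANCE_BATCH();
 *
 * If the state itself must point at another BO, brw_state_reloc() is called
 * with blend_offset plus the dword's byte offset as the relocation location.
 */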
1058
1059 void
1060 intel_batchbuffer_data(struct brw_context *brw,
1061 const void *data, GLuint bytes)
1062 {
1063 assert((bytes & 3) == 0);
1064 intel_batchbuffer_require_space(brw, bytes);
1065 memcpy(brw->batch.map_next, data, bytes);
1066 brw->batch.map_next += bytes >> 2;
1067 }
1068
1069 static void
1070 load_sized_register_mem(struct brw_context *brw,
1071 uint32_t reg,
1072 struct brw_bo *bo,
1073 uint32_t offset,
1074 int size)
1075 {
1076 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1077 int i;
1078
1079 /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
1080 assert(devinfo->gen >= 7);
1081
1082 if (devinfo->gen >= 8) {
1083 BEGIN_BATCH(4 * size);
1084 for (i = 0; i < size; i++) {
1085 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
1086 OUT_BATCH(reg + i * 4);
1087 OUT_RELOC64(bo, 0, offset + i * 4);
1088 }
1089 ADVANCE_BATCH();
1090 } else {
1091 BEGIN_BATCH(3 * size);
1092 for (i = 0; i < size; i++) {
1093 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
1094 OUT_BATCH(reg + i * 4);
1095 OUT_RELOC(bo, 0, offset + i * 4);
1096 }
1097 ADVANCE_BATCH();
1098 }
1099 }
1100
1101 void
1102 brw_load_register_mem(struct brw_context *brw,
1103 uint32_t reg,
1104 struct brw_bo *bo,
1105 uint32_t offset)
1106 {
1107 load_sized_register_mem(brw, reg, bo, offset, 1);
1108 }
1109
1110 void
1111 brw_load_register_mem64(struct brw_context *brw,
1112 uint32_t reg,
1113 struct brw_bo *bo,
1114 uint32_t offset)
1115 {
1116 load_sized_register_mem(brw, reg, bo, offset, 2);
1117 }
1118
1119 /*
1120 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
1121 */
1122 void
1123 brw_store_register_mem32(struct brw_context *brw,
1124 struct brw_bo *bo, uint32_t reg, uint32_t offset)
1125 {
1126 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1127
1128 assert(devinfo->gen >= 6);
1129
1130 if (devinfo->gen >= 8) {
1131 BEGIN_BATCH(4);
1132 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1133 OUT_BATCH(reg);
1134 OUT_RELOC64(bo, RELOC_WRITE, offset);
1135 ADVANCE_BATCH();
1136 } else {
1137 BEGIN_BATCH(3);
1138 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1139 OUT_BATCH(reg);
1140 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1141 ADVANCE_BATCH();
1142 }
1143 }
1144
1145 /*
1146 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
1147 */
1148 void
1149 brw_store_register_mem64(struct brw_context *brw,
1150 struct brw_bo *bo, uint32_t reg, uint32_t offset)
1151 {
1152 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1153
1154 assert(devinfo->gen >= 6);
1155
1156 /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
1157 * read a full 64-bit register, we need to do two of them.
1158 */
1159 if (devinfo->gen >= 8) {
1160 BEGIN_BATCH(8);
1161 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1162 OUT_BATCH(reg);
1163 OUT_RELOC64(bo, RELOC_WRITE, offset);
1164 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1165 OUT_BATCH(reg + sizeof(uint32_t));
1166 OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
1167 ADVANCE_BATCH();
1168 } else {
1169 BEGIN_BATCH(6);
1170 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1171 OUT_BATCH(reg);
1172 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1173 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1174 OUT_BATCH(reg + sizeof(uint32_t));
1175 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
1176 ADVANCE_BATCH();
1177 }
1178 }
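
/* Hedged example (assumed caller, hypothetical register offset): query code
 * can snapshot a 64-bit pipeline-statistics counter into a result BO with
 * this helper.
 *
 *    #define HYPOTHETICAL_COUNTER_REG 0x2000   // placeholder MMIO offset
 *    brw_store_register_mem64(brw, query_bo, HYPOTHETICAL_COUNTER_REG,
 *                             idx * sizeof(uint64_t));
 *
 * This emits two MI_STORE_REGISTER_MEM packets, writing the low and high
 * dwords of the register at "offset" and "offset + 4" respectively.
 */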
1179
1180 /*
1181 * Write a 32-bit register using immediate data.
1182 */
1183 void
1184 brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
1185 {
1186 assert(brw->screen->devinfo.gen >= 6);
1187
1188 BEGIN_BATCH(3);
1189 OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
1190 OUT_BATCH(reg);
1191 OUT_BATCH(imm);
1192 ADVANCE_BATCH();
1193 }
1194
1195 /*
1196 * Write a 64-bit register using immediate data.
1197 */
1198 void
1199 brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
1200 {
1201 assert(brw->screen->devinfo.gen >= 6);
1202
1203 BEGIN_BATCH(5);
1204 OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
1205 OUT_BATCH(reg);
1206 OUT_BATCH(imm & 0xffffffff);
1207 OUT_BATCH(reg + 4);
1208 OUT_BATCH(imm >> 32);
1209 ADVANCE_BATCH();
1210 }
1211
1212 /*
1213 * Copies a 32-bit register.
1214 */
1215 void
1216 brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
1217 {
1218 assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
1219
1220 BEGIN_BATCH(3);
1221 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1222 OUT_BATCH(src);
1223 OUT_BATCH(dest);
1224 ADVANCE_BATCH();
1225 }
1226
1227 /*
1228 * Copies a 64-bit register.
1229 */
1230 void
1231 brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
1232 {
1233 assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
1234
1235 BEGIN_BATCH(6);
1236 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1237 OUT_BATCH(src);
1238 OUT_BATCH(dest);
1239 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1240 OUT_BATCH(src + sizeof(uint32_t));
1241 OUT_BATCH(dest + sizeof(uint32_t));
1242 ADVANCE_BATCH();
1243 }
1244
1245 /*
1246 * Write 32-bits of immediate data to a GPU memory buffer.
1247 */
1248 void
1249 brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
1250 uint32_t offset, uint32_t imm)
1251 {
1252 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1253
1254 assert(devinfo->gen >= 6);
1255
1256 BEGIN_BATCH(4);
1257 OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
1258 if (devinfo->gen >= 8)
1259 OUT_RELOC64(bo, RELOC_WRITE, offset);
1260 else {
1261 OUT_BATCH(0); /* MBZ */
1262 OUT_RELOC(bo, RELOC_WRITE, offset);
1263 }
1264 OUT_BATCH(imm);
1265 ADVANCE_BATCH();
1266 }
1267
1268 /*
1269 * Write 64-bits of immediate data to a GPU memory buffer.
1270 */
1271 void
1272 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1273 uint32_t offset, uint64_t imm)
1274 {
1275 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1276
1277 assert(devinfo->gen >= 6);
1278
1279 BEGIN_BATCH(5);
1280 OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1281 if (devinfo->gen >= 8)
1282 OUT_RELOC64(bo, RELOC_WRITE, offset);
1283 else {
1284 OUT_BATCH(0); /* MBZ */
1285 OUT_RELOC(bo, RELOC_WRITE, offset);
1286 }
1287 OUT_BATCH(imm & 0xffffffffu);
1288 OUT_BATCH(imm >> 32);
1289 ADVANCE_BATCH();
1290 }