2 * Copyright 2006 VMware, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 #include "intel_batchbuffer.h"
27 #include "intel_buffer_objects.h"
28 #include "brw_bufmgr.h"
29 #include "intel_buffers.h"
30 #include "intel_fbo.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_state.h"
34 #include "common/gen_decoder.h"
36 #include "util/hash_table.h"
41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
44 * Target sizes of the batch and state buffers. We create the initial
45 * buffers at these sizes, and flush when they're nearly full. If we
46 * underestimate how close we are to the end, and suddenly need more space
47 * in the middle of a draw, we can grow the buffers, and finish the draw.
48 * At that point, we'll be over our target size, so the next operation
49 * should flush. Each time we flush the batch, we recreate both buffers
50 * at the original target size, so it doesn't grow without bound.
52 #define BATCH_SZ (20 * 1024)
53 #define STATE_SZ (16 * 1024)
56 intel_batchbuffer_reset(struct brw_context
*brw
);
59 dump_validation_list(struct intel_batchbuffer
*batch
)
61 fprintf(stderr
, "Validation list (length %d):\n", batch
->exec_count
);
63 for (int i
= 0; i
< batch
->exec_count
; i
++) {
64 uint64_t flags
= batch
->validation_list
[i
].flags
;
65 assert(batch
->validation_list
[i
].handle
==
66 batch
->exec_bos
[i
]->gem_handle
);
67 fprintf(stderr
, "[%2d]: %2d %-14s %p %s%-7s @ 0x%016llx%s (%"PRIu64
"B)\n",
69 batch
->validation_list
[i
].handle
,
70 batch
->exec_bos
[i
]->name
,
72 (flags
& EXEC_OBJECT_SUPPORTS_48B_ADDRESS
) ? "(48b" : "(32b",
73 (flags
& EXEC_OBJECT_WRITE
) ? " write)" : ")",
74 batch
->validation_list
[i
].offset
,
75 (flags
& EXEC_OBJECT_PINNED
) ? " (pinned)" : "",
76 batch
->exec_bos
[i
]->size
);
/* Hash-table key equality callback: keys are integer offsets stored directly
 * in the pointer value, so equality is plain pointer identity.
 * NOTE(review): the function body was not visible in this chunk; reconstructed
 * as identity comparison to match uint_key_hash below — confirm upstream.
 */
static bool
uint_key_compare(const void *a, const void *b)
{
   return a == b;
}
/* Hash-table hash callback: keys are integer offsets stored directly in the
 * pointer value, so the (truncated) pointer bits themselves are the hash.
 */
static uint32_t
uint_key_hash(const void *key)
{
   return (uintptr_t) key;
}
93 init_reloc_list(struct brw_reloc_list
*rlist
, int count
)
95 rlist
->reloc_count
= 0;
96 rlist
->reloc_array_size
= count
;
97 rlist
->relocs
= malloc(rlist
->reloc_array_size
*
98 sizeof(struct drm_i915_gem_relocation_entry
));
102 intel_batchbuffer_init(struct brw_context
*brw
)
104 struct intel_screen
*screen
= brw
->screen
;
105 struct intel_batchbuffer
*batch
= &brw
->batch
;
106 const struct gen_device_info
*devinfo
= &screen
->devinfo
;
108 batch
->use_shadow_copy
= !devinfo
->has_llc
;
110 if (batch
->use_shadow_copy
) {
111 batch
->batch
.map
= malloc(BATCH_SZ
);
112 batch
->map_next
= batch
->batch
.map
;
113 batch
->state
.map
= malloc(STATE_SZ
);
116 init_reloc_list(&batch
->batch_relocs
, 250);
117 init_reloc_list(&batch
->state_relocs
, 250);
119 batch
->exec_count
= 0;
120 batch
->exec_array_size
= 100;
122 malloc(batch
->exec_array_size
* sizeof(batch
->exec_bos
[0]));
123 batch
->validation_list
=
124 malloc(batch
->exec_array_size
* sizeof(batch
->validation_list
[0]));
126 if (INTEL_DEBUG
& DEBUG_BATCH
) {
127 batch
->state_batch_sizes
=
128 _mesa_hash_table_create(NULL
, uint_key_hash
, uint_key_compare
);
131 batch
->use_batch_first
=
132 screen
->kernel_features
& KERNEL_ALLOWS_EXEC_BATCH_FIRST
;
134 /* PIPE_CONTROL needs a w/a but only on gen6 */
135 batch
->valid_reloc_flags
= EXEC_OBJECT_WRITE
;
136 if (devinfo
->gen
== 6)
137 batch
->valid_reloc_flags
|= EXEC_OBJECT_NEEDS_GTT
;
139 intel_batchbuffer_reset(brw
);
142 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
145 add_exec_bo(struct intel_batchbuffer
*batch
, struct brw_bo
*bo
)
147 unsigned index
= READ_ONCE(bo
->index
);
149 if (index
< batch
->exec_count
&& batch
->exec_bos
[index
] == bo
)
152 /* May have been shared between multiple active batches */
153 for (index
= 0; index
< batch
->exec_count
; index
++) {
154 if (batch
->exec_bos
[index
] == bo
)
158 brw_bo_reference(bo
);
160 if (batch
->exec_count
== batch
->exec_array_size
) {
161 batch
->exec_array_size
*= 2;
163 realloc(batch
->exec_bos
,
164 batch
->exec_array_size
* sizeof(batch
->exec_bos
[0]));
165 batch
->validation_list
=
166 realloc(batch
->validation_list
,
167 batch
->exec_array_size
* sizeof(batch
->validation_list
[0]));
170 batch
->validation_list
[batch
->exec_count
] =
171 (struct drm_i915_gem_exec_object2
) {
172 .handle
= bo
->gem_handle
,
173 .offset
= bo
->gtt_offset
,
177 bo
->index
= batch
->exec_count
;
178 batch
->exec_bos
[batch
->exec_count
] = bo
;
179 batch
->aperture_space
+= bo
->size
;
181 return batch
->exec_count
++;
185 recreate_growing_buffer(struct brw_context
*brw
,
186 struct brw_growing_bo
*grow
,
187 const char *name
, unsigned size
)
189 struct intel_screen
*screen
= brw
->screen
;
190 struct intel_batchbuffer
*batch
= &brw
->batch
;
191 struct brw_bufmgr
*bufmgr
= screen
->bufmgr
;
193 grow
->bo
= brw_bo_alloc(bufmgr
, name
, size
);
194 grow
->bo
->kflags
= can_do_exec_capture(screen
) ? EXEC_OBJECT_CAPTURE
: 0;
195 grow
->partial_bo
= NULL
;
196 grow
->partial_bo_map
= NULL
;
197 grow
->partial_bytes
= 0;
199 if (!batch
->use_shadow_copy
)
200 grow
->map
= brw_bo_map(brw
, grow
->bo
, MAP_READ
| MAP_WRITE
);
204 intel_batchbuffer_reset(struct brw_context
*brw
)
206 struct intel_batchbuffer
*batch
= &brw
->batch
;
208 if (batch
->last_bo
!= NULL
) {
209 brw_bo_unreference(batch
->last_bo
);
210 batch
->last_bo
= NULL
;
212 batch
->last_bo
= batch
->batch
.bo
;
214 recreate_growing_buffer(brw
, &batch
->batch
, "batchbuffer", BATCH_SZ
);
215 batch
->map_next
= batch
->batch
.map
;
217 recreate_growing_buffer(brw
, &batch
->state
, "statebuffer", STATE_SZ
);
219 /* Avoid making 0 a valid state offset - otherwise the decoder will try
220 * and decode data when we use offset 0 as a null pointer.
222 batch
->state_used
= 1;
224 add_exec_bo(batch
, batch
->batch
.bo
);
225 assert(batch
->batch
.bo
->index
== 0);
227 batch
->needs_sol_reset
= false;
228 batch
->state_base_address_emitted
= false;
230 /* We don't know what ring the new batch will be sent to until we see the
231 * first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown.
233 batch
->ring
= UNKNOWN_RING
;
235 if (batch
->state_batch_sizes
)
236 _mesa_hash_table_clear(batch
->state_batch_sizes
, NULL
);
/**
 * Reset the batch and also clear the render-cache tracking sets, since a new
 * batch implies all previously tracked cache contents are stale.
 */
void
intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);
   brw_cache_sets_clear(brw);
}
247 intel_batchbuffer_save_state(struct brw_context
*brw
)
249 brw
->batch
.saved
.map_next
= brw
->batch
.map_next
;
250 brw
->batch
.saved
.batch_reloc_count
= brw
->batch
.batch_relocs
.reloc_count
;
251 brw
->batch
.saved
.state_reloc_count
= brw
->batch
.state_relocs
.reloc_count
;
252 brw
->batch
.saved
.exec_count
= brw
->batch
.exec_count
;
256 intel_batchbuffer_reset_to_saved(struct brw_context
*brw
)
258 for (int i
= brw
->batch
.saved
.exec_count
;
259 i
< brw
->batch
.exec_count
; i
++) {
260 brw_bo_unreference(brw
->batch
.exec_bos
[i
]);
262 brw
->batch
.batch_relocs
.reloc_count
= brw
->batch
.saved
.batch_reloc_count
;
263 brw
->batch
.state_relocs
.reloc_count
= brw
->batch
.saved
.state_reloc_count
;
264 brw
->batch
.exec_count
= brw
->batch
.saved
.exec_count
;
266 brw
->batch
.map_next
= brw
->batch
.saved
.map_next
;
267 if (USED_BATCH(brw
->batch
) == 0)
268 brw
->batch
.ring
= UNKNOWN_RING
;
272 intel_batchbuffer_free(struct intel_batchbuffer
*batch
)
274 if (batch
->use_shadow_copy
) {
275 free(batch
->batch
.map
);
276 free(batch
->state
.map
);
279 for (int i
= 0; i
< batch
->exec_count
; i
++) {
280 brw_bo_unreference(batch
->exec_bos
[i
]);
282 free(batch
->batch_relocs
.relocs
);
283 free(batch
->state_relocs
.relocs
);
284 free(batch
->exec_bos
);
285 free(batch
->validation_list
);
287 brw_bo_unreference(batch
->last_bo
);
288 brw_bo_unreference(batch
->batch
.bo
);
289 brw_bo_unreference(batch
->state
.bo
);
290 if (batch
->state_batch_sizes
)
291 _mesa_hash_table_destroy(batch
->state_batch_sizes
, NULL
);
295 * Finish copying the old batch/state buffer's contents to the new one
296 * after we tried to "grow" the buffer in an earlier operation.
299 finish_growing_bos(struct brw_growing_bo
*grow
)
301 struct brw_bo
*old_bo
= grow
->partial_bo
;
305 memcpy(grow
->map
, grow
->partial_bo_map
, grow
->partial_bytes
);
307 grow
->partial_bo
= NULL
;
308 grow
->partial_bo_map
= NULL
;
309 grow
->partial_bytes
= 0;
311 brw_bo_unreference(old_bo
);
315 replace_bo_in_reloc_list(struct brw_reloc_list
*rlist
,
316 uint32_t old_handle
, uint32_t new_handle
)
318 for (int i
= 0; i
< rlist
->reloc_count
; i
++) {
319 if (rlist
->relocs
[i
].target_handle
== old_handle
)
320 rlist
->relocs
[i
].target_handle
= new_handle
;
325 * Grow either the batch or state buffer to a new larger size.
327 * We can't actually grow buffers, so we allocate a new one, copy over
328 * the existing contents, and update our lists to refer to the new one.
330 * Note that this is only temporary - each new batch recreates the buffers
331 * at their original target size (BATCH_SZ or STATE_SZ).
334 grow_buffer(struct brw_context
*brw
,
335 struct brw_growing_bo
*grow
,
336 unsigned existing_bytes
,
339 struct intel_batchbuffer
*batch
= &brw
->batch
;
340 struct brw_bufmgr
*bufmgr
= brw
->bufmgr
;
341 struct brw_bo
*bo
= grow
->bo
;
343 perf_debug("Growing %s - ran out of space\n", bo
->name
);
345 if (grow
->partial_bo
) {
346 /* We've already grown once, and now we need to do it again.
347 * Finish our last grow operation so we can start a new one.
348 * This should basically never happen.
350 perf_debug("Had to grow multiple times");
351 finish_growing_bos(grow
);
354 struct brw_bo
*new_bo
= brw_bo_alloc(bufmgr
, bo
->name
, new_size
);
356 /* Copy existing data to the new larger buffer */
357 grow
->partial_bo_map
= grow
->map
;
359 if (batch
->use_shadow_copy
) {
360 /* We can't safely use realloc, as it may move the existing buffer,
361 * breaking existing pointers the caller may still be using. Just
362 * malloc a new copy and memcpy it like the normal BO path.
364 grow
->map
= malloc(new_size
);
366 grow
->map
= brw_bo_map(brw
, new_bo
, MAP_READ
| MAP_WRITE
);
369 /* Try to put the new BO at the same GTT offset as the old BO (which
370 * we're throwing away, so it doesn't need to be there).
372 * This guarantees that our relocations continue to work: values we've
373 * already written into the buffer, values we're going to write into the
374 * buffer, and the validation/relocation lists all will match.
376 * Also preserve kflags for EXEC_OBJECT_CAPTURE.
378 new_bo
->gtt_offset
= bo
->gtt_offset
;
379 new_bo
->index
= bo
->index
;
380 new_bo
->kflags
= bo
->kflags
;
382 /* Batch/state buffers are per-context, and if we've run out of space,
383 * we must have actually used them before, so...they will be in the list.
385 assert(bo
->index
< batch
->exec_count
);
386 assert(batch
->exec_bos
[bo
->index
] == bo
);
388 /* Update the validation list to use the new BO. */
389 batch
->validation_list
[bo
->index
].handle
= new_bo
->gem_handle
;
391 if (!batch
->use_batch_first
) {
392 /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
393 * update the relocation list entries to point at the new BO as well.
394 * (With newer kernels, the "handle" is an offset into the validation
395 * list, which remains unchanged, so we can skip this.)
397 replace_bo_in_reloc_list(&batch
->batch_relocs
,
398 bo
->gem_handle
, new_bo
->gem_handle
);
399 replace_bo_in_reloc_list(&batch
->state_relocs
,
400 bo
->gem_handle
, new_bo
->gem_handle
);
403 /* Exchange the two BOs...without breaking pointers to the old BO.
405 * Consider this scenario:
407 * 1. Somebody calls brw_state_batch() to get a region of memory, and
408 * and then creates a brw_address pointing to brw->batch.state.bo.
409 * 2. They then call brw_state_batch() a second time, which happens to
410 * grow and replace the state buffer. They then try to emit a
411 * relocation to their first section of memory.
413 * If we replace the brw->batch.state.bo pointer at step 2, we would
414 * break the address created in step 1. They'd have a pointer to the
415 * old destroyed BO. Emitting a relocation would add this dead BO to
416 * the validation list...causing /both/ statebuffers to be in the list,
417 * and all kinds of disasters.
419 * This is not a contrived case - BLORP vertex data upload hits this.
421 * There are worse scenarios too. Fences for GL sync objects reference
422 * brw->batch.batch.bo. If we replaced the batch pointer when growing,
423 * we'd need to chase down every fence and update it to point to the
424 * new BO. Otherwise, it would refer to a "batch" that never actually
425 * gets submitted, and would fail to trigger.
427 * To work around both of these issues, we transmutate the buffers in
428 * place, making the existing struct brw_bo represent the new buffer,
429 * and "new_bo" represent the old BO. This is highly unusual, but it
430 * seems like a necessary evil.
432 * We also defer the memcpy of the existing batch's contents. Callers
433 * may make multiple brw_state_batch calls, and retain pointers to the
434 * old BO's map. We'll perform the memcpy in finish_growing_bo() when
435 * we finally submit the batch, at which point we've finished uploading
436 * state, and nobody should have any old references anymore.
438 * To do that, we keep a reference to the old BO in grow->partial_bo,
439 * and store the number of bytes to copy in grow->partial_bytes. We
440 * can monkey with the refcounts directly without atomics because these
441 * are per-context BOs and they can only be touched by this thread.
443 assert(new_bo
->refcount
== 1);
444 new_bo
->refcount
= bo
->refcount
;
448 memcpy(&tmp
, bo
, sizeof(struct brw_bo
));
449 memcpy(bo
, new_bo
, sizeof(struct brw_bo
));
450 memcpy(new_bo
, &tmp
, sizeof(struct brw_bo
));
452 grow
->partial_bo
= new_bo
; /* the one reference of the OLD bo */
453 grow
->partial_bytes
= existing_bytes
;
457 intel_batchbuffer_require_space(struct brw_context
*brw
, GLuint sz
,
458 enum brw_gpu_ring ring
)
460 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
461 struct intel_batchbuffer
*batch
= &brw
->batch
;
463 /* If we're switching rings, implicitly flush the batch. */
464 if (unlikely(ring
!= brw
->batch
.ring
) && brw
->batch
.ring
!= UNKNOWN_RING
&&
466 intel_batchbuffer_flush(brw
);
469 const unsigned batch_used
= USED_BATCH(*batch
) * 4;
470 if (batch_used
+ sz
>= BATCH_SZ
&& !batch
->no_wrap
) {
471 intel_batchbuffer_flush(brw
);
472 } else if (batch_used
+ sz
>= batch
->batch
.bo
->size
) {
473 const unsigned new_size
=
474 MIN2(batch
->batch
.bo
->size
+ batch
->batch
.bo
->size
/ 2,
476 grow_buffer(brw
, &batch
->batch
, batch_used
, new_size
);
477 batch
->map_next
= (void *) batch
->batch
.map
+ batch_used
;
478 assert(batch_used
+ sz
< batch
->batch
.bo
->size
);
481 /* The intel_batchbuffer_flush() calls above might have changed
482 * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
484 brw
->batch
.ring
= ring
;
489 #define BLUE_HEADER CSI "0;44m"
490 #define NORMAL CSI "0m"
/**
 * Decode and print one fixed-function state structure located at
 * \p offset bytes into the mapped statebuffer \p data.
 *
 * Silently does nothing if the genxml spec has no such struct.
 */
static void
decode_struct(struct brw_context *brw, struct gen_spec *spec,
              const char *struct_name, uint32_t *data,
              uint32_t gtt_offset, uint32_t offset, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   fprintf(stderr, "%s\n", struct_name);
   gen_print_group(stderr, group, gtt_offset + offset,
                   &data[offset / 4], 0, color);
}
/**
 * Decode and print an array of state structures of \p struct_size bytes
 * each, starting at \p offset into the mapped statebuffer.  The element
 * count is recovered from the recorded brw_state_batch allocation size.
 */
static void
decode_structs(struct brw_context *brw, struct gen_spec *spec,
               const char *struct_name,
               uint32_t *data, uint32_t gtt_offset, uint32_t offset,
               int struct_size, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   int entries = brw_state_batch_size(brw, offset) / struct_size;
   for (int i = 0; i < entries; i++) {
      fprintf(stderr, "%s %d\n", struct_name, i);
      gen_print_group(stderr, group, gtt_offset + offset,
                      &data[(offset + i * struct_size) / 4], 0, color);
   }
}
526 do_batch_dump(struct brw_context
*brw
)
528 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
529 struct intel_batchbuffer
*batch
= &brw
->batch
;
530 struct gen_spec
*spec
= gen_spec_load(&brw
->screen
->devinfo
);
532 if (batch
->ring
!= RENDER_RING
)
535 uint32_t *batch_data
= brw_bo_map(brw
, batch
->batch
.bo
, MAP_READ
);
536 uint32_t *state
= brw_bo_map(brw
, batch
->state
.bo
, MAP_READ
);
537 if (batch_data
== NULL
|| state
== NULL
) {
538 fprintf(stderr
, "WARNING: failed to map batchbuffer/statebuffer\n");
542 uint32_t *end
= batch_data
+ USED_BATCH(*batch
);
543 uint32_t batch_gtt_offset
= batch
->batch
.bo
->gtt_offset
;
544 uint32_t state_gtt_offset
= batch
->state
.bo
->gtt_offset
;
547 bool color
= INTEL_DEBUG
& DEBUG_COLOR
;
548 const char *header_color
= color
? BLUE_HEADER
: "";
549 const char *reset_color
= color
? NORMAL
: "";
551 for (uint32_t *p
= batch_data
; p
< end
; p
+= length
) {
552 struct gen_group
*inst
= gen_spec_find_instruction(spec
, p
);
553 length
= gen_group_get_length(inst
, p
);
554 assert(inst
== NULL
|| length
> 0);
555 length
= MAX2(1, length
);
557 fprintf(stderr
, "unknown instruction %08x\n", p
[0]);
561 uint64_t offset
= batch_gtt_offset
+ 4 * (p
- batch_data
);
563 fprintf(stderr
, "%s0x%08"PRIx64
": 0x%08x: %-80s%s\n", header_color
,
564 offset
, p
[0], gen_group_get_name(inst
), reset_color
);
566 gen_print_group(stderr
, inst
, offset
, p
, 0, color
);
568 switch (gen_group_get_opcode(inst
) >> 16) {
569 case _3DSTATE_PIPELINED_POINTERS
:
570 /* Note: these Gen4-5 pointers are full relocations rather than
571 * offsets from the start of the statebuffer. So we need to subtract
572 * gtt_offset (the start of the statebuffer) to obtain an offset we
573 * can add to the map and get at the data.
575 decode_struct(brw
, spec
, "VS_STATE", state
, state_gtt_offset
,
576 (p
[1] & ~0x1fu
) - state_gtt_offset
, color
);
578 decode_struct(brw
, spec
, "GS_STATE", state
, state_gtt_offset
,
579 (p
[2] & ~0x1fu
) - state_gtt_offset
, color
);
582 decode_struct(brw
, spec
, "CLIP_STATE", state
, state_gtt_offset
,
583 (p
[3] & ~0x1fu
) - state_gtt_offset
, color
);
585 decode_struct(brw
, spec
, "SF_STATE", state
, state_gtt_offset
,
586 (p
[4] & ~0x1fu
) - state_gtt_offset
, color
);
587 decode_struct(brw
, spec
, "WM_STATE", state
, state_gtt_offset
,
588 (p
[5] & ~0x1fu
) - state_gtt_offset
, color
);
589 decode_struct(brw
, spec
, "COLOR_CALC_STATE", state
, state_gtt_offset
,
590 (p
[6] & ~0x3fu
) - state_gtt_offset
, color
);
592 case _3DSTATE_BINDING_TABLE_POINTERS_VS
:
593 case _3DSTATE_BINDING_TABLE_POINTERS_HS
:
594 case _3DSTATE_BINDING_TABLE_POINTERS_DS
:
595 case _3DSTATE_BINDING_TABLE_POINTERS_GS
:
596 case _3DSTATE_BINDING_TABLE_POINTERS_PS
: {
597 struct gen_group
*group
=
598 gen_spec_find_struct(spec
, "RENDER_SURFACE_STATE");
602 uint32_t bt_offset
= p
[1] & ~0x1fu
;
603 int bt_entries
= brw_state_batch_size(brw
, bt_offset
) / 4;
604 uint32_t *bt_pointers
= &state
[bt_offset
/ 4];
605 for (int i
= 0; i
< bt_entries
; i
++) {
606 fprintf(stderr
, "SURFACE_STATE - BTI = %d\n", i
);
607 gen_print_group(stderr
, group
, state_gtt_offset
+ bt_pointers
[i
],
608 &state
[bt_pointers
[i
] / 4], 0, color
);
612 case _3DSTATE_SAMPLER_STATE_POINTERS_VS
:
613 case _3DSTATE_SAMPLER_STATE_POINTERS_HS
:
614 case _3DSTATE_SAMPLER_STATE_POINTERS_DS
:
615 case _3DSTATE_SAMPLER_STATE_POINTERS_GS
:
616 case _3DSTATE_SAMPLER_STATE_POINTERS_PS
:
617 decode_structs(brw
, spec
, "SAMPLER_STATE", state
,
618 state_gtt_offset
, p
[1] & ~0x1fu
, 4 * 4, color
);
620 case _3DSTATE_VIEWPORT_STATE_POINTERS
:
621 decode_structs(brw
, spec
, "CLIP_VIEWPORT", state
,
622 state_gtt_offset
, p
[1] & ~0x3fu
, 4 * 4, color
);
623 decode_structs(brw
, spec
, "SF_VIEWPORT", state
,
624 state_gtt_offset
, p
[1] & ~0x3fu
, 8 * 4, color
);
625 decode_structs(brw
, spec
, "CC_VIEWPORT", state
,
626 state_gtt_offset
, p
[3] & ~0x3fu
, 2 * 4, color
);
628 case _3DSTATE_VIEWPORT_STATE_POINTERS_CC
:
629 decode_structs(brw
, spec
, "CC_VIEWPORT", state
,
630 state_gtt_offset
, p
[1] & ~0x3fu
, 2 * 4, color
);
632 case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL
:
633 decode_structs(brw
, spec
, "SF_CLIP_VIEWPORT", state
,
634 state_gtt_offset
, p
[1] & ~0x3fu
, 16 * 4, color
);
636 case _3DSTATE_SCISSOR_STATE_POINTERS
:
637 decode_structs(brw
, spec
, "SCISSOR_RECT", state
,
638 state_gtt_offset
, p
[1] & ~0x1fu
, 2 * 4, color
);
640 case _3DSTATE_BLEND_STATE_POINTERS
:
641 /* TODO: handle Gen8+ extra dword at the beginning */
642 decode_structs(brw
, spec
, "BLEND_STATE", state
,
643 state_gtt_offset
, p
[1] & ~0x3fu
, 8 * 4, color
);
645 case _3DSTATE_CC_STATE_POINTERS
:
646 if (devinfo
->gen
>= 7) {
647 decode_struct(brw
, spec
, "COLOR_CALC_STATE", state
,
648 state_gtt_offset
, p
[1] & ~0x3fu
, color
);
649 } else if (devinfo
->gen
== 6) {
650 decode_structs(brw
, spec
, "BLEND_STATE", state
,
651 state_gtt_offset
, p
[1] & ~0x3fu
, 2 * 4, color
);
652 decode_struct(brw
, spec
, "DEPTH_STENCIL_STATE", state
,
653 state_gtt_offset
, p
[2] & ~0x3fu
, color
);
654 decode_struct(brw
, spec
, "COLOR_CALC_STATE", state
,
655 state_gtt_offset
, p
[3] & ~0x3fu
, color
);
658 case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS
:
659 decode_struct(brw
, spec
, "DEPTH_STENCIL_STATE", state
,
660 state_gtt_offset
, p
[1] & ~0x3fu
, color
);
662 case MEDIA_INTERFACE_DESCRIPTOR_LOAD
: {
663 struct gen_group
*group
=
664 gen_spec_find_struct(spec
, "RENDER_SURFACE_STATE");
668 uint32_t idd_offset
= p
[3] & ~0x1fu
;
669 decode_struct(brw
, spec
, "INTERFACE_DESCRIPTOR_DATA", state
,
670 state_gtt_offset
, idd_offset
, color
);
672 uint32_t ss_offset
= state
[idd_offset
/ 4 + 3] & ~0x1fu
;
673 decode_structs(brw
, spec
, "SAMPLER_STATE", state
,
674 state_gtt_offset
, ss_offset
, 4 * 4, color
);
676 uint32_t bt_offset
= state
[idd_offset
/ 4 + 4] & ~0x1fu
;
677 int bt_entries
= brw_state_batch_size(brw
, bt_offset
) / 4;
678 uint32_t *bt_pointers
= &state
[bt_offset
/ 4];
679 for (int i
= 0; i
< bt_entries
; i
++) {
680 fprintf(stderr
, "SURFACE_STATE - BTI = %d\n", i
);
681 gen_print_group(stderr
, group
, state_gtt_offset
+ bt_pointers
[i
],
682 &state
[bt_pointers
[i
] / 4], 0, color
);
689 brw_bo_unmap(batch
->batch
.bo
);
690 brw_bo_unmap(batch
->state
.bo
);
/* No-op fallback for do_batch_dump().  NOTE(review): this appears to be the
 * alternative definition used when the gen_decoder-based dumper is compiled
 * out; the surrounding preprocessor guard is not visible in this chunk —
 * confirm against the full file.
 */
static void do_batch_dump(struct brw_context *brw) { }
697 * Called when starting a new batch buffer.
700 brw_new_batch(struct brw_context
*brw
)
702 /* Unreference any BOs held by the previous batch, and reset counts. */
703 for (int i
= 0; i
< brw
->batch
.exec_count
; i
++) {
704 brw_bo_unreference(brw
->batch
.exec_bos
[i
]);
705 brw
->batch
.exec_bos
[i
] = NULL
;
707 brw
->batch
.batch_relocs
.reloc_count
= 0;
708 brw
->batch
.state_relocs
.reloc_count
= 0;
709 brw
->batch
.exec_count
= 0;
710 brw
->batch
.aperture_space
= 0;
712 brw_bo_unreference(brw
->batch
.state
.bo
);
714 /* Create a new batchbuffer and reset the associated state: */
715 intel_batchbuffer_reset_and_clear_render_cache(brw
);
717 /* If the kernel supports hardware contexts, then most hardware state is
718 * preserved between batches; we only need to re-emit state that is required
719 * to be in every batch. Otherwise we need to re-emit all the state that
720 * would otherwise be stored in the context (which for all intents and
721 * purposes means everything).
723 if (brw
->hw_ctx
== 0) {
724 brw
->ctx
.NewDriverState
|= BRW_NEW_CONTEXT
;
725 brw_upload_invariant_state(brw
);
728 brw
->ctx
.NewDriverState
|= BRW_NEW_BATCH
;
730 brw
->ib
.index_size
= -1;
732 /* We need to periodically reap the shader time results, because rollover
733 * happens every few seconds. We also want to see results every once in a
734 * while, because many programs won't cleanly destroy our context, so the
735 * end-of-run printout may not happen.
737 if (INTEL_DEBUG
& DEBUG_SHADER_TIME
)
738 brw_collect_and_report_shader_time(brw
);
742 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
745 * This function can emit state (say, to preserve registers that aren't saved
749 brw_finish_batch(struct brw_context
*brw
)
751 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
753 brw
->batch
.no_wrap
= true;
755 /* Capture the closing pipeline statistics register values necessary to
756 * support query objects (in the non-hardware context world).
758 brw_emit_query_end(brw
);
760 if (brw
->batch
.ring
== RENDER_RING
) {
761 /* Work around L3 state leaks into contexts set MI_RESTORE_INHIBIT which
762 * assume that the L3 cache is configured according to the hardware
763 * defaults. On Kernel 4.16+, we no longer need to do this.
765 if (devinfo
->gen
>= 7 &&
766 !(brw
->screen
->kernel_features
& KERNEL_ALLOWS_CONTEXT_ISOLATION
))
767 gen7_restore_default_l3_config(brw
);
769 if (devinfo
->is_haswell
) {
770 /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
771 * 3DSTATE_CC_STATE_POINTERS > "Note":
773 * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
774 * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
776 * From the example in the docs, it seems to expect a regular pipe control
777 * flush here as well. We may have done it already, but meh.
779 * See also WaAvoidRCZCounterRollover.
781 brw_emit_mi_flush(brw
);
783 OUT_BATCH(_3DSTATE_CC_STATE_POINTERS
<< 16 | (2 - 2));
784 OUT_BATCH(brw
->cc
.state_offset
| 1);
786 brw_emit_pipe_control_flush(brw
, PIPE_CONTROL_RENDER_TARGET_FLUSH
|
787 PIPE_CONTROL_CS_STALL
);
790 /* Do not restore push constant packets during context restore. */
791 if (devinfo
->gen
>= 7)
792 gen10_emit_isp_disable(brw
);
795 /* Emit MI_BATCH_BUFFER_END to finish our batch. Note that execbuf2
796 * requires our batch size to be QWord aligned, so we pad it out if
797 * necessary by emitting an extra MI_NOOP after the end.
799 intel_batchbuffer_require_space(brw
, 8, brw
->batch
.ring
);
800 *brw
->batch
.map_next
++ = MI_BATCH_BUFFER_END
;
801 if (USED_BATCH(brw
->batch
) & 1) {
802 *brw
->batch
.map_next
++ = MI_NOOP
;
805 brw
->batch
.no_wrap
= false;
809 throttle(struct brw_context
*brw
)
811 /* Wait for the swapbuffers before the one we just emitted, so we
812 * don't get too many swaps outstanding for apps that are GPU-heavy
815 * We're using intelDRI2Flush (called from the loader before
816 * swapbuffer) and glFlush (for front buffer rendering) as the
817 * indicator that a frame is done and then throttle when we get
818 * here as we prepare to render the next frame. At this point for
819 * round trips for swap/copy and getting new buffers are done and
820 * we'll spend less time waiting on the GPU.
822 * Unfortunately, we don't have a handle to the batch containing
823 * the swap, and getting our hands on that doesn't seem worth it,
824 * so we just use the first batch we emitted after the last swap.
826 if (brw
->need_swap_throttle
&& brw
->throttle_batch
[0]) {
827 if (brw
->throttle_batch
[1]) {
828 if (!brw
->disable_throttling
) {
829 brw_bo_wait_rendering(brw
->throttle_batch
[1]);
831 brw_bo_unreference(brw
->throttle_batch
[1]);
833 brw
->throttle_batch
[1] = brw
->throttle_batch
[0];
834 brw
->throttle_batch
[0] = NULL
;
835 brw
->need_swap_throttle
= false;
836 /* Throttling here is more precise than the throttle ioctl, so skip it */
837 brw
->need_flush_throttle
= false;
840 if (brw
->need_flush_throttle
) {
841 __DRIscreen
*dri_screen
= brw
->screen
->driScrnPriv
;
842 drmCommandNone(dri_screen
->fd
, DRM_I915_GEM_THROTTLE
);
843 brw
->need_flush_throttle
= false;
849 struct intel_batchbuffer
*batch
,
856 struct drm_i915_gem_execbuffer2 execbuf
= {
857 .buffers_ptr
= (uintptr_t) batch
->validation_list
,
858 .buffer_count
= batch
->exec_count
,
859 .batch_start_offset
= 0,
862 .rsvd1
= ctx_id
, /* rsvd1 is actually the context ID */
865 unsigned long cmd
= DRM_IOCTL_I915_GEM_EXECBUFFER2
;
867 if (in_fence
!= -1) {
868 execbuf
.rsvd2
= in_fence
;
869 execbuf
.flags
|= I915_EXEC_FENCE_IN
;
872 if (out_fence
!= NULL
) {
873 cmd
= DRM_IOCTL_I915_GEM_EXECBUFFER2_WR
;
875 execbuf
.flags
|= I915_EXEC_FENCE_OUT
;
878 int ret
= drmIoctl(fd
, cmd
, &execbuf
);
882 for (int i
= 0; i
< batch
->exec_count
; i
++) {
883 struct brw_bo
*bo
= batch
->exec_bos
[i
];
888 /* Update brw_bo::gtt_offset */
889 if (batch
->validation_list
[i
].offset
!= bo
->gtt_offset
) {
890 DBG("BO %d migrated: 0x%" PRIx64
" -> 0x%llx\n",
891 bo
->gem_handle
, bo
->gtt_offset
,
892 batch
->validation_list
[i
].offset
);
893 bo
->gtt_offset
= batch
->validation_list
[i
].offset
;
897 if (ret
== 0 && out_fence
!= NULL
)
898 *out_fence
= execbuf
.rsvd2
>> 32;
904 submit_batch(struct brw_context
*brw
, int in_fence_fd
, int *out_fence_fd
)
906 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
907 __DRIscreen
*dri_screen
= brw
->screen
->driScrnPriv
;
908 struct intel_batchbuffer
*batch
= &brw
->batch
;
911 if (batch
->use_shadow_copy
) {
912 void *bo_map
= brw_bo_map(brw
, batch
->batch
.bo
, MAP_WRITE
);
913 memcpy(bo_map
, batch
->batch
.map
, 4 * USED_BATCH(*batch
));
915 bo_map
= brw_bo_map(brw
, batch
->state
.bo
, MAP_WRITE
);
916 memcpy(bo_map
, batch
->state
.map
, batch
->state_used
);
919 brw_bo_unmap(batch
->batch
.bo
);
920 brw_bo_unmap(batch
->state
.bo
);
922 if (!brw
->screen
->no_hw
) {
923 /* The requirement for using I915_EXEC_NO_RELOC are:
925 * The addresses written in the objects must match the corresponding
926 * reloc.gtt_offset which in turn must match the corresponding
929 * Any render targets written to in the batch must be flagged with
932 * To avoid stalling, execobject.offset should match the current
933 * address of that object within the active context.
935 int flags
= I915_EXEC_NO_RELOC
;
937 if (devinfo
->gen
>= 6 && batch
->ring
== BLT_RING
) {
938 flags
|= I915_EXEC_BLT
;
940 flags
|= I915_EXEC_RENDER
;
942 if (batch
->needs_sol_reset
)
943 flags
|= I915_EXEC_GEN7_SOL_RESET
;
945 uint32_t hw_ctx
= batch
->ring
== RENDER_RING
? brw
->hw_ctx
: 0;
947 /* Set statebuffer relocations */
948 const unsigned state_index
= batch
->state
.bo
->index
;
949 if (state_index
< batch
->exec_count
&&
950 batch
->exec_bos
[state_index
] == batch
->state
.bo
) {
951 struct drm_i915_gem_exec_object2
*entry
=
952 &batch
->validation_list
[state_index
];
953 assert(entry
->handle
== batch
->state
.bo
->gem_handle
);
954 entry
->relocation_count
= batch
->state_relocs
.reloc_count
;
955 entry
->relocs_ptr
= (uintptr_t) batch
->state_relocs
.relocs
;
958 /* Set batchbuffer relocations */
959 struct drm_i915_gem_exec_object2
*entry
= &batch
->validation_list
[0];
960 assert(entry
->handle
== batch
->batch
.bo
->gem_handle
);
961 entry
->relocation_count
= batch
->batch_relocs
.reloc_count
;
962 entry
->relocs_ptr
= (uintptr_t) batch
->batch_relocs
.relocs
;
964 if (batch
->use_batch_first
) {
965 flags
|= I915_EXEC_BATCH_FIRST
| I915_EXEC_HANDLE_LUT
;
967 /* Move the batch to the end of the validation list */
968 struct drm_i915_gem_exec_object2 tmp
;
969 const unsigned index
= batch
->exec_count
- 1;
972 *entry
= batch
->validation_list
[index
];
973 batch
->validation_list
[index
] = tmp
;
976 ret
= execbuffer(dri_screen
->fd
, batch
, hw_ctx
,
977 4 * USED_BATCH(*batch
),
978 in_fence_fd
, out_fence_fd
, flags
);
983 if (unlikely(INTEL_DEBUG
& DEBUG_BATCH
))
986 if (brw
->ctx
.Const
.ResetStrategy
== GL_LOSE_CONTEXT_ON_RESET_ARB
)
987 brw_check_for_reset(brw
);
990 fprintf(stderr
, "i965: Failed to submit batchbuffer: %s\n",
999 * The in_fence_fd is ignored if -1. Otherwise this function takes ownership
1002 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
1003 * of the returned fd.
1006 _intel_batchbuffer_flush_fence(struct brw_context
*brw
,
1007 int in_fence_fd
, int *out_fence_fd
,
1008 const char *file
, int line
)
1012 if (USED_BATCH(brw
->batch
) == 0)
1015 /* Check that we didn't just wrap our batchbuffer at a bad time. */
1016 assert(!brw
->batch
.no_wrap
);
1018 brw_finish_batch(brw
);
1019 brw_upload_finish(&brw
->upload
);
1021 finish_growing_bos(&brw
->batch
.batch
);
1022 finish_growing_bos(&brw
->batch
.state
);
1024 if (brw
->throttle_batch
[0] == NULL
) {
1025 brw
->throttle_batch
[0] = brw
->batch
.batch
.bo
;
1026 brw_bo_reference(brw
->throttle_batch
[0]);
1029 if (unlikely(INTEL_DEBUG
& (DEBUG_BATCH
| DEBUG_SUBMIT
))) {
1030 int bytes_for_commands
= 4 * USED_BATCH(brw
->batch
);
1031 int bytes_for_state
= brw
->batch
.state_used
;
1032 fprintf(stderr
, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
1033 " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
1034 " %4d batch relocs, %4d state relocs\n", file
, line
,
1035 bytes_for_commands
, 100.0f
* bytes_for_commands
/ BATCH_SZ
,
1036 bytes_for_state
, 100.0f
* bytes_for_state
/ STATE_SZ
,
1037 brw
->batch
.exec_count
,
1038 (float) brw
->batch
.aperture_space
/ (1024 * 1024),
1039 brw
->batch
.batch_relocs
.reloc_count
,
1040 brw
->batch
.state_relocs
.reloc_count
);
1043 ret
= submit_batch(brw
, in_fence_fd
, out_fence_fd
);
1045 if (unlikely(INTEL_DEBUG
& DEBUG_SYNC
)) {
1046 fprintf(stderr
, "waiting for idle\n");
1047 brw_bo_wait_rendering(brw
->batch
.batch
.bo
);
1050 /* Start a new batch buffer. */
1057 brw_batch_has_aperture_space(struct brw_context
*brw
, unsigned extra_space
)
1059 return brw
->batch
.aperture_space
+ extra_space
<=
1060 brw
->screen
->aperture_threshold
;
1064 brw_batch_references(struct intel_batchbuffer
*batch
, struct brw_bo
*bo
)
1066 unsigned index
= READ_ONCE(bo
->index
);
1067 if (index
< batch
->exec_count
&& batch
->exec_bos
[index
] == bo
)
1070 for (int i
= 0; i
< batch
->exec_count
; i
++) {
1071 if (batch
->exec_bos
[i
] == bo
)
1077 /* This is the only way buffers get added to the validate list.
1080 emit_reloc(struct intel_batchbuffer
*batch
,
1081 struct brw_reloc_list
*rlist
, uint32_t offset
,
1082 struct brw_bo
*target
, int32_t target_offset
,
1083 unsigned int reloc_flags
)
1085 assert(target
!= NULL
);
1087 if (rlist
->reloc_count
== rlist
->reloc_array_size
) {
1088 rlist
->reloc_array_size
*= 2;
1089 rlist
->relocs
= realloc(rlist
->relocs
,
1090 rlist
->reloc_array_size
*
1091 sizeof(struct drm_i915_gem_relocation_entry
));
1094 unsigned int index
= add_exec_bo(batch
, target
);
1095 struct drm_i915_gem_exec_object2
*entry
= &batch
->validation_list
[index
];
1097 if (reloc_flags
& RELOC_32BIT
) {
1098 /* Restrict this buffer to the low 32 bits of the address space.
1100 * Altering the validation list flags restricts it for this batch,
1101 * but we also alter the BO's kflags to restrict it permanently
1102 * (until the BO is destroyed and put back in the cache). Buffers
1103 * may stay bound across batches, and we want keep it constrained.
1105 target
->kflags
&= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS
;
1106 entry
->flags
&= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS
;
1108 /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
1109 reloc_flags
&= ~RELOC_32BIT
;
1113 entry
->flags
|= reloc_flags
& batch
->valid_reloc_flags
;
1115 rlist
->relocs
[rlist
->reloc_count
++] =
1116 (struct drm_i915_gem_relocation_entry
) {
1118 .delta
= target_offset
,
1119 .target_handle
= batch
->use_batch_first
? index
: target
->gem_handle
,
1120 .presumed_offset
= entry
->offset
,
1123 /* Using the old buffer offset, write in what the right data would be, in
1124 * case the buffer doesn't move and we can short-circuit the relocation
1125 * processing in the kernel
1127 return entry
->offset
+ target_offset
;
1131 brw_batch_reloc(struct intel_batchbuffer
*batch
, uint32_t batch_offset
,
1132 struct brw_bo
*target
, uint32_t target_offset
,
1133 unsigned int reloc_flags
)
1135 assert(batch_offset
<= batch
->batch
.bo
->size
- sizeof(uint32_t));
1137 return emit_reloc(batch
, &batch
->batch_relocs
, batch_offset
,
1138 target
, target_offset
, reloc_flags
);
1142 brw_state_reloc(struct intel_batchbuffer
*batch
, uint32_t state_offset
,
1143 struct brw_bo
*target
, uint32_t target_offset
,
1144 unsigned int reloc_flags
)
1146 assert(state_offset
<= batch
->state
.bo
->size
- sizeof(uint32_t));
1148 return emit_reloc(batch
, &batch
->state_relocs
, state_offset
,
1149 target
, target_offset
, reloc_flags
);
1154 brw_state_batch_size(struct brw_context
*brw
, uint32_t offset
)
1156 struct hash_entry
*entry
=
1157 _mesa_hash_table_search(brw
->batch
.state_batch_sizes
,
1158 (void *) (uintptr_t) offset
);
1159 return entry
? (uintptr_t) entry
->data
: 0;
1163 * Reserve some space in the statebuffer, or flush.
1165 * This is used to estimate when we're near the end of the batch,
1166 * so we can flush early.
1169 brw_require_statebuffer_space(struct brw_context
*brw
, int size
)
1171 if (brw
->batch
.state_used
+ size
>= STATE_SZ
)
1172 intel_batchbuffer_flush(brw
);
1176 * Allocates a block of space in the batchbuffer for indirect state.
1179 brw_state_batch(struct brw_context
*brw
,
1182 uint32_t *out_offset
)
1184 struct intel_batchbuffer
*batch
= &brw
->batch
;
1186 assert(size
< batch
->state
.bo
->size
);
1188 uint32_t offset
= ALIGN(batch
->state_used
, alignment
);
1190 if (offset
+ size
>= STATE_SZ
&& !batch
->no_wrap
) {
1191 intel_batchbuffer_flush(brw
);
1192 offset
= ALIGN(batch
->state_used
, alignment
);
1193 } else if (offset
+ size
>= batch
->state
.bo
->size
) {
1194 const unsigned new_size
=
1195 MIN2(batch
->state
.bo
->size
+ batch
->state
.bo
->size
/ 2,
1197 grow_buffer(brw
, &batch
->state
, batch
->state_used
, new_size
);
1198 assert(offset
+ size
< batch
->state
.bo
->size
);
1201 if (unlikely(INTEL_DEBUG
& DEBUG_BATCH
)) {
1202 _mesa_hash_table_insert(batch
->state_batch_sizes
,
1203 (void *) (uintptr_t) offset
,
1204 (void *) (uintptr_t) size
);
1207 batch
->state_used
= offset
+ size
;
1209 *out_offset
= offset
;
1210 return batch
->state
.map
+ (offset
>> 2);
1214 intel_batchbuffer_data(struct brw_context
*brw
,
1215 const void *data
, GLuint bytes
, enum brw_gpu_ring ring
)
1217 assert((bytes
& 3) == 0);
1218 intel_batchbuffer_require_space(brw
, bytes
, ring
);
1219 memcpy(brw
->batch
.map_next
, data
, bytes
);
1220 brw
->batch
.map_next
+= bytes
>> 2;
1224 load_sized_register_mem(struct brw_context
*brw
,
1230 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
1233 /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
1234 assert(devinfo
->gen
>= 7);
1236 if (devinfo
->gen
>= 8) {
1237 BEGIN_BATCH(4 * size
);
1238 for (i
= 0; i
< size
; i
++) {
1239 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM
| (4 - 2));
1240 OUT_BATCH(reg
+ i
* 4);
1241 OUT_RELOC64(bo
, 0, offset
+ i
* 4);
1245 BEGIN_BATCH(3 * size
);
1246 for (i
= 0; i
< size
; i
++) {
1247 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM
| (3 - 2));
1248 OUT_BATCH(reg
+ i
* 4);
1249 OUT_RELOC(bo
, 0, offset
+ i
* 4);
/**
 * Load a 32-bit register from buffer memory.
 */
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      struct brw_bo *bo,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 1);
}
/**
 * Load a 64-bit register (two consecutive dwords) from buffer memory.
 */
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 2);
}
1274 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
1277 brw_store_register_mem32(struct brw_context
*brw
,
1278 struct brw_bo
*bo
, uint32_t reg
, uint32_t offset
)
1280 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
1282 assert(devinfo
->gen
>= 6);
1284 if (devinfo
->gen
>= 8) {
1286 OUT_BATCH(MI_STORE_REGISTER_MEM
| (4 - 2));
1288 OUT_RELOC64(bo
, RELOC_WRITE
, offset
);
1292 OUT_BATCH(MI_STORE_REGISTER_MEM
| (3 - 2));
1294 OUT_RELOC(bo
, RELOC_WRITE
| RELOC_NEEDS_GGTT
, offset
);
1300 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
1303 brw_store_register_mem64(struct brw_context
*brw
,
1304 struct brw_bo
*bo
, uint32_t reg
, uint32_t offset
)
1306 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
1308 assert(devinfo
->gen
>= 6);
1310 /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
1311 * read a full 64-bit register, we need to do two of them.
1313 if (devinfo
->gen
>= 8) {
1315 OUT_BATCH(MI_STORE_REGISTER_MEM
| (4 - 2));
1317 OUT_RELOC64(bo
, RELOC_WRITE
, offset
);
1318 OUT_BATCH(MI_STORE_REGISTER_MEM
| (4 - 2));
1319 OUT_BATCH(reg
+ sizeof(uint32_t));
1320 OUT_RELOC64(bo
, RELOC_WRITE
, offset
+ sizeof(uint32_t));
1324 OUT_BATCH(MI_STORE_REGISTER_MEM
| (3 - 2));
1326 OUT_RELOC(bo
, RELOC_WRITE
| RELOC_NEEDS_GGTT
, offset
);
1327 OUT_BATCH(MI_STORE_REGISTER_MEM
| (3 - 2));
1328 OUT_BATCH(reg
+ sizeof(uint32_t));
1329 OUT_RELOC(bo
, RELOC_WRITE
| RELOC_NEEDS_GGTT
, offset
+ sizeof(uint32_t));
1335 * Write a 32-bit register using immediate data.
1338 brw_load_register_imm32(struct brw_context
*brw
, uint32_t reg
, uint32_t imm
)
1340 assert(brw
->screen
->devinfo
.gen
>= 6);
1343 OUT_BATCH(MI_LOAD_REGISTER_IMM
| (3 - 2));
1350 * Write a 64-bit register using immediate data.
1353 brw_load_register_imm64(struct brw_context
*brw
, uint32_t reg
, uint64_t imm
)
1355 assert(brw
->screen
->devinfo
.gen
>= 6);
1358 OUT_BATCH(MI_LOAD_REGISTER_IMM
| (5 - 2));
1360 OUT_BATCH(imm
& 0xffffffff);
1362 OUT_BATCH(imm
>> 32);
1367 * Copies a 32-bit register.
1370 brw_load_register_reg(struct brw_context
*brw
, uint32_t src
, uint32_t dest
)
1372 assert(brw
->screen
->devinfo
.gen
>= 8 || brw
->screen
->devinfo
.is_haswell
);
1375 OUT_BATCH(MI_LOAD_REGISTER_REG
| (3 - 2));
1382 * Copies a 64-bit register.
1385 brw_load_register_reg64(struct brw_context
*brw
, uint32_t src
, uint32_t dest
)
1387 assert(brw
->screen
->devinfo
.gen
>= 8 || brw
->screen
->devinfo
.is_haswell
);
1390 OUT_BATCH(MI_LOAD_REGISTER_REG
| (3 - 2));
1393 OUT_BATCH(MI_LOAD_REGISTER_REG
| (3 - 2));
1394 OUT_BATCH(src
+ sizeof(uint32_t));
1395 OUT_BATCH(dest
+ sizeof(uint32_t));
1400 * Write 32-bits of immediate data to a GPU memory buffer.
1403 brw_store_data_imm32(struct brw_context
*brw
, struct brw_bo
*bo
,
1404 uint32_t offset
, uint32_t imm
)
1406 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
1408 assert(devinfo
->gen
>= 6);
1411 OUT_BATCH(MI_STORE_DATA_IMM
| (4 - 2));
1412 if (devinfo
->gen
>= 8)
1413 OUT_RELOC64(bo
, RELOC_WRITE
, offset
);
1415 OUT_BATCH(0); /* MBZ */
1416 OUT_RELOC(bo
, RELOC_WRITE
, offset
);
1423 * Write 64-bits of immediate data to a GPU memory buffer.
1426 brw_store_data_imm64(struct brw_context
*brw
, struct brw_bo
*bo
,
1427 uint32_t offset
, uint64_t imm
)
1429 const struct gen_device_info
*devinfo
= &brw
->screen
->devinfo
;
1431 assert(devinfo
->gen
>= 6);
1434 OUT_BATCH(MI_STORE_DATA_IMM
| (5 - 2));
1435 if (devinfo
->gen
>= 8)
1436 OUT_RELOC64(bo
, RELOC_WRITE
, offset
);
1438 OUT_BATCH(0); /* MBZ */
1439 OUT_RELOC(bo
, RELOC_WRITE
, offset
);
1441 OUT_BATCH(imm
& 0xffffffffu
);
1442 OUT_BATCH(imm
>> 32);