i965: Use %x instead of %u in debug print.
[mesa.git] / src / mesa / drivers / dri / i965 / intel_batchbuffer.c
1 /*
2 * Copyright 2006 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
15 * of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 #include "intel_batchbuffer.h"
27 #include "intel_buffer_objects.h"
28 #include "brw_bufmgr.h"
29 #include "intel_buffers.h"
30 #include "intel_fbo.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_state.h"
34 #include "common/gen_decoder.h"
35
36 #include "util/hash_table.h"
37
38 #include <xf86drm.h>
39 #include <i915_drm.h>
40
41 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
42
43 /**
44 * Target sizes of the batch and state buffers. We create the initial
45 * buffers at these sizes, and flush when they're nearly full. If we
46 * underestimate how close we are to the end, and suddenly need more space
47 * in the middle of a draw, we can grow the buffers, and finish the draw.
48 * At that point, we'll be over our target size, so the next operation
49 * should flush. Each time we flush the batch, we recreate both buffers
50  * at the original target size, so they don't grow without bound.
51 */
52 #define BATCH_SZ (20 * 1024)
53 #define STATE_SZ (16 * 1024)
54
55 static void
56 intel_batchbuffer_reset(struct brw_context *brw);
57
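/* Debug helper: print every BO on the execbuf validation list along with its
 * GEM handle, name, address-space/write flags, presumed GTT offset, and size.
 */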
58 UNUSED static void
59 dump_validation_list(struct intel_batchbuffer *batch)
60 {
61 fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
62
63 for (int i = 0; i < batch->exec_count; i++) {
64 uint64_t flags = batch->validation_list[i].flags;
65 assert(batch->validation_list[i].handle ==
66 batch->exec_bos[i]->gem_handle);
67 fprintf(stderr, "[%2d]: %2d %-14s %p %s%-7s @ 0x%016llx%s (%"PRIu64"B)\n",
68 i,
69 batch->validation_list[i].handle,
70 batch->exec_bos[i]->name,
71 batch->exec_bos[i],
72 (flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) ? "(48b" : "(32b",
73 (flags & EXEC_OBJECT_WRITE) ? " write)" : ")",
74 batch->validation_list[i].offset,
75 (flags & EXEC_OBJECT_PINNED) ? " (pinned)" : "",
76 batch->exec_bos[i]->size);
77 }
78 }
79
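/* Identity hash/compare callbacks for state_batch_sizes, whose keys are
 * statebuffer offsets stored directly as pointer-sized integers.
 */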
80 static bool
81 uint_key_compare(const void *a, const void *b)
82 {
83 return a == b;
84 }
85
86 static uint32_t
87 uint_key_hash(const void *key)
88 {
89 return (uintptr_t) key;
90 }
91
92 static void
93 init_reloc_list(struct brw_reloc_list *rlist, int count)
94 {
95 rlist->reloc_count = 0;
96 rlist->reloc_array_size = count;
97 rlist->relocs = malloc(rlist->reloc_array_size *
98 sizeof(struct drm_i915_gem_relocation_entry));
99 }
100
101 void
102 intel_batchbuffer_init(struct brw_context *brw)
103 {
104 struct intel_screen *screen = brw->screen;
105 struct intel_batchbuffer *batch = &brw->batch;
106 const struct gen_device_info *devinfo = &screen->devinfo;
107
108 batch->use_shadow_copy = !devinfo->has_llc;
109
110 if (batch->use_shadow_copy) {
111 batch->batch.map = malloc(BATCH_SZ);
112 batch->map_next = batch->batch.map;
113 batch->state.map = malloc(STATE_SZ);
114 }
115
116 init_reloc_list(&batch->batch_relocs, 250);
117 init_reloc_list(&batch->state_relocs, 250);
118
119 batch->exec_count = 0;
120 batch->exec_array_size = 100;
121 batch->exec_bos =
122 malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
123 batch->validation_list =
124 malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
125
126 if (INTEL_DEBUG & DEBUG_BATCH) {
127 batch->state_batch_sizes =
128 _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
129 }
130
131 batch->use_batch_first =
132 screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;
133
134    /* PIPE_CONTROL needs a workaround (its write target must be in the GGTT), but only on Gen6 */
135 batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
136 if (devinfo->gen == 6)
137 batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;
138
139 intel_batchbuffer_reset(brw);
140 }
141
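/* Read a value exactly once through a volatile cast so the compiler cannot
 * re-load it. bo->index is only a hint (the BO may be shared with other
 * batches), and callers verify it against exec_bos[] before trusting it.
 */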
142 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
143
144 static unsigned
145 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
146 {
147 unsigned index = READ_ONCE(bo->index);
148
149 if (index < batch->exec_count && batch->exec_bos[index] == bo)
150 return index;
151
152 /* May have been shared between multiple active batches */
153 for (index = 0; index < batch->exec_count; index++) {
154 if (batch->exec_bos[index] == bo)
155 return index;
156 }
157
158 brw_bo_reference(bo);
159
160 if (batch->exec_count == batch->exec_array_size) {
161 batch->exec_array_size *= 2;
162 batch->exec_bos =
163 realloc(batch->exec_bos,
164 batch->exec_array_size * sizeof(batch->exec_bos[0]));
165 batch->validation_list =
166 realloc(batch->validation_list,
167 batch->exec_array_size * sizeof(batch->validation_list[0]));
168 }
169
170 batch->validation_list[batch->exec_count] =
171 (struct drm_i915_gem_exec_object2) {
172 .handle = bo->gem_handle,
173 .offset = bo->gtt_offset,
174 .flags = bo->kflags,
175 };
176
177 bo->index = batch->exec_count;
178 batch->exec_bos[batch->exec_count] = bo;
179 batch->aperture_space += bo->size;
180
181 return batch->exec_count++;
182 }
183
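/* Allocate a fresh batch or state buffer at its original target size and
 * reset the partial-BO bookkeeping left over from any earlier grow. The BO
 * is only CPU-mapped directly when we aren't using a malloc'd shadow copy.
 */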
184 static void
185 recreate_growing_buffer(struct brw_context *brw,
186 struct brw_growing_bo *grow,
187 const char *name, unsigned size)
188 {
189 struct intel_screen *screen = brw->screen;
190 struct intel_batchbuffer *batch = &brw->batch;
191 struct brw_bufmgr *bufmgr = screen->bufmgr;
192
193 grow->bo = brw_bo_alloc(bufmgr, name, size);
194 grow->bo->kflags = can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
195 grow->partial_bo = NULL;
196 grow->partial_bo_map = NULL;
197 grow->partial_bytes = 0;
198
199 if (!batch->use_shadow_copy)
200 grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
201 }
202
203 static void
204 intel_batchbuffer_reset(struct brw_context *brw)
205 {
206 struct intel_batchbuffer *batch = &brw->batch;
207
208 if (batch->last_bo != NULL) {
209 brw_bo_unreference(batch->last_bo);
210 batch->last_bo = NULL;
211 }
212 batch->last_bo = batch->batch.bo;
213
214 recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ);
215 batch->map_next = batch->batch.map;
216
217 recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ);
218
219 /* Avoid making 0 a valid state offset - otherwise the decoder will try
220     * to decode data when we use offset 0 as a null pointer.
221 */
222 batch->state_used = 1;
223
224 add_exec_bo(batch, batch->batch.bo);
225 assert(batch->batch.bo->index == 0);
226
227 batch->needs_sol_reset = false;
228 batch->state_base_address_emitted = false;
229
230 /* We don't know what ring the new batch will be sent to until we see the
231 * first BEGIN_BATCH or BEGIN_BATCH_BLT. Mark it as unknown.
232 */
233 batch->ring = UNKNOWN_RING;
234
235 if (batch->state_batch_sizes)
236 _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
237 }
238
239 static void
240 intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
241 {
242 intel_batchbuffer_reset(brw);
243 brw_cache_sets_clear(brw);
244 }
245
246 void
247 intel_batchbuffer_save_state(struct brw_context *brw)
248 {
249 brw->batch.saved.map_next = brw->batch.map_next;
250 brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
251 brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
252 brw->batch.saved.exec_count = brw->batch.exec_count;
253 }
254
255 void
256 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
257 {
258 for (int i = brw->batch.saved.exec_count;
259 i < brw->batch.exec_count; i++) {
260 brw_bo_unreference(brw->batch.exec_bos[i]);
261 }
262 brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
263 brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
264 brw->batch.exec_count = brw->batch.saved.exec_count;
265
266 brw->batch.map_next = brw->batch.saved.map_next;
267 if (USED_BATCH(brw->batch) == 0)
268 brw->batch.ring = UNKNOWN_RING;
269 }
270
271 void
272 intel_batchbuffer_free(struct intel_batchbuffer *batch)
273 {
274 if (batch->use_shadow_copy) {
275 free(batch->batch.map);
276 free(batch->state.map);
277 }
278
279 for (int i = 0; i < batch->exec_count; i++) {
280 brw_bo_unreference(batch->exec_bos[i]);
281 }
282 free(batch->batch_relocs.relocs);
283 free(batch->state_relocs.relocs);
284 free(batch->exec_bos);
285 free(batch->validation_list);
286
287 brw_bo_unreference(batch->last_bo);
288 brw_bo_unreference(batch->batch.bo);
289 brw_bo_unreference(batch->state.bo);
290 if (batch->state_batch_sizes)
291 _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
292 }
293
294 /**
295 * Finish copying the old batch/state buffer's contents to the new one
296 * after we tried to "grow" the buffer in an earlier operation.
297 */
298 static void
299 finish_growing_bos(struct brw_growing_bo *grow)
300 {
301 struct brw_bo *old_bo = grow->partial_bo;
302 if (!old_bo)
303 return;
304
305 memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
306
307 grow->partial_bo = NULL;
308 grow->partial_bo_map = NULL;
309 grow->partial_bytes = 0;
310
311 brw_bo_unreference(old_bo);
312 }
313
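/* Retarget relocation entries from an old GEM handle to a new one. This is
 * only needed when target_handle holds a real handle; with
 * I915_EXEC_HANDLE_LUT it is an index into the validation list, which is
 * unchanged by growing a buffer.
 */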
314 static void
315 replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
316 uint32_t old_handle, uint32_t new_handle)
317 {
318 for (int i = 0; i < rlist->reloc_count; i++) {
319 if (rlist->relocs[i].target_handle == old_handle)
320 rlist->relocs[i].target_handle = new_handle;
321 }
322 }
323
324 /**
325 * Grow either the batch or state buffer to a new larger size.
326 *
327 * We can't actually grow buffers, so we allocate a new one, copy over
328 * the existing contents, and update our lists to refer to the new one.
329 *
330 * Note that this is only temporary - each new batch recreates the buffers
331 * at their original target size (BATCH_SZ or STATE_SZ).
332 */
333 static void
334 grow_buffer(struct brw_context *brw,
335 struct brw_growing_bo *grow,
336 unsigned existing_bytes,
337 unsigned new_size)
338 {
339 struct intel_batchbuffer *batch = &brw->batch;
340 struct brw_bufmgr *bufmgr = brw->bufmgr;
341 struct brw_bo *bo = grow->bo;
342
343 perf_debug("Growing %s - ran out of space\n", bo->name);
344
345 if (grow->partial_bo) {
346 /* We've already grown once, and now we need to do it again.
347 * Finish our last grow operation so we can start a new one.
348 * This should basically never happen.
349 */
350 perf_debug("Had to grow multiple times");
351 finish_growing_bos(grow);
352 }
353
354 struct brw_bo *new_bo = brw_bo_alloc(bufmgr, bo->name, new_size);
355
356 /* Copy existing data to the new larger buffer */
357 grow->partial_bo_map = grow->map;
358
359 if (batch->use_shadow_copy) {
360 /* We can't safely use realloc, as it may move the existing buffer,
361 * breaking existing pointers the caller may still be using. Just
362 * malloc a new copy and memcpy it like the normal BO path.
363 */
364 grow->map = malloc(new_size);
365 } else {
366 grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
367 }
368
369 /* Try to put the new BO at the same GTT offset as the old BO (which
370 * we're throwing away, so it doesn't need to be there).
371 *
372 * This guarantees that our relocations continue to work: values we've
373 * already written into the buffer, values we're going to write into the
374 * buffer, and the validation/relocation lists all will match.
375 *
376 * Also preserve kflags for EXEC_OBJECT_CAPTURE.
377 */
378 new_bo->gtt_offset = bo->gtt_offset;
379 new_bo->index = bo->index;
380 new_bo->kflags = bo->kflags;
381
382 /* Batch/state buffers are per-context, and if we've run out of space,
383 * we must have actually used them before, so...they will be in the list.
384 */
385 assert(bo->index < batch->exec_count);
386 assert(batch->exec_bos[bo->index] == bo);
387
388 /* Update the validation list to use the new BO. */
389 batch->validation_list[bo->index].handle = new_bo->gem_handle;
390
391 if (!batch->use_batch_first) {
392 /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
393 * update the relocation list entries to point at the new BO as well.
394 * (With newer kernels, the "handle" is an offset into the validation
395 * list, which remains unchanged, so we can skip this.)
396 */
397 replace_bo_in_reloc_list(&batch->batch_relocs,
398 bo->gem_handle, new_bo->gem_handle);
399 replace_bo_in_reloc_list(&batch->state_relocs,
400 bo->gem_handle, new_bo->gem_handle);
401 }
402
403 /* Exchange the two BOs...without breaking pointers to the old BO.
404 *
405 * Consider this scenario:
406 *
407 * 1. Somebody calls brw_state_batch() to get a region of memory, and
408    *    then creates a brw_address pointing to brw->batch.state.bo.
409 * 2. They then call brw_state_batch() a second time, which happens to
410 * grow and replace the state buffer. They then try to emit a
411 * relocation to their first section of memory.
412 *
413 * If we replace the brw->batch.state.bo pointer at step 2, we would
414 * break the address created in step 1. They'd have a pointer to the
415 * old destroyed BO. Emitting a relocation would add this dead BO to
416 * the validation list...causing /both/ statebuffers to be in the list,
417 * and all kinds of disasters.
418 *
419 * This is not a contrived case - BLORP vertex data upload hits this.
420 *
421 * There are worse scenarios too. Fences for GL sync objects reference
422 * brw->batch.batch.bo. If we replaced the batch pointer when growing,
423 * we'd need to chase down every fence and update it to point to the
424 * new BO. Otherwise, it would refer to a "batch" that never actually
425 * gets submitted, and would fail to trigger.
426 *
427    * To work around both of these issues, we transmute the buffers in
428 * place, making the existing struct brw_bo represent the new buffer,
429 * and "new_bo" represent the old BO. This is highly unusual, but it
430 * seems like a necessary evil.
431 *
432 * We also defer the memcpy of the existing batch's contents. Callers
433 * may make multiple brw_state_batch calls, and retain pointers to the
434    * old BO's map. We'll perform the memcpy in finish_growing_bos() when
435 * we finally submit the batch, at which point we've finished uploading
436 * state, and nobody should have any old references anymore.
437 *
438 * To do that, we keep a reference to the old BO in grow->partial_bo,
439 * and store the number of bytes to copy in grow->partial_bytes. We
440 * can monkey with the refcounts directly without atomics because these
441 * are per-context BOs and they can only be touched by this thread.
442 */
443 assert(new_bo->refcount == 1);
444 new_bo->refcount = bo->refcount;
445 bo->refcount = 1;
446
447 struct brw_bo tmp;
448 memcpy(&tmp, bo, sizeof(struct brw_bo));
449 memcpy(bo, new_bo, sizeof(struct brw_bo));
450 memcpy(new_bo, &tmp, sizeof(struct brw_bo));
451
452 grow->partial_bo = new_bo; /* the one reference of the OLD bo */
453 grow->partial_bytes = existing_bytes;
454 }
455
456 void
457 intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
458 enum brw_gpu_ring ring)
459 {
460 const struct gen_device_info *devinfo = &brw->screen->devinfo;
461 struct intel_batchbuffer *batch = &brw->batch;
462
463 /* If we're switching rings, implicitly flush the batch. */
464 if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
465 devinfo->gen >= 6) {
466 intel_batchbuffer_flush(brw);
467 }
468
469 const unsigned batch_used = USED_BATCH(*batch) * 4;
470 if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
471 intel_batchbuffer_flush(brw);
472 } else if (batch_used + sz >= batch->batch.bo->size) {
473 const unsigned new_size =
474 MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
475 MAX_BATCH_SIZE);
476 grow_buffer(brw, &batch->batch, batch_used, new_size);
477 batch->map_next = (void *) batch->batch.map + batch_used;
478 assert(batch_used + sz < batch->batch.bo->size);
479 }
480
481 /* The intel_batchbuffer_flush() calls above might have changed
482 * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
483 */
484 brw->batch.ring = ring;
485 }
486
487 #ifdef DEBUG
488 #define CSI "\e["
489 #define BLUE_HEADER CSI "0;44m"
490 #define NORMAL CSI "0m"
491
492
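/* Look up a named state structure in the genxml spec and decode a single
 * instance of it from the statebuffer map at the given offset (silently does
 * nothing if the spec doesn't describe that structure).
 */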
493 static void
494 decode_struct(struct brw_context *brw, struct gen_spec *spec,
495 const char *struct_name, uint32_t *data,
496 uint32_t gtt_offset, uint32_t offset, bool color)
497 {
498 struct gen_group *group = gen_spec_find_struct(spec, struct_name);
499 if (!group)
500 return;
501
502 fprintf(stderr, "%s\n", struct_name);
503 gen_print_group(stderr, group, gtt_offset + offset,
504 &data[offset / 4], 0, color);
505 }
506
507 static void
508 decode_structs(struct brw_context *brw, struct gen_spec *spec,
509 const char *struct_name,
510 uint32_t *data, uint32_t gtt_offset, uint32_t offset,
511 int struct_size, bool color)
512 {
513 struct gen_group *group = gen_spec_find_struct(spec, struct_name);
514 if (!group)
515 return;
516
517 int entries = brw_state_batch_size(brw, offset) / struct_size;
518 for (int i = 0; i < entries; i++) {
519 fprintf(stderr, "%s %d\n", struct_name, i);
520 gen_print_group(stderr, group, gtt_offset + offset,
521 &data[(offset + i * struct_size) / 4], 0, color);
522 }
523 }
524
525 static void
526 do_batch_dump(struct brw_context *brw)
527 {
528 const struct gen_device_info *devinfo = &brw->screen->devinfo;
529 struct intel_batchbuffer *batch = &brw->batch;
530 struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
531
532 if (batch->ring != RENDER_RING)
533 return;
534
535 uint32_t *batch_data = brw_bo_map(brw, batch->batch.bo, MAP_READ);
536 uint32_t *state = brw_bo_map(brw, batch->state.bo, MAP_READ);
537 if (batch_data == NULL || state == NULL) {
538 fprintf(stderr, "WARNING: failed to map batchbuffer/statebuffer\n");
539 return;
540 }
541
542 uint32_t *end = batch_data + USED_BATCH(*batch);
543 uint32_t batch_gtt_offset = batch->batch.bo->gtt_offset;
544 uint32_t state_gtt_offset = batch->state.bo->gtt_offset;
545 int length;
546
547 bool color = INTEL_DEBUG & DEBUG_COLOR;
548 const char *header_color = color ? BLUE_HEADER : "";
549 const char *reset_color = color ? NORMAL : "";
550
551 for (uint32_t *p = batch_data; p < end; p += length) {
552 struct gen_group *inst = gen_spec_find_instruction(spec, p);
553 length = gen_group_get_length(inst, p);
554 assert(inst == NULL || length > 0);
555 length = MAX2(1, length);
556 if (inst == NULL) {
557 fprintf(stderr, "unknown instruction %08x\n", p[0]);
558 continue;
559 }
560
561 uint64_t offset = batch_gtt_offset + 4 * (p - batch_data);
562
563 fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
564 offset, p[0], gen_group_get_name(inst), reset_color);
565
566 gen_print_group(stderr, inst, offset, p, 0, color);
567
568 switch (gen_group_get_opcode(inst) >> 16) {
569 case _3DSTATE_PIPELINED_POINTERS:
570 /* Note: these Gen4-5 pointers are full relocations rather than
571 * offsets from the start of the statebuffer. So we need to subtract
572 * gtt_offset (the start of the statebuffer) to obtain an offset we
573 * can add to the map and get at the data.
574 */
575 decode_struct(brw, spec, "VS_STATE", state, state_gtt_offset,
576 (p[1] & ~0x1fu) - state_gtt_offset, color);
577 if (p[2] & 1) {
578 decode_struct(brw, spec, "GS_STATE", state, state_gtt_offset,
579 (p[2] & ~0x1fu) - state_gtt_offset, color);
580 }
581 if (p[3] & 1) {
582 decode_struct(brw, spec, "CLIP_STATE", state, state_gtt_offset,
583 (p[3] & ~0x1fu) - state_gtt_offset, color);
584 }
585 decode_struct(brw, spec, "SF_STATE", state, state_gtt_offset,
586 (p[4] & ~0x1fu) - state_gtt_offset, color);
587 decode_struct(brw, spec, "WM_STATE", state, state_gtt_offset,
588 (p[5] & ~0x1fu) - state_gtt_offset, color);
589 decode_struct(brw, spec, "COLOR_CALC_STATE", state, state_gtt_offset,
590 (p[6] & ~0x3fu) - state_gtt_offset, color);
591 break;
592 case _3DSTATE_BINDING_TABLE_POINTERS_VS:
593 case _3DSTATE_BINDING_TABLE_POINTERS_HS:
594 case _3DSTATE_BINDING_TABLE_POINTERS_DS:
595 case _3DSTATE_BINDING_TABLE_POINTERS_GS:
596 case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
597 struct gen_group *group =
598 gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
599 if (!group)
600 break;
601
602 uint32_t bt_offset = p[1] & ~0x1fu;
603 int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
604 uint32_t *bt_pointers = &state[bt_offset / 4];
605 for (int i = 0; i < bt_entries; i++) {
606 fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
607 gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
608 &state[bt_pointers[i] / 4], 0, color);
609 }
610 break;
611 }
612 case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
613 case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
614 case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
615 case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
616 case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
617 decode_structs(brw, spec, "SAMPLER_STATE", state,
618 state_gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
619 break;
620 case _3DSTATE_VIEWPORT_STATE_POINTERS:
621 decode_structs(brw, spec, "CLIP_VIEWPORT", state,
622 state_gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
623 decode_structs(brw, spec, "SF_VIEWPORT", state,
624 state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
625 decode_structs(brw, spec, "CC_VIEWPORT", state,
626 state_gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
627 break;
628 case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
629 decode_structs(brw, spec, "CC_VIEWPORT", state,
630 state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
631 break;
632 case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
633 decode_structs(brw, spec, "SF_CLIP_VIEWPORT", state,
634 state_gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
635 break;
636 case _3DSTATE_SCISSOR_STATE_POINTERS:
637 decode_structs(brw, spec, "SCISSOR_RECT", state,
638 state_gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
639 break;
640 case _3DSTATE_BLEND_STATE_POINTERS:
641 /* TODO: handle Gen8+ extra dword at the beginning */
642 decode_structs(brw, spec, "BLEND_STATE", state,
643 state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
644 break;
645 case _3DSTATE_CC_STATE_POINTERS:
646 if (devinfo->gen >= 7) {
647 decode_struct(brw, spec, "COLOR_CALC_STATE", state,
648 state_gtt_offset, p[1] & ~0x3fu, color);
649 } else if (devinfo->gen == 6) {
650 decode_structs(brw, spec, "BLEND_STATE", state,
651 state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
652 decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
653 state_gtt_offset, p[2] & ~0x3fu, color);
654 decode_struct(brw, spec, "COLOR_CALC_STATE", state,
655 state_gtt_offset, p[3] & ~0x3fu, color);
656 }
657 break;
658 case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
659 decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
660 state_gtt_offset, p[1] & ~0x3fu, color);
661 break;
662 case MEDIA_INTERFACE_DESCRIPTOR_LOAD: {
663 struct gen_group *group =
664 gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
665 if (!group)
666 break;
667
668 uint32_t idd_offset = p[3] & ~0x1fu;
669 decode_struct(brw, spec, "INTERFACE_DESCRIPTOR_DATA", state,
670 state_gtt_offset, idd_offset, color);
671
672 uint32_t ss_offset = state[idd_offset / 4 + 3] & ~0x1fu;
673 decode_structs(brw, spec, "SAMPLER_STATE", state,
674 state_gtt_offset, ss_offset, 4 * 4, color);
675
676 uint32_t bt_offset = state[idd_offset / 4 + 4] & ~0x1fu;
677 int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
678 uint32_t *bt_pointers = &state[bt_offset / 4];
679 for (int i = 0; i < bt_entries; i++) {
680 fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
681 gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
682 &state[bt_pointers[i] / 4], 0, color);
683 }
684 break;
685 }
686 }
687 }
688
689 brw_bo_unmap(batch->batch.bo);
690 brw_bo_unmap(batch->state.bo);
691 }
692 #else
693 static void do_batch_dump(struct brw_context *brw) { }
694 #endif
695
696 /**
697 * Called when starting a new batch buffer.
698 */
699 static void
700 brw_new_batch(struct brw_context *brw)
701 {
702 /* Unreference any BOs held by the previous batch, and reset counts. */
703 for (int i = 0; i < brw->batch.exec_count; i++) {
704 brw_bo_unreference(brw->batch.exec_bos[i]);
705 brw->batch.exec_bos[i] = NULL;
706 }
707 brw->batch.batch_relocs.reloc_count = 0;
708 brw->batch.state_relocs.reloc_count = 0;
709 brw->batch.exec_count = 0;
710 brw->batch.aperture_space = 0;
711
712 brw_bo_unreference(brw->batch.state.bo);
713
714 /* Create a new batchbuffer and reset the associated state: */
715 intel_batchbuffer_reset_and_clear_render_cache(brw);
716
717 /* If the kernel supports hardware contexts, then most hardware state is
718 * preserved between batches; we only need to re-emit state that is required
719 * to be in every batch. Otherwise we need to re-emit all the state that
720 * would otherwise be stored in the context (which for all intents and
721 * purposes means everything).
722 */
723 if (brw->hw_ctx == 0) {
724 brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
725 brw_upload_invariant_state(brw);
726 }
727
728 brw->ctx.NewDriverState |= BRW_NEW_BATCH;
729
730 brw->ib.index_size = -1;
731
732 /* We need to periodically reap the shader time results, because rollover
733 * happens every few seconds. We also want to see results every once in a
734 * while, because many programs won't cleanly destroy our context, so the
735 * end-of-run printout may not happen.
736 */
737 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
738 brw_collect_and_report_shader_time(brw);
739 }
740
741 /**
742 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
743 * sending it off.
744 *
745 * This function can emit state (say, to preserve registers that aren't saved
746 * between batches).
747 */
748 static void
749 brw_finish_batch(struct brw_context *brw)
750 {
751 const struct gen_device_info *devinfo = &brw->screen->devinfo;
752
753 brw->batch.no_wrap = true;
754
755 /* Capture the closing pipeline statistics register values necessary to
756 * support query objects (in the non-hardware context world).
757 */
758 brw_emit_query_end(brw);
759
760 if (brw->batch.ring == RENDER_RING) {
761       /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT and
762        * assume that the L3 cache is configured according to the hardware
763        * defaults. On kernel 4.16+, we no longer need to do this.
764 */
765 if (devinfo->gen >= 7 &&
766 !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
767 gen7_restore_default_l3_config(brw);
768
769 if (devinfo->is_haswell) {
770 /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
771 * 3DSTATE_CC_STATE_POINTERS > "Note":
772 *
773 * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
774 * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
775 *
776 * From the example in the docs, it seems to expect a regular pipe control
777 * flush here as well. We may have done it already, but meh.
778 *
779 * See also WaAvoidRCZCounterRollover.
780 */
781 brw_emit_mi_flush(brw);
782 BEGIN_BATCH(2);
783 OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
784 OUT_BATCH(brw->cc.state_offset | 1);
785 ADVANCE_BATCH();
786 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
787 PIPE_CONTROL_CS_STALL);
788 }
789
790 /* Do not restore push constant packets during context restore. */
791 if (devinfo->gen >= 7)
792 gen10_emit_isp_disable(brw);
793 }
794
795 /* Emit MI_BATCH_BUFFER_END to finish our batch. Note that execbuf2
796 * requires our batch size to be QWord aligned, so we pad it out if
797 * necessary by emitting an extra MI_NOOP after the end.
798 */
799 intel_batchbuffer_require_space(brw, 8, brw->batch.ring);
800 *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
801 if (USED_BATCH(brw->batch) & 1) {
802 *brw->batch.map_next++ = MI_NOOP;
803 }
804
805 brw->batch.no_wrap = false;
806 }
807
808 static void
809 throttle(struct brw_context *brw)
810 {
811 /* Wait for the swapbuffers before the one we just emitted, so we
812 * don't get too many swaps outstanding for apps that are GPU-heavy
813 * but not CPU-heavy.
814 *
815 * We're using intelDRI2Flush (called from the loader before
816 * swapbuffer) and glFlush (for front buffer rendering) as the
817 * indicator that a frame is done and then throttle when we get
818    * here as we prepare to render the next frame. At this point the
819    * round trips for swap/copy and getting new buffers are done, so
820    * we'll spend less time waiting on the GPU.
821 *
822 * Unfortunately, we don't have a handle to the batch containing
823 * the swap, and getting our hands on that doesn't seem worth it,
824 * so we just use the first batch we emitted after the last swap.
825 */
826 if (brw->need_swap_throttle && brw->throttle_batch[0]) {
827 if (brw->throttle_batch[1]) {
828 if (!brw->disable_throttling) {
829 brw_bo_wait_rendering(brw->throttle_batch[1]);
830 }
831 brw_bo_unreference(brw->throttle_batch[1]);
832 }
833 brw->throttle_batch[1] = brw->throttle_batch[0];
834 brw->throttle_batch[0] = NULL;
835 brw->need_swap_throttle = false;
836 /* Throttling here is more precise than the throttle ioctl, so skip it */
837 brw->need_flush_throttle = false;
838 }
839
840 if (brw->need_flush_throttle) {
841 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
842 drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
843 brw->need_flush_throttle = false;
844 }
845 }
846
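/* Thin wrapper around DRM_IOCTL_I915_GEM_EXECBUFFER2 (or the _WR variant when
 * an output fence is requested): submits the validation list, threads the
 * optional in/out fence FDs through, and writes back any GTT offsets the
 * kernel chose. Returns 0 on success or -errno on failure.
 */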
847 static int
848 execbuffer(int fd,
849 struct intel_batchbuffer *batch,
850 uint32_t ctx_id,
851 int used,
852 int in_fence,
853 int *out_fence,
854 int flags)
855 {
856 struct drm_i915_gem_execbuffer2 execbuf = {
857 .buffers_ptr = (uintptr_t) batch->validation_list,
858 .buffer_count = batch->exec_count,
859 .batch_start_offset = 0,
860 .batch_len = used,
861 .flags = flags,
862 .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
863 };
864
865 unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;
866
867 if (in_fence != -1) {
868 execbuf.rsvd2 = in_fence;
869 execbuf.flags |= I915_EXEC_FENCE_IN;
870 }
871
872 if (out_fence != NULL) {
873 cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
874 *out_fence = -1;
875 execbuf.flags |= I915_EXEC_FENCE_OUT;
876 }
877
878 int ret = drmIoctl(fd, cmd, &execbuf);
879 if (ret != 0)
880 ret = -errno;
881
882 for (int i = 0; i < batch->exec_count; i++) {
883 struct brw_bo *bo = batch->exec_bos[i];
884
885 bo->idle = false;
886 bo->index = -1;
887
888 /* Update brw_bo::gtt_offset */
889 if (batch->validation_list[i].offset != bo->gtt_offset) {
890 DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
891 bo->gem_handle, bo->gtt_offset,
892 batch->validation_list[i].offset);
893 bo->gtt_offset = batch->validation_list[i].offset;
894 }
895 }
896
897 if (ret == 0 && out_fence != NULL)
898 *out_fence = execbuf.rsvd2 >> 32;
899
900 return ret;
901 }
902
903 static int
904 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
905 {
906 const struct gen_device_info *devinfo = &brw->screen->devinfo;
907 __DRIscreen *dri_screen = brw->screen->driScrnPriv;
908 struct intel_batchbuffer *batch = &brw->batch;
909 int ret = 0;
910
911 if (batch->use_shadow_copy) {
912 void *bo_map = brw_bo_map(brw, batch->batch.bo, MAP_WRITE);
913 memcpy(bo_map, batch->batch.map, 4 * USED_BATCH(*batch));
914
915 bo_map = brw_bo_map(brw, batch->state.bo, MAP_WRITE);
916 memcpy(bo_map, batch->state.map, batch->state_used);
917 }
918
919 brw_bo_unmap(batch->batch.bo);
920 brw_bo_unmap(batch->state.bo);
921
922 if (!brw->screen->no_hw) {
923       /* The requirements for using I915_EXEC_NO_RELOC are:
924 *
925 * The addresses written in the objects must match the corresponding
926 * reloc.gtt_offset which in turn must match the corresponding
927 * execobject.offset.
928 *
929 * Any render targets written to in the batch must be flagged with
930 * EXEC_OBJECT_WRITE.
931 *
932 * To avoid stalling, execobject.offset should match the current
933 * address of that object within the active context.
934 */
935 int flags = I915_EXEC_NO_RELOC;
936
937 if (devinfo->gen >= 6 && batch->ring == BLT_RING) {
938 flags |= I915_EXEC_BLT;
939 } else {
940 flags |= I915_EXEC_RENDER;
941 }
942 if (batch->needs_sol_reset)
943 flags |= I915_EXEC_GEN7_SOL_RESET;
944
945 uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
946
947 /* Set statebuffer relocations */
948 const unsigned state_index = batch->state.bo->index;
949 if (state_index < batch->exec_count &&
950 batch->exec_bos[state_index] == batch->state.bo) {
951 struct drm_i915_gem_exec_object2 *entry =
952 &batch->validation_list[state_index];
953 assert(entry->handle == batch->state.bo->gem_handle);
954 entry->relocation_count = batch->state_relocs.reloc_count;
955 entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
956 }
957
958 /* Set batchbuffer relocations */
959 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
960 assert(entry->handle == batch->batch.bo->gem_handle);
961 entry->relocation_count = batch->batch_relocs.reloc_count;
962 entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;
963
964 if (batch->use_batch_first) {
965 flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
966 } else {
967 /* Move the batch to the end of the validation list */
968 struct drm_i915_gem_exec_object2 tmp;
969 const unsigned index = batch->exec_count - 1;
970
971 tmp = *entry;
972 *entry = batch->validation_list[index];
973 batch->validation_list[index] = tmp;
974 }
975
976 ret = execbuffer(dri_screen->fd, batch, hw_ctx,
977 4 * USED_BATCH(*batch),
978 in_fence_fd, out_fence_fd, flags);
979
980 throttle(brw);
981 }
982
983 if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
984 do_batch_dump(brw);
985
986 if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
987 brw_check_for_reset(brw);
988
989 if (ret != 0) {
990 fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
991 strerror(-ret));
992 exit(1);
993 }
994
995 return ret;
996 }
997
998 /**
999 * The in_fence_fd is ignored if -1. Otherwise this function takes ownership
1000 * of the fd.
1001 *
1002 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
1003 * of the returned fd.
1004 */
1005 int
1006 _intel_batchbuffer_flush_fence(struct brw_context *brw,
1007 int in_fence_fd, int *out_fence_fd,
1008 const char *file, int line)
1009 {
1010 int ret;
1011
1012 if (USED_BATCH(brw->batch) == 0)
1013 return 0;
1014
1015 /* Check that we didn't just wrap our batchbuffer at a bad time. */
1016 assert(!brw->batch.no_wrap);
1017
1018 brw_finish_batch(brw);
1019 brw_upload_finish(&brw->upload);
1020
1021 finish_growing_bos(&brw->batch.batch);
1022 finish_growing_bos(&brw->batch.state);
1023
1024 if (brw->throttle_batch[0] == NULL) {
1025 brw->throttle_batch[0] = brw->batch.batch.bo;
1026 brw_bo_reference(brw->throttle_batch[0]);
1027 }
1028
1029 if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT))) {
1030 int bytes_for_commands = 4 * USED_BATCH(brw->batch);
1031 int bytes_for_state = brw->batch.state_used;
1032 fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
1033 " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
1034 " %4d batch relocs, %4d state relocs\n", file, line,
1035 bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
1036 bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
1037 brw->batch.exec_count,
1038 (float) brw->batch.aperture_space / (1024 * 1024),
1039 brw->batch.batch_relocs.reloc_count,
1040 brw->batch.state_relocs.reloc_count);
1041 }
1042
1043 ret = submit_batch(brw, in_fence_fd, out_fence_fd);
1044
1045 if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
1046 fprintf(stderr, "waiting for idle\n");
1047 brw_bo_wait_rendering(brw->batch.batch.bo);
1048 }
1049
1050 /* Start a new batch buffer. */
1051 brw_new_batch(brw);
1052
1053 return ret;
1054 }
1055
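/* Check whether the BOs referenced by this batch, plus extra_space, still fit
 * under the screen's aperture threshold.
 */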
1056 bool
1057 brw_batch_has_aperture_space(struct brw_context *brw, unsigned extra_space)
1058 {
1059 return brw->batch.aperture_space + extra_space <=
1060 brw->screen->aperture_threshold;
1061 }
1062
1063 bool
1064 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
1065 {
1066 unsigned index = READ_ONCE(bo->index);
1067 if (index < batch->exec_count && batch->exec_bos[index] == bo)
1068 return true;
1069
1070 for (int i = 0; i < batch->exec_count; i++) {
1071 if (batch->exec_bos[i] == bo)
1072 return true;
1073 }
1074 return false;
1075 }
1076
1077 /* This is the only way buffers get added to the validation list.
1078  */
1079 static uint64_t
1080 emit_reloc(struct intel_batchbuffer *batch,
1081 struct brw_reloc_list *rlist, uint32_t offset,
1082 struct brw_bo *target, int32_t target_offset,
1083 unsigned int reloc_flags)
1084 {
1085 assert(target != NULL);
1086
1087 if (rlist->reloc_count == rlist->reloc_array_size) {
1088 rlist->reloc_array_size *= 2;
1089 rlist->relocs = realloc(rlist->relocs,
1090 rlist->reloc_array_size *
1091 sizeof(struct drm_i915_gem_relocation_entry));
1092 }
1093
1094 unsigned int index = add_exec_bo(batch, target);
1095 struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
1096
1097 if (reloc_flags & RELOC_32BIT) {
1098 /* Restrict this buffer to the low 32 bits of the address space.
1099 *
1100 * Altering the validation list flags restricts it for this batch,
1101 * but we also alter the BO's kflags to restrict it permanently
1102 * (until the BO is destroyed and put back in the cache). Buffers
1103       * may stay bound across batches, and we want to keep them constrained.
1104 */
1105 target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
1106 entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
1107
1108 /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
1109 reloc_flags &= ~RELOC_32BIT;
1110 }
1111
1112 if (reloc_flags)
1113 entry->flags |= reloc_flags & batch->valid_reloc_flags;
1114
1115 rlist->relocs[rlist->reloc_count++] =
1116 (struct drm_i915_gem_relocation_entry) {
1117 .offset = offset,
1118 .delta = target_offset,
1119 .target_handle = batch->use_batch_first ? index : target->gem_handle,
1120 .presumed_offset = entry->offset,
1121 };
1122
1123 /* Using the old buffer offset, write in what the right data would be, in
1124 * case the buffer doesn't move and we can short-circuit the relocation
1125    * processing in the kernel.
1126 */
1127 return entry->offset + target_offset;
1128 }
1129
1130 uint64_t
1131 brw_batch_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
1132 struct brw_bo *target, uint32_t target_offset,
1133 unsigned int reloc_flags)
1134 {
1135 assert(batch_offset <= batch->batch.bo->size - sizeof(uint32_t));
1136
1137 return emit_reloc(batch, &batch->batch_relocs, batch_offset,
1138 target, target_offset, reloc_flags);
1139 }
1140
1141 uint64_t
1142 brw_state_reloc(struct intel_batchbuffer *batch, uint32_t state_offset,
1143 struct brw_bo *target, uint32_t target_offset,
1144 unsigned int reloc_flags)
1145 {
1146 assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
1147
1148 return emit_reloc(batch, &batch->state_relocs, state_offset,
1149 target, target_offset, reloc_flags);
1150 }
1151
1152
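/* Return the size recorded for the brw_state_batch() allocation at the given
 * statebuffer offset. The table is only populated when DEBUG_BATCH is set;
 * the batch decoder uses this to determine how many entries a variable-length
 * state block contains.
 */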
1153 uint32_t
1154 brw_state_batch_size(struct brw_context *brw, uint32_t offset)
1155 {
1156 struct hash_entry *entry =
1157 _mesa_hash_table_search(brw->batch.state_batch_sizes,
1158 (void *) (uintptr_t) offset);
1159 return entry ? (uintptr_t) entry->data : 0;
1160 }
1161
1162 /**
1163 * Reserve some space in the statebuffer, or flush.
1164 *
1165 * This is used to estimate when we're near the end of the batch,
1166 * so we can flush early.
1167 */
1168 void
1169 brw_require_statebuffer_space(struct brw_context *brw, int size)
1170 {
1171 if (brw->batch.state_used + size >= STATE_SZ)
1172 intel_batchbuffer_flush(brw);
1173 }
1174
1175 /**
1176 * Allocates a block of space in the batchbuffer for indirect state.
1177 */
1178 void *
1179 brw_state_batch(struct brw_context *brw,
1180 int size,
1181 int alignment,
1182 uint32_t *out_offset)
1183 {
1184 struct intel_batchbuffer *batch = &brw->batch;
1185
1186 assert(size < batch->state.bo->size);
1187
1188 uint32_t offset = ALIGN(batch->state_used, alignment);
1189
1190 if (offset + size >= STATE_SZ && !batch->no_wrap) {
1191 intel_batchbuffer_flush(brw);
1192 offset = ALIGN(batch->state_used, alignment);
1193 } else if (offset + size >= batch->state.bo->size) {
1194 const unsigned new_size =
1195 MIN2(batch->state.bo->size + batch->state.bo->size / 2,
1196 MAX_STATE_SIZE);
1197 grow_buffer(brw, &batch->state, batch->state_used, new_size);
1198 assert(offset + size < batch->state.bo->size);
1199 }
1200
1201 if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
1202 _mesa_hash_table_insert(batch->state_batch_sizes,
1203 (void *) (uintptr_t) offset,
1204 (void *) (uintptr_t) size);
1205 }
1206
1207 batch->state_used = offset + size;
1208
1209 *out_offset = offset;
1210 return batch->state.map + (offset >> 2);
1211 }
1212
1213 void
1214 intel_batchbuffer_data(struct brw_context *brw,
1215 const void *data, GLuint bytes, enum brw_gpu_ring ring)
1216 {
1217 assert((bytes & 3) == 0);
1218 intel_batchbuffer_require_space(brw, bytes, ring);
1219 memcpy(brw->batch.map_next, data, bytes);
1220 brw->batch.map_next += bytes >> 2;
1221 }
1222
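/* Emit one MI_LOAD_REGISTER_MEM per DWord, loading `size` consecutive DWords
 * from `bo` at `offset` into the registers starting at `reg`.
 */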
1223 static void
1224 load_sized_register_mem(struct brw_context *brw,
1225 uint32_t reg,
1226 struct brw_bo *bo,
1227 uint32_t offset,
1228 int size)
1229 {
1230 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1231 int i;
1232
1233 /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
1234 assert(devinfo->gen >= 7);
1235
1236 if (devinfo->gen >= 8) {
1237 BEGIN_BATCH(4 * size);
1238 for (i = 0; i < size; i++) {
1239 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
1240 OUT_BATCH(reg + i * 4);
1241 OUT_RELOC64(bo, 0, offset + i * 4);
1242 }
1243 ADVANCE_BATCH();
1244 } else {
1245 BEGIN_BATCH(3 * size);
1246 for (i = 0; i < size; i++) {
1247 OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
1248 OUT_BATCH(reg + i * 4);
1249 OUT_RELOC(bo, 0, offset + i * 4);
1250 }
1251 ADVANCE_BATCH();
1252 }
1253 }
1254
1255 void
1256 brw_load_register_mem(struct brw_context *brw,
1257 uint32_t reg,
1258 struct brw_bo *bo,
1259 uint32_t offset)
1260 {
1261 load_sized_register_mem(brw, reg, bo, offset, 1);
1262 }
1263
1264 void
1265 brw_load_register_mem64(struct brw_context *brw,
1266 uint32_t reg,
1267 struct brw_bo *bo,
1268 uint32_t offset)
1269 {
1270 load_sized_register_mem(brw, reg, bo, offset, 2);
1271 }
1272
1273 /*
1274 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
1275 */
1276 void
1277 brw_store_register_mem32(struct brw_context *brw,
1278 struct brw_bo *bo, uint32_t reg, uint32_t offset)
1279 {
1280 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1281
1282 assert(devinfo->gen >= 6);
1283
1284 if (devinfo->gen >= 8) {
1285 BEGIN_BATCH(4);
1286 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1287 OUT_BATCH(reg);
1288 OUT_RELOC64(bo, RELOC_WRITE, offset);
1289 ADVANCE_BATCH();
1290 } else {
1291 BEGIN_BATCH(3);
1292 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1293 OUT_BATCH(reg);
1294 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1295 ADVANCE_BATCH();
1296 }
1297 }
1298
1299 /*
1300 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
1301 */
1302 void
1303 brw_store_register_mem64(struct brw_context *brw,
1304 struct brw_bo *bo, uint32_t reg, uint32_t offset)
1305 {
1306 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1307
1308 assert(devinfo->gen >= 6);
1309
1310 /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
1311 * read a full 64-bit register, we need to do two of them.
1312 */
1313 if (devinfo->gen >= 8) {
1314 BEGIN_BATCH(8);
1315 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1316 OUT_BATCH(reg);
1317 OUT_RELOC64(bo, RELOC_WRITE, offset);
1318 OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1319 OUT_BATCH(reg + sizeof(uint32_t));
1320 OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
1321 ADVANCE_BATCH();
1322 } else {
1323 BEGIN_BATCH(6);
1324 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1325 OUT_BATCH(reg);
1326 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1327 OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1328 OUT_BATCH(reg + sizeof(uint32_t));
1329 OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
1330 ADVANCE_BATCH();
1331 }
1332 }
1333
1334 /*
1335 * Write a 32-bit register using immediate data.
1336 */
1337 void
1338 brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
1339 {
1340 assert(brw->screen->devinfo.gen >= 6);
1341
1342 BEGIN_BATCH(3);
1343 OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
1344 OUT_BATCH(reg);
1345 OUT_BATCH(imm);
1346 ADVANCE_BATCH();
1347 }
1348
1349 /*
1350 * Write a 64-bit register using immediate data.
1351 */
1352 void
1353 brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
1354 {
1355 assert(brw->screen->devinfo.gen >= 6);
1356
1357 BEGIN_BATCH(5);
1358 OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
1359 OUT_BATCH(reg);
1360 OUT_BATCH(imm & 0xffffffff);
1361 OUT_BATCH(reg + 4);
1362 OUT_BATCH(imm >> 32);
1363 ADVANCE_BATCH();
1364 }
1365
1366 /*
1367 * Copies a 32-bit register.
1368 */
1369 void
1370 brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
1371 {
1372 assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
1373
1374 BEGIN_BATCH(3);
1375 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1376 OUT_BATCH(src);
1377 OUT_BATCH(dest);
1378 ADVANCE_BATCH();
1379 }
1380
1381 /*
1382 * Copies a 64-bit register.
1383 */
1384 void
1385 brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
1386 {
1387 assert(brw->screen->devinfo.gen >= 8 || brw->screen->devinfo.is_haswell);
1388
1389 BEGIN_BATCH(6);
1390 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1391 OUT_BATCH(src);
1392 OUT_BATCH(dest);
1393 OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1394 OUT_BATCH(src + sizeof(uint32_t));
1395 OUT_BATCH(dest + sizeof(uint32_t));
1396 ADVANCE_BATCH();
1397 }
1398
1399 /*
1400 * Write 32-bits of immediate data to a GPU memory buffer.
1401 */
1402 void
1403 brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
1404 uint32_t offset, uint32_t imm)
1405 {
1406 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1407
1408 assert(devinfo->gen >= 6);
1409
1410 BEGIN_BATCH(4);
1411 OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
1412 if (devinfo->gen >= 8)
1413 OUT_RELOC64(bo, RELOC_WRITE, offset);
1414 else {
1415 OUT_BATCH(0); /* MBZ */
1416 OUT_RELOC(bo, RELOC_WRITE, offset);
1417 }
1418 OUT_BATCH(imm);
1419 ADVANCE_BATCH();
1420 }
1421
1422 /*
1423 * Write 64-bits of immediate data to a GPU memory buffer.
1424 */
1425 void
1426 brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1427 uint32_t offset, uint64_t imm)
1428 {
1429 const struct gen_device_info *devinfo = &brw->screen->devinfo;
1430
1431 assert(devinfo->gen >= 6);
1432
1433 BEGIN_BATCH(5);
1434 OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1435 if (devinfo->gen >= 8)
1436 OUT_RELOC64(bo, RELOC_WRITE, offset);
1437 else {
1438 OUT_BATCH(0); /* MBZ */
1439 OUT_RELOC(bo, RELOC_WRITE, offset);
1440 }
1441 OUT_BATCH(imm & 0xffffffffu);
1442 OUT_BATCH(imm >> 32);
1443 ADVANCE_BATCH();
1444 }