/*
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "common/gen_decoder.h"

#include "util/hash_table.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                        bool has_llc);

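/* Hash/compare callbacks for the state_batch_sizes table, which maps batch
 * offsets (stored directly in pointer-sized keys) to the size of the state
 * object emitted there.  Only allocated when DEBUG_BATCH is set.
 */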
static bool
uint_key_compare(const void *a, const void *b)
{
   return a == b;
}

static uint32_t
uint_key_hash(const void *key)
{
   return (uintptr_t) key;
}

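/* On non-LLC platforms the batch is built in a malloc'd CPU shadow buffer
 * and uploaded with drm_intel_bo_subdata() at flush time; with LLC we write
 * the mapped BO directly.
 */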
void
intel_batchbuffer_init(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                       bool has_llc)
{
   intel_batchbuffer_reset(batch, bufmgr, has_llc);

   if (!has_llc) {
      batch->cpu_map = malloc(BATCH_SZ);
      batch->map = batch->cpu_map;
      batch->map_next = batch->cpu_map;
   }

   if (INTEL_DEBUG & DEBUG_BATCH) {
      batch->state_batch_sizes =
         _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
   }
}

static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch, dri_bufmgr *bufmgr,
                        bool has_llc)
{
   if (batch->last_bo != NULL) {
      drm_intel_bo_unreference(batch->last_bo);
      batch->last_bo = NULL;
   }
   batch->last_bo = batch->bo;

   batch->bo = drm_intel_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
   if (has_llc) {
      drm_intel_bo_map(batch->bo, true);
      batch->map = batch->bo->virtual;
   }
   batch->map_next = batch->map;

   batch->reserved_space = BATCH_RESERVED;
   batch->state_batch_offset = batch->bo->size;
   batch->needs_sol_reset = false;
   batch->state_base_address_emitted = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   batch->ring = UNKNOWN_RING;

   if (batch->state_batch_sizes)
      _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
}

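/* Allocate a fresh batch BO and reset the render-cache tracking set, which
 * only tracks buffers rendered to within the current batch.
 */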
static void
intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
{
   intel_batchbuffer_reset(&brw->batch, brw->bufmgr, brw->has_llc);
   brw_render_cache_set_clear(brw);
}

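/* intel_batchbuffer_save_state() and intel_batchbuffer_reset_to_saved() let
 * callers snapshot the batch pointer and relocation count before emitting
 * commands, and roll the batch back to that point if the attempt has to be
 * abandoned (for instance, when a draw call needs to be retried in a fresh
 * batch).
 */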
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw->batch.ring = UNKNOWN_RING;
}

void
intel_batchbuffer_free(struct intel_batchbuffer *batch)
{
   free(batch->cpu_map);
   drm_intel_bo_unreference(batch->last_bo);
   drm_intel_bo_unreference(batch->bo);
   if (batch->state_batch_sizes)
      _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
}

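/* Make sure there is room for at least 'sz' bytes of commands on the given
 * ring, flushing the current batch if it is too full or was built for a
 * different ring.
 */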
void
intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                enum brw_gpu_ring ring)
{
   /* If we're switching rings, implicitly flush the batch. */
   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
       brw->gen >= 6) {
      intel_batchbuffer_flush(brw);
   }

#ifdef DEBUG
   assert(sz < BATCH_SZ - BATCH_RESERVED);
#endif
   if (intel_batchbuffer_space(&brw->batch) < sz)
      intel_batchbuffer_flush(brw);

   /* The intel_batchbuffer_flush() calls above might have changed
    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
    */
   brw->batch.ring = ring;
}

#ifdef DEBUG
#define CSI "\e["
#define BLUE_HEADER  CSI "0;44m"
#define NORMAL       CSI "0m"

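/* decode_struct() and decode_structs() look up a named state structure in
 * the genxml spec and pretty-print one instance (or an array of instances)
 * found at the given offset in the batch/state buffer.
 */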
static void
decode_struct(struct brw_context *brw, struct gen_spec *spec,
              const char *struct_name, uint32_t *data,
              uint32_t gtt_offset, uint32_t offset, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   fprintf(stderr, "%s\n", struct_name);
   gen_print_group(stderr, group, gtt_offset + offset,
                   &data[offset / 4], 0, color);
}

static void
decode_structs(struct brw_context *brw, struct gen_spec *spec,
               const char *struct_name,
               uint32_t *data, uint32_t gtt_offset, uint32_t offset,
               int struct_size, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   int entries = brw_state_batch_size(brw, offset) / struct_size;
   for (int i = 0; i < entries; i++) {
      fprintf(stderr, "%s %d\n", struct_name, i);
      gen_print_group(stderr, group, gtt_offset + offset,
                      &data[(offset + i * struct_size) / 4], 0, color);
   }
}

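/* Decode and print the batch that was just submitted using the genxml-based
 * decoder.  Only render-ring batches are decoded; BLT batches are skipped.
 */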
static void
do_batch_dump(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);

   if (batch->ring != RENDER_RING)
      return;

   int ret = drm_intel_bo_map(batch->bo, false);
   if (ret != 0) {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));
   }

   uint32_t *data = batch->bo->virtual ? batch->bo->virtual : batch->map;
   uint32_t *end = data + USED_BATCH(*batch);
   uint32_t gtt_offset = batch->bo->virtual ? batch->bo->offset64 : 0;
   unsigned int length;

   bool color = INTEL_DEBUG & DEBUG_COLOR;
   const char *header_color = color ? BLUE_HEADER : "";
   const char *reset_color = color ? NORMAL : "";

   for (uint32_t *p = data; p < end; p += length) {
      struct gen_group *inst = gen_spec_find_instruction(spec, p);
      if (inst == NULL) {
         fprintf(stderr, "unknown instruction %08x\n", p[0]);
         length = (p[0] & 0xff) + 2;
         continue;
      }

      uint64_t offset = gtt_offset + 4 * (p - data);

      fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
              offset, p[0], gen_group_get_name(inst), reset_color);

      gen_print_group(stderr, inst, offset, p, 1, color);

      switch (gen_group_get_opcode(inst) >> 16) {
      case _3DSTATE_PIPELINED_POINTERS:
         /* TODO: Decode Gen4-5 pipelined pointers */
         break;
      case _3DSTATE_BINDING_TABLE_POINTERS_VS:
      case _3DSTATE_BINDING_TABLE_POINTERS_HS:
      case _3DSTATE_BINDING_TABLE_POINTERS_DS:
      case _3DSTATE_BINDING_TABLE_POINTERS_GS:
      case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
         struct gen_group *group =
            gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
         if (!group)
            break;

         uint32_t bt_offset = p[1] & ~0x1fu;
         int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
         uint32_t *bt_pointers = &data[bt_offset / 4];
         for (int i = 0; i < bt_entries; i++) {
            fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
            gen_print_group(stderr, group, gtt_offset + bt_pointers[i],
                            &data[bt_pointers[i] / 4], 0, color);
         }
         break;
      }
      case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
         decode_structs(brw, spec, "SAMPLER_STATE", data,
                        gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS:
         decode_structs(brw, spec, "CLIP_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
         decode_structs(brw, spec, "SF_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         decode_structs(brw, spec, "CC_VIEWPORT", data,
                        gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
         decode_structs(brw, spec, "CC_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
         decode_structs(brw, spec, "SF_CLIP_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
         break;
      case _3DSTATE_SCISSOR_STATE_POINTERS:
         decode_structs(brw, spec, "SCISSOR_RECT", data,
                        gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
         break;
      case _3DSTATE_BLEND_STATE_POINTERS:
         /* TODO: handle Gen8+ extra dword at the beginning */
         decode_structs(brw, spec, "BLEND_STATE", data,
                        gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         break;
      case _3DSTATE_CC_STATE_POINTERS:
         if (brw->gen >= 7) {
            decode_struct(brw, spec, "COLOR_CALC_STATE", data,
                          gtt_offset, p[1] & ~0x3fu, color);
         } else if (brw->gen == 6) {
            decode_structs(brw, spec, "BLEND_STATE", data,
                           gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
            decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
                          gtt_offset, p[2] & ~0x3fu, color);
            decode_struct(brw, spec, "COLOR_CALC_STATE", data,
                          gtt_offset, p[3] & ~0x3fu, color);
         }
         break;
      case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
         decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
                       gtt_offset, p[1] & ~0x3fu, color);
         break;
      }

      length = gen_group_get_length(inst, p);
   }

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);
   }
}
#else
static void do_batch_dump(struct brw_context *brw) { }
#endif

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   if (brw->batch.ring == RENDER_RING) {
      /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT
       * and therefore assume that the L3 cache is configured according to the
       * hardware defaults.
       */
      if (brw->gen >= 7)
         gen7_restore_default_l3_config(brw);

      if (brw->is_haswell) {
         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
          * 3DSTATE_CC_STATE_POINTERS > "Note":
          *
          *    "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
          *     3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
          *
          * From the example in the docs, it seems to expect a regular pipe control
          * flush here as well. We may have done it already, but meh.
          *
          * See also WaAvoidRCZCounterRollover.
          */
         brw_emit_mi_flush(brw);
         BEGIN_BATCH(2);
         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
         OUT_BATCH(brw->cc.state_offset | 1);
         ADVANCE_BATCH();
         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                          PIPE_CONTROL_CS_STALL);
      }
   }

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point, the
    * round trips for swap/copy and getting new buffers are done, and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *dri_screen = brw->screen->driScrnPriv;
      drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* TODO: Push this whole function into bufmgr.
 */
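/* Hand the finished batch to the kernel: on non-LLC platforms, upload the
 * CPU shadow copy (commands and state) into the BO first, then pick the
 * target ring/flags and submit through the libdrm exec functions.
 */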
static int
do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->screen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            assert(in_fence_fd == -1);
            assert(out_fence_fd == NULL);
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
                                        NULL, 0, 0, flags);
         } else {
            ret = drm_intel_gem_bo_fence_exec(batch->bo, brw->hw_ctx,
                                              4 * USED_BATCH(*batch),
                                              in_fence_fd, out_fence_fd,
                                              flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

/**
 * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
 * of the fd.
 *
 * The out_fence_fd is ignored if NULL.  Otherwise, the caller takes ownership
 * of the returned fd.
 */
int
_intel_batchbuffer_flush_fence(struct brw_context *brw,
                               int in_fence_fd, int *out_fence_fd,
                               const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
   if (USED_BATCH(brw->batch) & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw, in_fence_fd, out_fence_fd);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/* This is the only way buffers get added to the validate list.
 */
uint32_t
intel_batchbuffer_reloc(struct intel_batchbuffer *batch,
                        drm_intel_bo *buffer, uint32_t offset,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(batch->bo, offset,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return buffer->offset64 + delta;
}

uint64_t
intel_batchbuffer_reloc64(struct intel_batchbuffer *batch,
                          drm_intel_bo *buffer, uint32_t offset,
                          uint32_t read_domains, uint32_t write_domain,
                          uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(batch->bo, offset,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return buffer->offset64 + delta;
}


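/* Copy a block of pre-built dwords into the batch, reserving space (and
 * selecting the ring) first.  'bytes' must be a multiple of 4.
 */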
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}

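/* Emit 'size' consecutive MI_LOAD_REGISTER_MEM commands, loading registers
 * reg, reg+4, ... from successive dwords at 'offset' in 'bo'.  Gen8+ uses
 * 64-bit relocations, so each command is one dword longer there.
 */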
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}

/*
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         drm_intel_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      ADVANCE_BATCH();
   }
}

/*
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         drm_intel_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (brw->gen >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}

/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write a 64-bit register using immediate data.
 */
void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm & 0xffffffff);
   OUT_BATCH(reg + 4);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}

/*
 * Copies a 32-bit register.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}

/*
 * Copies a 64-bit register.
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}

/*
 * Write 32-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm32(struct brw_context *brw, drm_intel_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write 64-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm64(struct brw_context *brw, drm_intel_bo *bo,
                     uint32_t offset, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm & 0xffffffffu);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}