src/mesa/drivers/dri/i965/intel_batchbuffer.c
/*
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "brw_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "common/gen_decoder.h"

#include "util/hash_table.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
                        drm_bacon_bufmgr *bufmgr,
                        bool has_llc);

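/* Hash table callbacks for batch->state_batch_sizes (only allocated when
 * INTEL_DEBUG & DEBUG_BATCH is set): integer offsets are stored directly in
 * the pointer-sized keys, so an identity hash and pointer comparison suffice.
 */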
static bool
uint_key_compare(const void *a, const void *b)
{
   return a == b;
}

static uint32_t
uint_key_hash(const void *key)
{
   return (uintptr_t) key;
}

void
intel_batchbuffer_init(struct intel_batchbuffer *batch,
                       drm_bacon_bufmgr *bufmgr,
                       bool has_llc)
{
   intel_batchbuffer_reset(batch, bufmgr, has_llc);

   if (!has_llc) {
      batch->cpu_map = malloc(BATCH_SZ);
      batch->map = batch->cpu_map;
      batch->map_next = batch->cpu_map;
   }

   if (INTEL_DEBUG & DEBUG_BATCH) {
      batch->state_batch_sizes =
         _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
   }
}

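/* Discard any existing batch contents and start over with a freshly
 * allocated batch BO.  A reference to the previous batch BO is retained in
 * batch->last_bo and released on the next reset.
 */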
static void
intel_batchbuffer_reset(struct intel_batchbuffer *batch,
                        drm_bacon_bufmgr *bufmgr,
                        bool has_llc)
{
   if (batch->last_bo != NULL) {
      drm_bacon_bo_unreference(batch->last_bo);
      batch->last_bo = NULL;
   }
   batch->last_bo = batch->bo;

   batch->bo = drm_bacon_bo_alloc(bufmgr, "batchbuffer", BATCH_SZ, 4096);
   if (has_llc) {
      drm_bacon_bo_map(batch->bo, true);
      batch->map = batch->bo->virtual;
   }
   batch->map_next = batch->map;

   batch->reserved_space = BATCH_RESERVED;
   batch->state_batch_offset = batch->bo->size;
   batch->needs_sol_reset = false;
   batch->state_base_address_emitted = false;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   batch->ring = UNKNOWN_RING;

   if (batch->state_batch_sizes)
      _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
}

static void
intel_batchbuffer_reset_and_clear_render_cache(struct brw_context *brw)
{
   intel_batchbuffer_reset(&brw->batch, brw->bufmgr, brw->has_llc);
   brw_render_cache_set_clear(brw);
}

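/* Snapshot the current batch write pointer and relocation count so that a
 * partially assembled batch can be rolled back with
 * intel_batchbuffer_reset_to_saved().
 */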
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.reloc_count =
      drm_bacon_gem_bo_get_reloc_count(brw->batch.bo);
}

void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_bacon_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw->batch.ring = UNKNOWN_RING;
}

void
intel_batchbuffer_free(struct intel_batchbuffer *batch)
{
   free(batch->cpu_map);
   drm_bacon_bo_unreference(batch->last_bo);
   drm_bacon_bo_unreference(batch->bo);
   if (batch->state_batch_sizes)
      _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
}

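/* Make sure at least sz bytes are available in the batch for the given ring,
 * flushing first if the ring changes or the remaining space is too small.
 */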
void
intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
                                enum brw_gpu_ring ring)
{
   /* If we're switching rings, implicitly flush the batch. */
   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
       brw->gen >= 6) {
      intel_batchbuffer_flush(brw);
   }

#ifdef DEBUG
   assert(sz < BATCH_SZ - BATCH_RESERVED);
#endif
   if (intel_batchbuffer_space(&brw->batch) < sz)
      intel_batchbuffer_flush(brw);

   /* The intel_batchbuffer_flush() calls above might have changed
    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
    */
   brw->batch.ring = ring;
}

#ifdef DEBUG
#define CSI "\e["
#define BLUE_HEADER CSI "0;44m"
#define NORMAL CSI "0m"

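/* Decode helpers for do_batch_dump(): print a single named state structure,
 * or an array of structures, at the given offset in the state batch.
 */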
static void
decode_struct(struct brw_context *brw, struct gen_spec *spec,
              const char *struct_name, uint32_t *data,
              uint32_t gtt_offset, uint32_t offset, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   fprintf(stderr, "%s\n", struct_name);
   gen_print_group(stderr, group, gtt_offset + offset,
                   &data[offset / 4], color);
}

static void
decode_structs(struct brw_context *brw, struct gen_spec *spec,
               const char *struct_name,
               uint32_t *data, uint32_t gtt_offset, uint32_t offset,
               int struct_size, bool color)
{
   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
   if (!group)
      return;

   int entries = brw_state_batch_size(brw, offset) / struct_size;
   for (int i = 0; i < entries; i++) {
      fprintf(stderr, "%s %d\n", struct_name, i);
      gen_print_group(stderr, group, gtt_offset + offset,
                      &data[(offset + i * struct_size) / 4], color);
   }
}

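/* Map the submitted batch (falling back to the CPU copy if the mapping
 * fails) and decode it with gen_decoder, printing each command and any
 * indirect state it points to.
 */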
static void
do_batch_dump(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);

   if (batch->ring != RENDER_RING)
      return;

   int ret = drm_bacon_bo_map(batch->bo, false);
   if (ret != 0) {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));
   }

   uint32_t *data = batch->bo->virtual ? batch->bo->virtual : batch->map;
   uint32_t *end = data + USED_BATCH(*batch);
   uint32_t gtt_offset = batch->bo->virtual ? batch->bo->offset64 : 0;
   int length;

   bool color = INTEL_DEBUG & DEBUG_COLOR;
   const char *header_color = color ? BLUE_HEADER : "";
   const char *reset_color = color ? NORMAL : "";

   for (uint32_t *p = data; p < end; p += length) {
      struct gen_group *inst = gen_spec_find_instruction(spec, p);
      length = gen_group_get_length(inst, p);
      assert(inst == NULL || length > 0);
      length = MAX2(1, length);
      if (inst == NULL) {
         fprintf(stderr, "unknown instruction %08x\n", p[0]);
         continue;
      }

      uint64_t offset = gtt_offset + 4 * (p - data);

      fprintf(stderr, "%s0x%08"PRIx64": 0x%08x: %-80s%s\n", header_color,
              offset, p[0], gen_group_get_name(inst), reset_color);

      gen_print_group(stderr, inst, offset, p, color);

      switch (gen_group_get_opcode(inst) >> 16) {
      case _3DSTATE_PIPELINED_POINTERS:
         /* TODO: Decode Gen4-5 pipelined pointers */
         break;
      case _3DSTATE_BINDING_TABLE_POINTERS_VS:
      case _3DSTATE_BINDING_TABLE_POINTERS_HS:
      case _3DSTATE_BINDING_TABLE_POINTERS_DS:
      case _3DSTATE_BINDING_TABLE_POINTERS_GS:
      case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
         struct gen_group *group =
            gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
         if (!group)
            break;

         uint32_t bt_offset = p[1] & ~0x1fu;
         int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
         uint32_t *bt_pointers = &data[bt_offset / 4];
         for (int i = 0; i < bt_entries; i++) {
            fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
            gen_print_group(stderr, group, gtt_offset + bt_pointers[i],
                            &data[bt_pointers[i] / 4], color);
         }
         break;
      }
      case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
      case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
         decode_structs(brw, spec, "SAMPLER_STATE", data,
                        gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS:
         decode_structs(brw, spec, "CLIP_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
         decode_structs(brw, spec, "SF_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         decode_structs(brw, spec, "CC_VIEWPORT", data,
                        gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
         decode_structs(brw, spec, "CC_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
         break;
      case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
         decode_structs(brw, spec, "SF_CLIP_VIEWPORT", data,
                        gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
         break;
      case _3DSTATE_SCISSOR_STATE_POINTERS:
         decode_structs(brw, spec, "SCISSOR_RECT", data,
                        gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
         break;
      case _3DSTATE_BLEND_STATE_POINTERS:
         /* TODO: handle Gen8+ extra dword at the beginning */
         decode_structs(brw, spec, "BLEND_STATE", data,
                        gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
         break;
      case _3DSTATE_CC_STATE_POINTERS:
         if (brw->gen >= 7) {
            decode_struct(brw, spec, "COLOR_CALC_STATE", data,
                          gtt_offset, p[1] & ~0x3fu, color);
         } else if (brw->gen == 6) {
            decode_structs(brw, spec, "BLEND_STATE", data,
                           gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
            decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
                          gtt_offset, p[2] & ~0x3fu, color);
            decode_struct(brw, spec, "COLOR_CALC_STATE", data,
                          gtt_offset, p[3] & ~0x3fu, color);
         }
         break;
      case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
         decode_struct(brw, spec, "DEPTH_STENCIL_STATE", data,
                       gtt_offset, p[1] & ~0x3fu, color);
         break;
      }
   }

   if (ret == 0) {
      drm_bacon_bo_unmap(batch->bo);
   }
}
#else
static void do_batch_dump(struct brw_context *brw) { }
#endif

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_bacon_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   if (brw->batch.ring == RENDER_RING) {
      /* Work around L3 state leaks into contexts that set MI_RESTORE_INHIBIT
       * and assume that the L3 cache is configured according to the hardware
       * defaults.
       */
      if (brw->gen >= 7)
         gen7_restore_default_l3_config(brw);

      if (brw->is_haswell) {
         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
          * 3DSTATE_CC_STATE_POINTERS > "Note":
          *
          * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
          * 3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
          *
          * From the example in the docs, it seems to expect a regular pipe control
          * flush here as well.  We may have done it already, but meh.
          *
          * See also WaAvoidRCZCounterRollover.
          */
         brw_emit_mi_flush(brw);
         BEGIN_BATCH(2);
         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
         OUT_BATCH(brw->cc.state_offset | 1);
         ADVANCE_BATCH();
         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                          PIPE_CONTROL_CS_STALL);
      }
   }

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_bacon_bo_wait_rendering(brw->throttle_batch[1]);
         drm_bacon_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *dri_screen = brw->screen->driScrnPriv;
      drmCommandNone(dri_screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* TODO: Push this whole function into bufmgr.
 */
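/* Upload the batch contents to the GPU-visible BO (via subdata on non-LLC
 * platforms), pick the execution ring and flags, submit the batch, and then
 * apply frame throttling.
 */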
static int
do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_bacon_bo_unmap(batch->bo);
   } else {
      ret = drm_bacon_bo_subdata(batch->bo, 0, 4 * USED_BATCH(*batch), batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_bacon_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->screen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            assert(in_fence_fd == -1);
            assert(out_fence_fd == NULL);
            ret = drm_bacon_bo_mrb_exec(batch->bo, 4 * USED_BATCH(*batch),
                                        flags);
         } else {
            ret = drm_bacon_gem_bo_fence_exec(batch->bo, brw->hw_ctx,
                                              4 * USED_BATCH(*batch),
                                              in_fence_fd, out_fence_fd,
                                              flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

/**
 * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
 * of the fd.
 *
 * The out_fence_fd is ignored if NULL.  Otherwise, the caller takes ownership
 * of the returned fd.
 */
int
_intel_batchbuffer_flush_fence(struct brw_context *brw,
                               int in_fence_fd, int *out_fence_fd,
                               const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_bacon_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(&brw->batch, MI_BATCH_BUFFER_END);
   if (USED_BATCH(brw->batch) & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(&brw->batch, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw, in_fence_fd, out_fence_fd);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_bacon_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/* This is the only way buffers get added to the validate list.
 */
uint64_t
brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
               drm_bacon_bo *target, uint32_t target_offset,
               uint32_t read_domains, uint32_t write_domain)
{
   int ret;

   ret = drm_bacon_bo_emit_reloc(batch->bo, batch_offset,
                                 target, target_offset,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return target->offset64 + target_offset;
}

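/* Copy a block of dword-aligned data into the batch, making room first if
 * necessary.
 */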
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}

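/* Emit `size` consecutive MI_LOAD_REGISTER_MEM commands to load `size`
 * 32-bit registers, starting at `reg`, from successive dwords of `bo` at
 * `offset`.  Gen8+ uses the 4-dword form with a 64-bit relocation.
 */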
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_bacon_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_bacon_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_bacon_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}

/*
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         drm_bacon_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      ADVANCE_BATCH();
   }
}

/*
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         drm_bacon_bo *bo, uint32_t reg, uint32_t offset)
{
   assert(brw->gen >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (brw->gen >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}

/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write a 64-bit register using immediate data.
 */
void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm & 0xffffffff);
   OUT_BATCH(reg + 4);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}

/*
 * Copies a 32-bit register.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}

/*
 * Copies a 64-bit register.
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t src, uint32_t dest)
{
   assert(brw->gen >= 8 || brw->is_haswell);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}

/*
 * Write 32-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm32(struct brw_context *brw, drm_bacon_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write 64-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm64(struct brw_context *brw, drm_bacon_bo *bo,
                     uint32_t offset, uint64_t imm)
{
   assert(brw->gen >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   if (brw->gen >= 8)
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                offset);
   }
   OUT_BATCH(imm & 0xffffffffu);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}