intel/blorp: Add indirect clear color support to mcs_partial_resolve
[mesa.git] / src / intel / blorp / blorp_genX_exec.h
1 /*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #ifndef BLORP_GENX_EXEC_H
25 #define BLORP_GENX_EXEC_H
26
27 #include "blorp_priv.h"
28 #include "common/gen_device_info.h"
29 #include "common/gen_sample_positions.h"
30 #include "genxml/gen_macros.h"
31
32 /**
33 * This file provides the blorp pipeline setup and execution functionality.
34 * It defines the following function:
35 *
36 * static void
37 * blorp_exec(struct blorp_context *blorp, void *batch_data,
38 * const struct blorp_params *params);
39 *
40 * It is the job of whoever includes this header to wrap this in something
41 * to get an externally visible symbol.
42 *
43 * In order for the blorp_exec function to work, the driver must provide
44 * implementations of the following static helper functions.
45 */
46
/*
 * Driver-provided hooks.
 *
 * None of the functions declared in this section are defined by this header;
 * the driver that includes it has to supply each of them.
 */

/* Reserves room for n dwords in the batch and returns where to write them.
 * The emit macros in this file skip packing when this returns NULL.
 */
static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n);

/* Relocation hook.  _blorp_combine_address() forwards here whenever the
 * address being packed has a non-NULL buffer and uses the returned 64-bit
 * value as the combined address.
 */
static uint64_t
blorp_emit_reloc(struct blorp_batch *batch,
                 void *location,
                 struct blorp_address address,
                 uint32_t delta);

/* Allocates `size` bytes of dynamic state with the requested alignment.
 * Returns a CPU pointer for the caller to fill in; the state's offset is
 * reported through *offset.  Callers in this file treat NULL as failure.
 */
static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

/* Allocates `size` bytes for vertex data.  Returns a CPU pointer for the
 * caller to fill in; the buffer's address is reported through *addr.
 */
static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch,
                          uint32_t size,
                          struct blorp_address *addr);

#if GEN_GEN >= 8
/* Only required when building for gen8 and later. */
static struct blorp_address
blorp_get_workaround_page(struct blorp_batch *batch);
#endif

/* Binding table allocation hook.
 * NOTE(review): presumably fills bt_offset, surface_offsets and surface_maps
 * for num_entries surfaces -- confirm against the driver implementations.
 */
static void
blorp_alloc_binding_table(struct blorp_batch *batch,
                          unsigned num_entries,
                          unsigned state_size,
                          unsigned state_alignment,
                          uint32_t *bt_offset,
                          uint32_t *surface_offsets,
                          void **surface_maps);

/* Called after the CPU has written freshly allocated state or vertex data,
 * with the start and byte size of the range that was written.
 */
static void
blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);

/* Surface state relocation hook.
 * NOTE(review): presumably the surface-state counterpart of
 * blorp_emit_reloc() -- confirm against the driver implementations.
 */
static void
blorp_surface_reloc(struct blorp_batch *batch,
                    uint32_t ss_offset,
                    struct blorp_address address,
                    uint32_t delta);

#if GEN_GEN >= 7
/* Only required when building for gen7 and later. */
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch);
#endif

/* Programs the URB from the VS and SF entry sizes computed by
 * emit_urb_config().
 */
static void
blorp_emit_urb_config(struct blorp_batch *batch,
                      unsigned vs_entry_size,
                      unsigned sf_entry_size);

/* Pipeline emission hook taking the full set of blorp parameters. */
static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params);
93
94 /***** BEGIN blorp_exec implementation ******/
95
96 static uint64_t
97 _blorp_combine_address(struct blorp_batch *batch, void *location,
98 struct blorp_address address, uint32_t delta)
99 {
100 if (address.buffer == NULL) {
101 return address.offset + delta;
102 } else {
103 return blorp_emit_reloc(batch, location, address, delta);
104 }
105 }
106
107 #define __gen_address_type struct blorp_address
108 #define __gen_user_data struct blorp_batch
109 #define __gen_combine_address _blorp_combine_address
110
111 #include "genxml/genX_pack.h"
112
/* Token-pasting helpers that map a genxml command or state name onto the
 * generated symbols that belong to it (<name>_length, <name>_header, ...).
 */
#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
#define _blorp_cmd_header(cmd) cmd ## _header
#define _blorp_cmd_pack(cmd) cmd ## _pack

/* Emits one fixed-length command.
 *
 * Used as a statement prefix: the attached body runs exactly once with
 * `name` being a header-initialized `struct cmd`; afterwards the struct is
 * packed into the dwords obtained from blorp_emit_dwords().  If no dwords
 * could be obtained (NULL) the body is skipped and nothing is packed.
 *
 * Because this expands to a for loop, a `break` in the body would skip the
 * packing step.
 */
#define blorp_emit(batch, cmd, name)                                 \
   for (struct cmd name = { _blorp_cmd_header(cmd) },                \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd));    \
        __builtin_expect(_dst != NULL, 1);                           \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),            \
        _dst = NULL)

/* Emits the header of a variable-length command that is n dwords long in
 * total and evaluates to a pointer to the dword following the header
 * (dw[1]), or NULL if the dwords could not be obtained.
 *
 * `n` is parenthesized where it takes part in arithmetic so that arguments
 * such as `a ? b : c` or `x << y` produce the intended DWordLength.  It is
 * still expanded more than once, so it must not have side effects.
 */
#define blorp_emitn(batch, cmd, n) ({                                \
      uint32_t *_dw = blorp_emit_dwords(batch, n);                   \
      if (_dw) {                                                     \
         struct cmd template = {                                     \
            _blorp_cmd_header(cmd),                                  \
            .DWordLength = (n) - _blorp_cmd_length_bias(cmd),        \
         };                                                          \
         _blorp_cmd_pack(cmd)(batch, _dw, &template);                \
      }                                                              \
      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */              \
   })

/* Evaluates to a `struct S` whose bytes are all zero. */
#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })

/* Dynamic-state counterpart of blorp_emit().
 *
 * The attached body runs once with `name` being a zeroed `struct state`.
 * Afterwards the struct is packed into memory obtained from
 * blorp_alloc_dynamic_state() (which also reports the state offset through
 * `offset`) and the written range is passed to blorp_flush_range().  If the
 * allocation yields NULL the body is skipped and nothing is packed.
 */
#define blorp_emit_dynamic(batch, state, name, align, offset)        \
   for (struct state name = STRUCT_ZERO(state),                      \
        *_dst = blorp_alloc_dynamic_state(batch,                     \
                                   _blorp_cmd_length(state) * 4,     \
                                   align, offset);                   \
        __builtin_expect(_dst != NULL, 1);                           \
        _blorp_cmd_pack(state)(batch, (void *)_dst, &name),          \
        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4), \
        _dst = NULL)
148
149 /* 3DSTATE_URB
150 * 3DSTATE_URB_VS
151 * 3DSTATE_URB_HS
152 * 3DSTATE_URB_DS
153 * 3DSTATE_URB_GS
154 *
155 * Assign the entire URB to the VS. Even though the VS disabled, URB space
156 * is still needed because the clipper loads the VUE's from the URB. From
157 * the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
158 * Dword 1.15:0 "VS Number of URB Entries":
159 * This field is always used (even if VS Function Enable is DISABLED).
160 *
161 * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
162 * safely ignore it because this batch contains only one draw call.
163 * Because of URB corruption caused by allocating a previous GS unit
164 * URB entry to the VS unit, software is required to send a “GS NULL
165 * Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
166 * plus a dummy DRAW call before any case where VS will be taking over
167 * GS URB space.
168 *
169 * If the 3DSTATE_URB_VS is emitted, than the others must be also.
170 * From the Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
171 *
172 * 3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
173 * programmed in order for the programming of this state to be
174 * valid.
175 */
176 static void
177 emit_urb_config(struct blorp_batch *batch,
178 const struct blorp_params *params)
179 {
180 /* Once vertex fetcher has written full VUE entries with complete
181 * header the space requirement is as follows per vertex (in bytes):
182 *
183 * Header Position Program constants
184 * +--------+------------+-------------------+
185 * | 16 | 16 | n x 16 |
186 * +--------+------------+-------------------+
187 *
188 * where 'n' stands for number of varying inputs expressed as vec4s.
189 */
190 const unsigned num_varyings =
191 params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
192 const unsigned total_needed = 16 + 16 + num_varyings * 16;
193
194 /* The URB size is expressed in units of 64 bytes (512 bits) */
195 const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);
196
197 const unsigned sf_entry_size =
198 params->sf_prog_data ? params->sf_prog_data->urb_entry_size : 0;
199
200 blorp_emit_urb_config(batch, vs_entry_size, sf_entry_size);
201 }
202
203 static void
204 blorp_emit_vertex_data(struct blorp_batch *batch,
205 const struct blorp_params *params,
206 struct blorp_address *addr,
207 uint32_t *size)
208 {
209 const float vertices[] = {
210 /* v0 */ (float)params->x1, (float)params->y1, params->z,
211 /* v1 */ (float)params->x0, (float)params->y1, params->z,
212 /* v2 */ (float)params->x0, (float)params->y0, params->z,
213 };
214
215 void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
216 memcpy(data, vertices, sizeof(vertices));
217 *size = sizeof(vertices);
218 blorp_flush_range(batch, data, *size);
219 }
220
221 static void
222 blorp_emit_input_varying_data(struct blorp_batch *batch,
223 const struct blorp_params *params,
224 struct blorp_address *addr,
225 uint32_t *size)
226 {
227 const unsigned vec4_size_in_bytes = 4 * sizeof(float);
228 const unsigned max_num_varyings =
229 DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
230 const unsigned num_varyings =
231 params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
232
233 *size = 16 + num_varyings * vec4_size_in_bytes;
234
235 const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
236 void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
237 uint32_t *inputs = data;
238
239 /* Copy in the VS inputs */
240 assert(sizeof(params->vs_inputs) == 16);
241 memcpy(inputs, &params->vs_inputs, sizeof(params->vs_inputs));
242 inputs += 4;
243
244 if (params->wm_prog_data) {
245 /* Walk over the attribute slots, determine if the attribute is used by
246 * the program and when necessary copy the values from the input storage
247 * to the vertex data buffer.
248 */
249 for (unsigned i = 0; i < max_num_varyings; i++) {
250 const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;
251
252 const int input_index = params->wm_prog_data->urb_setup[attr];
253 if (input_index < 0)
254 continue;
255
256 memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);
257
258 inputs += 4;
259 }
260 }
261
262 blorp_flush_range(batch, data, *size);
263 }
264
265 static void
266 blorp_fill_vertex_buffer_state(struct blorp_batch *batch,
267 struct GENX(VERTEX_BUFFER_STATE) *vb,
268 unsigned idx,
269 struct blorp_address addr, uint32_t size,
270 uint32_t stride)
271 {
272 vb[idx].VertexBufferIndex = idx;
273 vb[idx].BufferStartingAddress = addr;
274 vb[idx].BufferPitch = stride;
275
276 #if GEN_GEN >= 6
277 vb[idx].VertexBufferMOCS = addr.mocs;
278 #endif
279
280 #if GEN_GEN >= 7
281 vb[idx].AddressModifyEnable = true;
282 #endif
283
284 #if GEN_GEN >= 8
285 vb[idx].BufferSize = size;
286 #elif GEN_GEN >= 5
287 vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
288 vb[idx].EndAddress = vb[idx].BufferStartingAddress;
289 vb[idx].EndAddress.offset += size - 1;
290 #elif GEN_GEN == 4
291 vb[idx].BufferAccessType = stride > 0 ? VERTEXDATA : INSTANCEDATA;
292 vb[idx].MaxIndex = stride > 0 ? size / stride : 0;
293 #endif
294 }
295
296 static void
297 blorp_emit_vertex_buffers(struct blorp_batch *batch,
298 const struct blorp_params *params)
299 {
300 struct GENX(VERTEX_BUFFER_STATE) vb[3];
301 memset(vb, 0, sizeof(vb));
302
303 struct blorp_address addr;
304 uint32_t size;
305 blorp_emit_vertex_data(batch, params, &addr, &size);
306 blorp_fill_vertex_buffer_state(batch, vb, 0, addr, size, 3 * sizeof(float));
307
308 blorp_emit_input_varying_data(batch, params, &addr, &size);
309 blorp_fill_vertex_buffer_state(batch, vb, 1, addr, size, 0);
310
311 uint32_t num_vbs = 2;
312 if (params->dst_clear_color_as_input) {
313 blorp_fill_vertex_buffer_state(batch, vb, num_vbs++,
314 params->dst.clear_color_addr,
315 batch->blorp->isl_dev->ss.clear_value_size,
316 0);
317 }
318
319 const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
320 uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
321 if (!dw)
322 return;
323
324 for (unsigned i = 0; i < num_vbs; i++) {
325 GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
326 dw += GENX(VERTEX_BUFFER_STATE_length);
327 }
328 }
329
330 static void
331 blorp_emit_vertex_elements(struct blorp_batch *batch,
332 const struct blorp_params *params)
333 {
334 const unsigned num_varyings =
335 params->wm_prog_data ? params->wm_prog_data->num_varying_inputs : 0;
336 bool need_ndc = batch->blorp->compiler->devinfo->gen <= 5;
337 const unsigned num_elements = 2 + need_ndc + num_varyings;
338
339 struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
340 memset(ve, 0, num_elements * sizeof(*ve));
341
342 /* Setup VBO for the rectangle primitive..
343 *
344 * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
345 * vertices. The vertices reside in screen space with DirectX
346 * coordinates (that is, (0, 0) is the upper left corner).
347 *
348 * v2 ------ implied
349 * | |
350 * | |
351 * v1 ----- v0
352 *
353 * Since the VS is disabled, the clipper loads each VUE directly from
354 * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
355 * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as follows:
356 * dw0: Reserved, MBZ.
357 * dw1: Render Target Array Index. Below vertex fetcher gets programmed
358 * to assign this with primitive instance identifier which will be
359 * used for layered clears. All other renders have only one instance
360 * and therefore the value will be effectively zero.
361 * dw2: Viewport Index. The HiZ op disables viewport mapping and
362 * scissoring, so set the dword to 0.
363 * dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
364 * so set the dword to 0.
365 * dw4: Vertex Position X.
366 * dw5: Vertex Position Y.
367 * dw6: Vertex Position Z.
368 * dw7: Vertex Position W.
369 *
370 * dw8: Flat vertex input 0
371 * dw9: Flat vertex input 1
372 * ...
373 * dwn: Flat vertex input n - 8
374 *
375 * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
376 * "Vertex URB Entry (VUE) Formats".
377 *
378 * Only vertex position X and Y are going to be variable, Z is fixed to
379 * zero and W to one. Header words dw0,2,3 are zero. There is no need to
380 * include the fixed values in the vertex buffer. Vertex fetcher can be
381 * instructed to fill vertex elements with constant values of one and zero
382 * instead of reading them from the buffer.
383 * Flat inputs are program constants that are not interpolated. Moreover
384 * their values will be the same between vertices.
385 *
386 * See the vertex element setup below.
387 */
388 unsigned slot = 0;
389
390 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
391 .VertexBufferIndex = 1,
392 .Valid = true,
393 .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
394 .SourceElementOffset = 0,
395 .Component0Control = VFCOMP_STORE_SRC,
396
397 /* From Gen8 onwards hardware is no more instructed to overwrite
398 * components using an element specifier. Instead one has separate
399 * 3DSTATE_VF_SGVS (System Generated Value Setup) state packet for it.
400 */
401 #if GEN_GEN >= 8
402 .Component1Control = VFCOMP_STORE_0,
403 #elif GEN_GEN >= 5
404 .Component1Control = VFCOMP_STORE_IID,
405 #else
406 .Component1Control = VFCOMP_STORE_0,
407 #endif
408 .Component2Control = VFCOMP_STORE_0,
409 .Component3Control = VFCOMP_STORE_0,
410 #if GEN_GEN <= 5
411 .DestinationElementOffset = slot * 4,
412 #endif
413 };
414 slot++;
415
416 #if GEN_GEN <= 5
417 /* On Iron Lake and earlier, a native device coordinates version of the
418 * position goes right after the normal VUE header and before position.
419 * Since w == 1 for all of our coordinates, this is just a copy of the
420 * position.
421 */
422 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
423 .VertexBufferIndex = 0,
424 .Valid = true,
425 .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32_FLOAT,
426 .SourceElementOffset = 0,
427 .Component0Control = VFCOMP_STORE_SRC,
428 .Component1Control = VFCOMP_STORE_SRC,
429 .Component2Control = VFCOMP_STORE_SRC,
430 .Component3Control = VFCOMP_STORE_1_FP,
431 .DestinationElementOffset = slot * 4,
432 };
433 slot++;
434 #endif
435
436 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
437 .VertexBufferIndex = 0,
438 .Valid = true,
439 .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32_FLOAT,
440 .SourceElementOffset = 0,
441 .Component0Control = VFCOMP_STORE_SRC,
442 .Component1Control = VFCOMP_STORE_SRC,
443 .Component2Control = VFCOMP_STORE_SRC,
444 .Component3Control = VFCOMP_STORE_1_FP,
445 #if GEN_GEN <= 5
446 .DestinationElementOffset = slot * 4,
447 #endif
448 };
449 slot++;
450
451 if (params->dst_clear_color_as_input) {
452 /* If the caller wants the destination indirect clear color, redirect
453 * to vertex buffer 2 where we stored it earlier. The only users of
454 * an indirect clear color source have that as their only vertex
455 * attribute.
456 */
457 assert(num_varyings == 1);
458 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
459 .VertexBufferIndex = 2,
460 .Valid = true,
461 .SourceElementOffset = 0,
462 .Component0Control = VFCOMP_STORE_SRC,
463 #if GEN_GEN >= 9
464 .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
465 .Component1Control = VFCOMP_STORE_SRC,
466 .Component2Control = VFCOMP_STORE_SRC,
467 .Component3Control = VFCOMP_STORE_SRC,
468 #else
469 /* Clear colors on gen7-8 are for bits out of one dword */
470 .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32_FLOAT,
471 .Component1Control = VFCOMP_STORE_0,
472 .Component2Control = VFCOMP_STORE_0,
473 .Component3Control = VFCOMP_STORE_0,
474 #endif
475 };
476 slot++;
477 } else {
478 for (unsigned i = 0; i < num_varyings; ++i) {
479 ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
480 .VertexBufferIndex = 1,
481 .Valid = true,
482 .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
483 .SourceElementOffset = 16 + i * 4 * sizeof(float),
484 .Component0Control = VFCOMP_STORE_SRC,
485 .Component1Control = VFCOMP_STORE_SRC,
486 .Component2Control = VFCOMP_STORE_SRC,
487 .Component3Control = VFCOMP_STORE_SRC,
488 #if GEN_GEN <= 5
489 .DestinationElementOffset = slot * 4,
490 #endif
491 };
492 slot++;
493 }
494 }
495
496 const unsigned num_dwords =
497 1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
498 uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
499 if (!dw)
500 return;
501
502 for (unsigned i = 0; i < num_elements; i++) {
503 GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
504 dw += GENX(VERTEX_ELEMENT_STATE_length);
505 }
506
507 #if GEN_GEN >= 8
508 /* Overwrite Render Target Array Index (2nd dword) in the VUE header with
509 * primitive instance identifier. This is used for layered clears.
510 */
511 blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
512 sgvs.InstanceIDEnable = true;
513 sgvs.InstanceIDComponentNumber = COMP_1;
514 sgvs.InstanceIDElementOffset = 0;
515 }
516
517 for (unsigned i = 0; i < num_elements; i++) {
518 blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
519 vf.VertexElementIndex = i;
520 vf.InstancingEnable = false;
521 }
522 }
523
524 blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
525 topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
526 }
527 #endif
528 }
529
530 /* 3DSTATE_VIEWPORT_STATE_POINTERS */
531 static uint32_t
532 blorp_emit_cc_viewport(struct blorp_batch *batch,
533 const struct blorp_params *params)
534 {
535 uint32_t cc_vp_offset;
536 blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
537 vp.MinimumDepth = 0.0;
538 vp.MaximumDepth = 1.0;
539 }
540
541 #if GEN_GEN >= 7
542 blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
543 vsp.CCViewportPointer = cc_vp_offset;
544 }
545 #elif GEN_GEN == 6
546 blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vsp) {
547 vsp.CCViewportStateChange = true;
548 vsp.PointertoCC_VIEWPORT = cc_vp_offset;
549 }
550 #endif
551
552 return cc_vp_offset;
553 }
554
555 static uint32_t
556 blorp_emit_sampler_state(struct blorp_batch *batch,
557 const struct blorp_params *params)
558 {
559 uint32_t offset;
560 blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
561 sampler.MipModeFilter = MIPFILTER_NONE;
562 sampler.MagModeFilter = MAPFILTER_LINEAR;
563 sampler.MinModeFilter = MAPFILTER_LINEAR;
564 sampler.MinLOD = 0;
565 sampler.MaxLOD = 0;
566 sampler.TCXAddressControlMode = TCM_CLAMP;
567 sampler.TCYAddressControlMode = TCM_CLAMP;
568 sampler.TCZAddressControlMode = TCM_CLAMP;
569 sampler.MaximumAnisotropy = RATIO21;
570 sampler.RAddressMinFilterRoundingEnable = true;
571 sampler.RAddressMagFilterRoundingEnable = true;
572 sampler.VAddressMinFilterRoundingEnable = true;
573 sampler.VAddressMagFilterRoundingEnable = true;
574 sampler.UAddressMinFilterRoundingEnable = true;
575 sampler.UAddressMagFilterRoundingEnable = true;
576 #if GEN_GEN > 6
577 sampler.NonnormalizedCoordinateEnable = true;
578 #endif
579 }
580
581 #if GEN_GEN >= 7
582 blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
583 ssp.PointertoPSSamplerState = offset;
584 }
585 #elif GEN_GEN == 6
586 blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ssp) {
587 ssp.VSSamplerStateChange = true;
588 ssp.GSSamplerStateChange = true;
589 ssp.PSSamplerStateChange = true;
590 ssp.PointertoPSSamplerState = offset;
591 }
592 #endif
593
594 return offset;
595 }
596
597 /* What follows is the code for setting up a "pipeline" on Sandy Bridge and
598 * later hardware. This file will be included by i965 for gen4-5 as well, so
599 * this code is guarded by GEN_GEN >= 6.
600 */
601 #if GEN_GEN >= 6
602
603 static void
604 blorp_emit_vs_config(struct blorp_batch *batch,
605 const struct blorp_params *params)
606 {
607 struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
608 assert(!vs_prog_data || GEN_GEN < 11 ||
609 vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
610
611 blorp_emit(batch, GENX(3DSTATE_VS), vs) {
612 if (vs_prog_data) {
613 vs.Enable = true;
614
615 vs.KernelStartPointer = params->vs_prog_kernel;
616
617 vs.DispatchGRFStartRegisterForURBData =
618 vs_prog_data->base.base.dispatch_grf_start_reg;
619 vs.VertexURBEntryReadLength =
620 vs_prog_data->base.urb_read_length;
621 vs.VertexURBEntryReadOffset = 0;
622
623 vs.MaximumNumberofThreads =
624 batch->blorp->isl_dev->info->max_vs_threads - 1;
625
626 #if GEN_GEN >= 8
627 vs.SIMD8DispatchEnable =
628 vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
629 #endif
630 }
631 }
632 }
633
634 static void
635 blorp_emit_sf_config(struct blorp_batch *batch,
636 const struct blorp_params *params)
637 {
638 const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
639
640 /* 3DSTATE_SF
641 *
642 * Disable ViewportTransformEnable (dw2.1)
643 *
644 * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
645 * Primitives Overview":
646 * RECTLIST: Viewport Mapping must be DISABLED (as is typical with the
647 * use of screen- space coordinates).
648 *
649 * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
650 * and BackFaceFillMode (dw2.5:6) to SOLID(0).
651 *
652 * From the Sandy Bridge PRM, Volume 2, Part 1, Section
653 * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
654 * SOLID: Any triangle or rectangle object found to be front-facing
655 * is rendered as a solid object. This setting is required when
656 * (rendering rectangle (RECTLIST) objects.
657 */
658
659 #if GEN_GEN >= 8
660
661 blorp_emit(batch, GENX(3DSTATE_SF), sf);
662
663 blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
664 raster.CullMode = CULLMODE_NONE;
665 }
666
667 blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
668 sbe.VertexURBEntryReadOffset = 1;
669 if (prog_data) {
670 sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
671 sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
672 sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
673 } else {
674 sbe.NumberofSFOutputAttributes = 0;
675 sbe.VertexURBEntryReadLength = 1;
676 }
677 sbe.ForceVertexURBEntryReadLength = true;
678 sbe.ForceVertexURBEntryReadOffset = true;
679
680 #if GEN_GEN >= 9
681 for (unsigned i = 0; i < 32; i++)
682 sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
683 #endif
684 }
685
686 #elif GEN_GEN >= 7
687
688 blorp_emit(batch, GENX(3DSTATE_SF), sf) {
689 sf.FrontFaceFillMode = FILL_MODE_SOLID;
690 sf.BackFaceFillMode = FILL_MODE_SOLID;
691
692 sf.MultisampleRasterizationMode = params->num_samples > 1 ?
693 MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
694
695 #if GEN_GEN == 7
696 sf.DepthBufferSurfaceFormat = params->depth_format;
697 #endif
698 }
699
700 blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
701 sbe.VertexURBEntryReadOffset = 1;
702 if (prog_data) {
703 sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
704 sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
705 sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
706 } else {
707 sbe.NumberofSFOutputAttributes = 0;
708 sbe.VertexURBEntryReadLength = 1;
709 }
710 }
711
712 #else /* GEN_GEN <= 6 */
713
714 blorp_emit(batch, GENX(3DSTATE_SF), sf) {
715 sf.FrontFaceFillMode = FILL_MODE_SOLID;
716 sf.BackFaceFillMode = FILL_MODE_SOLID;
717
718 sf.MultisampleRasterizationMode = params->num_samples > 1 ?
719 MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
720
721 sf.VertexURBEntryReadOffset = 1;
722 if (prog_data) {
723 sf.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
724 sf.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
725 sf.ConstantInterpolationEnable = prog_data->flat_inputs;
726 } else {
727 sf.NumberofSFOutputAttributes = 0;
728 sf.VertexURBEntryReadLength = 1;
729 }
730 }
731
732 #endif /* GEN_GEN */
733 }
734
735 static void
736 blorp_emit_ps_config(struct blorp_batch *batch,
737 const struct blorp_params *params)
738 {
739 const struct brw_wm_prog_data *prog_data = params->wm_prog_data;
740
741 /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
742 * nonzero to prevent the GPU from hanging. While the documentation doesn't
743 * mention this explicitly, it notes that the valid range for the field is
744 * [1,39] = [2,40] threads, which excludes zero.
745 *
746 * To be safe (and to minimize extraneous code) we go ahead and fully
747 * configure the WM state whether or not there is a WM program.
748 */
749
750 #if GEN_GEN >= 8
751
752 blorp_emit(batch, GENX(3DSTATE_WM), wm);
753
754 blorp_emit(batch, GENX(3DSTATE_PS), ps) {
755 if (params->src.enabled) {
756 ps.SamplerCount = 1; /* Up to 4 samplers */
757 ps.BindingTableEntryCount = 2;
758 } else {
759 ps.BindingTableEntryCount = 1;
760 }
761
762 if (prog_data) {
763 ps.DispatchGRFStartRegisterForConstantSetupData0 =
764 prog_data->base.dispatch_grf_start_reg;
765 ps.DispatchGRFStartRegisterForConstantSetupData2 =
766 prog_data->dispatch_grf_start_reg_2;
767
768 ps._8PixelDispatchEnable = prog_data->dispatch_8;
769 ps._16PixelDispatchEnable = prog_data->dispatch_16;
770
771 ps.KernelStartPointer0 = params->wm_prog_kernel;
772 ps.KernelStartPointer2 =
773 params->wm_prog_kernel + prog_data->prog_offset_2;
774 }
775
776 /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
777 * for pre Gen11 and 128 for gen11+; On gen11+ If a programmed value is
778 * k, it implies 2(k+1) threads. It implicitly scales for different GT
779 * levels (which have some # of PSDs).
780 *
781 * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
782 */
783 if (GEN_GEN >= 9)
784 ps.MaximumNumberofThreadsPerPSD = 64 - 1;
785 else
786 ps.MaximumNumberofThreadsPerPSD = 64 - 2;
787
788 switch (params->fast_clear_op) {
789 case ISL_AUX_OP_NONE:
790 break;
791 #if GEN_GEN >= 9
792 case ISL_AUX_OP_PARTIAL_RESOLVE:
793 ps.RenderTargetResolveType = RESOLVE_PARTIAL;
794 break;
795 case ISL_AUX_OP_FULL_RESOLVE:
796 ps.RenderTargetResolveType = RESOLVE_FULL;
797 break;
798 #else
799 case ISL_AUX_OP_FULL_RESOLVE:
800 ps.RenderTargetResolveEnable = true;
801 break;
802 #endif
803 case ISL_AUX_OP_FAST_CLEAR:
804 ps.RenderTargetFastClearEnable = true;
805 break;
806 default:
807 unreachable("Invalid fast clear op");
808 }
809 }
810
811 blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
812 if (prog_data) {
813 psx.PixelShaderValid = true;
814 psx.AttributeEnable = prog_data->num_varying_inputs > 0;
815 psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
816 }
817
818 if (params->src.enabled)
819 psx.PixelShaderKillsPixel = true;
820 }
821
822 #elif GEN_GEN >= 7
823
824 blorp_emit(batch, GENX(3DSTATE_WM), wm) {
825 switch (params->hiz_op) {
826 case ISL_AUX_OP_FAST_CLEAR:
827 wm.DepthBufferClear = true;
828 break;
829 case ISL_AUX_OP_FULL_RESOLVE:
830 wm.DepthBufferResolveEnable = true;
831 break;
832 case ISL_AUX_OP_AMBIGUATE:
833 wm.HierarchicalDepthBufferResolveEnable = true;
834 break;
835 case ISL_AUX_OP_NONE:
836 break;
837 default:
838 unreachable("not reached");
839 }
840
841 if (prog_data)
842 wm.ThreadDispatchEnable = true;
843
844 if (params->src.enabled)
845 wm.PixelShaderKillsPixel = true;
846
847 if (params->num_samples > 1) {
848 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
849 wm.MultisampleDispatchMode =
850 (prog_data && prog_data->persample_dispatch) ?
851 MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
852 } else {
853 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
854 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
855 }
856 }
857
858 blorp_emit(batch, GENX(3DSTATE_PS), ps) {
859 ps.MaximumNumberofThreads =
860 batch->blorp->isl_dev->info->max_wm_threads - 1;
861
862 #if GEN_IS_HASWELL
863 ps.SampleMask = 1;
864 #endif
865
866 if (prog_data) {
867 ps.DispatchGRFStartRegisterForConstantSetupData0 =
868 prog_data->base.dispatch_grf_start_reg;
869 ps.DispatchGRFStartRegisterForConstantSetupData2 =
870 prog_data->dispatch_grf_start_reg_2;
871
872 ps.KernelStartPointer0 = params->wm_prog_kernel;
873 ps.KernelStartPointer2 =
874 params->wm_prog_kernel + prog_data->prog_offset_2;
875
876 ps._8PixelDispatchEnable = prog_data->dispatch_8;
877 ps._16PixelDispatchEnable = prog_data->dispatch_16;
878
879 ps.AttributeEnable = prog_data->num_varying_inputs > 0;
880 } else {
881 /* Gen7 hardware gets angry if we don't enable at least one dispatch
882 * mode, so just enable 16-pixel dispatch if we don't have a program.
883 */
884 ps._16PixelDispatchEnable = true;
885 }
886
887 if (params->src.enabled)
888 ps.SamplerCount = 1; /* Up to 4 samplers */
889
890 switch (params->fast_clear_op) {
891 case ISL_AUX_OP_NONE:
892 break;
893 case ISL_AUX_OP_FULL_RESOLVE:
894 ps.RenderTargetResolveEnable = true;
895 break;
896 case ISL_AUX_OP_FAST_CLEAR:
897 ps.RenderTargetFastClearEnable = true;
898 break;
899 default:
900 unreachable("Invalid fast clear op");
901 }
902 }
903
904 #else /* GEN_GEN <= 6 */
905
906 blorp_emit(batch, GENX(3DSTATE_WM), wm) {
907 wm.MaximumNumberofThreads =
908 batch->blorp->isl_dev->info->max_wm_threads - 1;
909
910 switch (params->hiz_op) {
911 case ISL_AUX_OP_FAST_CLEAR:
912 wm.DepthBufferClear = true;
913 break;
914 case ISL_AUX_OP_FULL_RESOLVE:
915 wm.DepthBufferResolveEnable = true;
916 break;
917 case ISL_AUX_OP_AMBIGUATE:
918 wm.HierarchicalDepthBufferResolveEnable = true;
919 break;
920 case ISL_AUX_OP_NONE:
921 break;
922 default:
923 unreachable("not reached");
924 }
925
926 if (prog_data) {
927 wm.ThreadDispatchEnable = true;
928
929 wm.DispatchGRFStartRegisterForConstantSetupData0 =
930 prog_data->base.dispatch_grf_start_reg;
931 wm.DispatchGRFStartRegisterForConstantSetupData2 =
932 prog_data->dispatch_grf_start_reg_2;
933
934 wm.KernelStartPointer0 = params->wm_prog_kernel;
935 wm.KernelStartPointer2 =
936 params->wm_prog_kernel + prog_data->prog_offset_2;
937
938 wm._8PixelDispatchEnable = prog_data->dispatch_8;
939 wm._16PixelDispatchEnable = prog_data->dispatch_16;
940
941 wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
942 }
943
944 if (params->src.enabled) {
945 wm.SamplerCount = 1; /* Up to 4 samplers */
946 wm.PixelShaderKillsPixel = true; /* TODO: temporarily smash on */
947 }
948
949 if (params->num_samples > 1) {
950 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
951 wm.MultisampleDispatchMode =
952 (prog_data && prog_data->persample_dispatch) ?
953 MSDISPMODE_PERSAMPLE : MSDISPMODE_PERPIXEL;
954 } else {
955 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
956 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
957 }
958 }
959
960 #endif /* GEN_GEN */
961 }
962
963 static uint32_t
964 blorp_emit_blend_state(struct blorp_batch *batch,
965 const struct blorp_params *params)
966 {
967 struct GENX(BLEND_STATE) blend;
968 memset(&blend, 0, sizeof(blend));
969
970 uint32_t offset;
971 int size = GENX(BLEND_STATE_length) * 4;
972 size += GENX(BLEND_STATE_ENTRY_length) * 4 * params->num_draw_buffers;
973 uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
974 uint32_t *pos = state;
975
976 GENX(BLEND_STATE_pack)(NULL, pos, &blend);
977 pos += GENX(BLEND_STATE_length);
978
979 for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
980 struct GENX(BLEND_STATE_ENTRY) entry = {
981 .PreBlendColorClampEnable = true,
982 .PostBlendColorClampEnable = true,
983 .ColorClampRange = COLORCLAMP_RTFORMAT,
984
985 .WriteDisableRed = params->color_write_disable[0],
986 .WriteDisableGreen = params->color_write_disable[1],
987 .WriteDisableBlue = params->color_write_disable[2],
988 .WriteDisableAlpha = params->color_write_disable[3],
989 };
990 GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
991 pos += GENX(BLEND_STATE_ENTRY_length);
992 }
993
994 blorp_flush_range(batch, state, size);
995
996 #if GEN_GEN >= 7
997 blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
998 sp.BlendStatePointer = offset;
999 #if GEN_GEN >= 8
1000 sp.BlendStatePointerValid = true;
1001 #endif
1002 }
1003 #endif
1004
1005 #if GEN_GEN >= 8
1006 blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
1007 ps_blend.HasWriteableRT = true;
1008 }
1009 #endif
1010
1011 return offset;
1012 }
1013
1014 static uint32_t
1015 blorp_emit_color_calc_state(struct blorp_batch *batch,
1016 const struct blorp_params *params)
1017 {
1018 uint32_t offset;
1019 blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {
1020 #if GEN_GEN <= 8
1021 cc.StencilReferenceValue = params->stencil_ref;
1022 #endif
1023 }
1024
1025 #if GEN_GEN >= 7
1026 blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
1027 sp.ColorCalcStatePointer = offset;
1028 #if GEN_GEN >= 8
1029 sp.ColorCalcStatePointerValid = true;
1030 #endif
1031 }
1032 #endif
1033
1034 return offset;
1035 }
1036
1037 static uint32_t
1038 blorp_emit_depth_stencil_state(struct blorp_batch *batch,
1039 const struct blorp_params *params)
1040 {
1041 #if GEN_GEN >= 8
1042 struct GENX(3DSTATE_WM_DEPTH_STENCIL) ds = {
1043 GENX(3DSTATE_WM_DEPTH_STENCIL_header),
1044 };
1045 #else
1046 struct GENX(DEPTH_STENCIL_STATE) ds = { 0 };
1047 #endif
1048
1049 if (params->depth.enabled) {
1050 ds.DepthBufferWriteEnable = true;
1051
1052 switch (params->hiz_op) {
1053 case ISL_AUX_OP_NONE:
1054 ds.DepthTestEnable = true;
1055 ds.DepthTestFunction = COMPAREFUNCTION_ALWAYS;
1056 break;
1057
1058 /* See the following sections of the Sandy Bridge PRM, Volume 2, Part1:
1059 * - 7.5.3.1 Depth Buffer Clear
1060 * - 7.5.3.2 Depth Buffer Resolve
1061 * - 7.5.3.3 Hierarchical Depth Buffer Resolve
1062 */
1063 case ISL_AUX_OP_FULL_RESOLVE:
1064 ds.DepthTestEnable = true;
1065 ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
1066 break;
1067
1068 case ISL_AUX_OP_FAST_CLEAR:
1069 case ISL_AUX_OP_AMBIGUATE:
1070 ds.DepthTestEnable = false;
1071 break;
1072 case ISL_AUX_OP_PARTIAL_RESOLVE:
1073 unreachable("Invalid HIZ op");
1074 }
1075 }
1076
1077 if (params->stencil.enabled) {
1078 ds.StencilBufferWriteEnable = true;
1079 ds.StencilTestEnable = true;
1080 ds.DoubleSidedStencilEnable = false;
1081
1082 ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
1083 ds.StencilPassDepthPassOp = STENCILOP_REPLACE;
1084
1085 ds.StencilWriteMask = params->stencil_mask;
1086 #if GEN_GEN >= 9
1087 ds.StencilReferenceValue = params->stencil_ref;
1088 #endif
1089 }
1090
1091 #if GEN_GEN >= 8
1092 uint32_t offset = 0;
1093 uint32_t *dw = blorp_emit_dwords(batch,
1094 GENX(3DSTATE_WM_DEPTH_STENCIL_length));
1095 if (!dw)
1096 return 0;
1097
1098 GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, dw, &ds);
1099 #else
1100 uint32_t offset;
1101 void *state = blorp_alloc_dynamic_state(batch,
1102 GENX(DEPTH_STENCIL_STATE_length) * 4,
1103 64, &offset);
1104 GENX(DEPTH_STENCIL_STATE_pack)(NULL, state, &ds);
1105 blorp_flush_range(batch, state, GENX(DEPTH_STENCIL_STATE_length) * 4);
1106 #endif
1107
1108 #if GEN_GEN == 7
1109 blorp_emit(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), sp) {
1110 sp.PointertoDEPTH_STENCIL_STATE = offset;
1111 }
1112 #endif
1113
1114 return offset;
1115 }
1116
1117 static void
1118 blorp_emit_3dstate_multisample(struct blorp_batch *batch,
1119 const struct blorp_params *params)
1120 {
1121 blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
1122 ms.NumberofMultisamples = __builtin_ffs(params->num_samples) - 1;
1123
1124 #if GEN_GEN >= 8
1125 /* The PRM says that this bit is valid only for DX9:
1126 *
1127 * SW can choose to set this bit only for DX9 API. DX10/OGL API's
1128 * should not have any effect by setting or not setting this bit.
1129 */
1130 ms.PixelPositionOffsetEnable = false;
1131 #elif GEN_GEN >= 7
1132
1133 switch (params->num_samples) {
1134 case 1:
1135 GEN_SAMPLE_POS_1X(ms.Sample);
1136 break;
1137 case 2:
1138 GEN_SAMPLE_POS_2X(ms.Sample);
1139 break;
1140 case 4:
1141 GEN_SAMPLE_POS_4X(ms.Sample);
1142 break;
1143 case 8:
1144 GEN_SAMPLE_POS_8X(ms.Sample);
1145 break;
1146 default:
1147 break;
1148 }
1149 #else
1150 GEN_SAMPLE_POS_4X(ms.Sample);
1151 #endif
1152 ms.PixelLocation = CENTER;
1153 }
1154 }
1155
1156 static void
1157 blorp_emit_pipeline(struct blorp_batch *batch,
1158 const struct blorp_params *params)
1159 {
1160 uint32_t blend_state_offset = 0;
1161 uint32_t color_calc_state_offset;
1162 uint32_t depth_stencil_state_offset;
1163
1164 emit_urb_config(batch, params);
1165
1166 if (params->wm_prog_data) {
1167 blend_state_offset = blorp_emit_blend_state(batch, params);
1168 }
1169 color_calc_state_offset = blorp_emit_color_calc_state(batch, params);
1170 depth_stencil_state_offset = blorp_emit_depth_stencil_state(batch, params);
1171
1172 #if GEN_GEN == 6
1173 /* 3DSTATE_CC_STATE_POINTERS
1174 *
1175 * The pointer offsets are relative to
1176 * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
1177 *
1178 * The HiZ op doesn't use BLEND_STATE or COLOR_CALC_STATE.
1179 *
1180 * The dynamic state emit helpers emit their own STATE_POINTERS packets on
1181 * gen7+. However, on gen6 and earlier, they're all lumpped together in
1182 * one CC_STATE_POINTERS packet so we have to emit that here.
1183 */
1184 blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc) {
1185 cc.BLEND_STATEChange = true;
1186 cc.ColorCalcStatePointerValid = true;
1187 cc.DEPTH_STENCIL_STATEChange = true;
1188 cc.PointertoBLEND_STATE = blend_state_offset;
1189 cc.ColorCalcStatePointer = color_calc_state_offset;
1190 cc.PointertoDEPTH_STENCIL_STATE = depth_stencil_state_offset;
1191 }
1192 #else
1193 (void)blend_state_offset;
1194 (void)color_calc_state_offset;
1195 (void)depth_stencil_state_offset;
1196 #endif
1197
1198 blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), vs);
1199 #if GEN_GEN >= 7
1200 blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), hs);
1201 blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), DS);
1202 #endif
1203 blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), gs);
1204 blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), ps);
1205
1206 if (params->src.enabled)
1207 blorp_emit_sampler_state(batch, params);
1208
1209 blorp_emit_3dstate_multisample(batch, params);
1210
1211 blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
1212 mask.SampleMask = (1 << params->num_samples) - 1;
1213 }
1214
1215 /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
1216 * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
1217 *
1218 * [DevSNB] A pipeline flush must be programmed prior to a
1219 * 3DSTATE_VS command that causes the VS Function Enable to
1220 * toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
1221 * command with CS stall bit set and a post sync operation.
1222 *
1223 * We've already done one at the start of the BLORP operation.
1224 */
1225 blorp_emit_vs_config(batch, params);
1226 #if GEN_GEN >= 7
1227 blorp_emit(batch, GENX(3DSTATE_HS), hs);
1228 blorp_emit(batch, GENX(3DSTATE_TE), te);
1229 blorp_emit(batch, GENX(3DSTATE_DS), DS);
1230 blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
1231 #endif
1232 blorp_emit(batch, GENX(3DSTATE_GS), gs);
1233
1234 blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
1235 clip.PerspectiveDivideDisable = true;
1236 }
1237
1238 blorp_emit_sf_config(batch, params);
1239 blorp_emit_ps_config(batch, params);
1240
1241 blorp_emit_cc_viewport(batch, params);
1242 }
1243
1244 /******** This is the end of the pipeline setup code ********/
1245
1246 #endif /* GEN_GEN >= 6 */
1247
#if GEN_GEN >= 7 && GEN_GEN <= 10
/**
 * Emit command streamer commands that copy \p size bytes from \p src to
 * \p dst, four bytes per step.
 *
 * \p size must be a multiple of 4.  On gen8+ every step is one
 * MI_COPY_MEM_MEM; on gen7 every step is an MI_LOAD_REGISTER_MEM /
 * MI_STORE_REGISTER_MEM pair going through a temporary register.
 */
static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size)
{
#if GEN_GEN < 8
   /* IVB does not have a general purpose register for command streamer
    * commands. Therefore, we use an alternate temporary register.
    */
#define BLORP_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
#endif

   assert(size % 4 == 0);

   for (unsigned copied = 0; copied < size; copied += 4) {
#if GEN_GEN >= 8
      blorp_emit(batch, GENX(MI_COPY_MEM_MEM), copy) {
         copy.DestinationMemoryAddress = dst;
         copy.SourceMemoryAddress = src;
      }
#else
      blorp_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = BLORP_TEMP_REG;
         lrm.MemoryAddress = src;
      }
      blorp_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = BLORP_TEMP_REG;
         srm.MemoryAddress = dst;
      }
#endif
      /* dst and src are by-value copies, so advancing them is local. */
      dst.offset += 4;
      src.offset += 4;
   }

#if GEN_GEN < 8
#undef BLORP_TEMP_REG
#endif
}
#endif
1283
1284 static void
1285 blorp_emit_surface_state(struct blorp_batch *batch,
1286 const struct brw_blorp_surface_info *surface,
1287 void *state, uint32_t state_offset,
1288 const bool color_write_disables[4],
1289 bool is_render_target)
1290 {
1291 const struct isl_device *isl_dev = batch->blorp->isl_dev;
1292 struct isl_surf surf = surface->surf;
1293
1294 if (surf.dim == ISL_SURF_DIM_1D &&
1295 surf.dim_layout == ISL_DIM_LAYOUT_GEN4_2D) {
1296 assert(surf.logical_level0_px.height == 1);
1297 surf.dim = ISL_SURF_DIM_2D;
1298 }
1299
1300 /* Blorp doesn't support HiZ in any of the blit or slow-clear paths */
1301 enum isl_aux_usage aux_usage = surface->aux_usage;
1302 if (aux_usage == ISL_AUX_USAGE_HIZ)
1303 aux_usage = ISL_AUX_USAGE_NONE;
1304
1305 isl_channel_mask_t write_disable_mask = 0;
1306 if (is_render_target && GEN_GEN <= 5) {
1307 if (color_write_disables[0])
1308 write_disable_mask |= ISL_CHANNEL_RED_BIT;
1309 if (color_write_disables[1])
1310 write_disable_mask |= ISL_CHANNEL_GREEN_BIT;
1311 if (color_write_disables[2])
1312 write_disable_mask |= ISL_CHANNEL_BLUE_BIT;
1313 if (color_write_disables[3])
1314 write_disable_mask |= ISL_CHANNEL_ALPHA_BIT;
1315 }
1316
1317 isl_surf_fill_state(batch->blorp->isl_dev, state,
1318 .surf = &surf, .view = &surface->view,
1319 .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
1320 .mocs = surface->addr.mocs,
1321 .clear_color = surface->clear_color,
1322 .write_disables = write_disable_mask);
1323
1324 blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
1325 surface->addr, 0);
1326
1327 if (aux_usage != ISL_AUX_USAGE_NONE) {
1328 /* On gen7 and prior, the bottom 12 bits of the MCS base address are
1329 * used to store other information. This should be ok, however, because
1330 * surface buffer addresses are always 4K page alinged.
1331 */
1332 assert((surface->aux_addr.offset & 0xfff) == 0);
1333 uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
1334 blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
1335 surface->aux_addr, *aux_addr);
1336 }
1337
1338 blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
1339
1340 if (surface->clear_color_addr.buffer) {
1341 #if GEN_GEN > 10
1342 unreachable("Implement indirect clear support on gen11+");
1343 #elif GEN_GEN >= 7 && GEN_GEN <= 10
1344 struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
1345 dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
1346 blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
1347 isl_dev->ss.clear_value_size);
1348 #else
1349 unreachable("Fast clears are only supported on gen7+");
1350 #endif
1351 }
1352 }
1353
1354 static void
1355 blorp_emit_null_surface_state(struct blorp_batch *batch,
1356 const struct brw_blorp_surface_info *surface,
1357 uint32_t *state)
1358 {
1359 struct GENX(RENDER_SURFACE_STATE) ss = {
1360 .SurfaceType = SURFTYPE_NULL,
1361 .SurfaceFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R8G8B8A8_UNORM,
1362 .Width = surface->surf.logical_level0_px.width - 1,
1363 .Height = surface->surf.logical_level0_px.height - 1,
1364 .MIPCountLOD = surface->view.base_level,
1365 .MinimumArrayElement = surface->view.base_array_layer,
1366 .Depth = surface->view.array_len - 1,
1367 .RenderTargetViewExtent = surface->view.array_len - 1,
1368 #if GEN_GEN >= 6
1369 .NumberofMultisamples = ffs(surface->surf.samples) - 1,
1370 #endif
1371
1372 #if GEN_GEN >= 7
1373 .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,
1374 #endif
1375
1376 #if GEN_GEN >= 8
1377 .TileMode = YMAJOR,
1378 #else
1379 .TiledSurface = true,
1380 #endif
1381 };
1382
1383 GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);
1384
1385 blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
1386 }
1387
1388 static void
1389 blorp_emit_surface_states(struct blorp_batch *batch,
1390 const struct blorp_params *params)
1391 {
1392 const struct isl_device *isl_dev = batch->blorp->isl_dev;
1393 uint32_t bind_offset, surface_offsets[2];
1394 void *surface_maps[2];
1395
1396 MAYBE_UNUSED bool has_indirect_clear_color = false;
1397 if (params->use_pre_baked_binding_table) {
1398 bind_offset = params->pre_baked_binding_table_offset;
1399 } else {
1400 unsigned num_surfaces = 1 + params->src.enabled;
1401 blorp_alloc_binding_table(batch, num_surfaces,
1402 isl_dev->ss.size, isl_dev->ss.align,
1403 &bind_offset, surface_offsets, surface_maps);
1404
1405 if (params->dst.enabled) {
1406 blorp_emit_surface_state(batch, &params->dst,
1407 surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
1408 surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
1409 params->color_write_disable, true);
1410 if (params->dst.clear_color_addr.buffer != NULL)
1411 has_indirect_clear_color = true;
1412 } else {
1413 assert(params->depth.enabled || params->stencil.enabled);
1414 const struct brw_blorp_surface_info *surface =
1415 params->depth.enabled ? &params->depth : &params->stencil;
1416 blorp_emit_null_surface_state(batch, surface,
1417 surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
1418 }
1419
1420 if (params->src.enabled) {
1421 blorp_emit_surface_state(batch, &params->src,
1422 surface_maps[BLORP_TEXTURE_BT_INDEX],
1423 surface_offsets[BLORP_TEXTURE_BT_INDEX],
1424 NULL, false);
1425 if (params->src.clear_color_addr.buffer != NULL)
1426 has_indirect_clear_color = true;
1427 }
1428 }
1429
1430 #if GEN_GEN >= 7
1431 if (has_indirect_clear_color) {
1432 /* Updating a surface state object may require that the state cache be
1433 * invalidated. From the SKL PRM, Shared Functions -> State -> State
1434 * Caching:
1435 *
1436 * Whenever the RENDER_SURFACE_STATE object in memory pointed to by
1437 * the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
1438 * modified [...], the L1 state cache must be invalidated to ensure
1439 * the new surface or sampler state is fetched from system memory.
1440 */
1441 blorp_emit(batch, GENX(PIPE_CONTROL), pipe) {
1442 pipe.StateCacheInvalidationEnable = true;
1443 }
1444 }
1445 #endif
1446
1447 #if GEN_GEN >= 7
1448 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
1449 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
1450 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
1451 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);
1452
1453 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
1454 bt.PointertoPSBindingTable = bind_offset;
1455 }
1456 #elif GEN_GEN >= 6
1457 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
1458 bt.PSBindingTableChange = true;
1459 bt.PointertoPSBindingTable = bind_offset;
1460 }
1461 #else
1462 blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), bt) {
1463 bt.PointertoPSBindingTable = bind_offset;
1464 }
1465 #endif
1466 }
1467
1468 static void
1469 blorp_emit_depth_stencil_config(struct blorp_batch *batch,
1470 const struct blorp_params *params)
1471 {
1472 const struct isl_device *isl_dev = batch->blorp->isl_dev;
1473
1474 uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
1475 if (dw == NULL)
1476 return;
1477
1478 struct isl_depth_stencil_hiz_emit_info info = { };
1479
1480 if (params->depth.enabled) {
1481 info.view = &params->depth.view;
1482 info.mocs = params->depth.addr.mocs;
1483 } else if (params->stencil.enabled) {
1484 info.view = &params->stencil.view;
1485 info.mocs = params->stencil.addr.mocs;
1486 }
1487
1488 if (params->depth.enabled) {
1489 info.depth_surf = &params->depth.surf;
1490
1491 info.depth_address =
1492 blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
1493 params->depth.addr, 0);
1494
1495 info.hiz_usage = params->depth.aux_usage;
1496 if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
1497 info.hiz_surf = &params->depth.aux_surf;
1498
1499 struct blorp_address hiz_address = params->depth.aux_addr;
1500 #if GEN_GEN == 6
1501 /* Sandy bridge hardware does not technically support mipmapped HiZ.
1502 * However, we have a special layout that allows us to make it work
1503 * anyway by manually offsetting to the specified miplevel.
1504 */
1505 assert(info.hiz_surf->dim_layout == ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ);
1506 uint32_t offset_B;
1507 isl_surf_get_image_offset_B_tile_sa(info.hiz_surf,
1508 info.view->base_level, 0, 0,
1509 &offset_B, NULL, NULL);
1510 hiz_address.offset += offset_B;
1511 #endif
1512
1513 info.hiz_address =
1514 blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
1515 hiz_address, 0);
1516
1517 info.depth_clear_value = params->depth.clear_color.f32[0];
1518 }
1519 }
1520
1521 if (params->stencil.enabled) {
1522 info.stencil_surf = &params->stencil.surf;
1523
1524 struct blorp_address stencil_address = params->stencil.addr;
1525 #if GEN_GEN == 6
1526 /* Sandy bridge hardware does not technically support mipmapped stencil.
1527 * However, we have a special layout that allows us to make it work
1528 * anyway by manually offsetting to the specified miplevel.
1529 */
1530 assert(info.stencil_surf->dim_layout == ISL_DIM_LAYOUT_GEN6_STENCIL_HIZ);
1531 uint32_t offset_B;
1532 isl_surf_get_image_offset_B_tile_sa(info.stencil_surf,
1533 info.view->base_level, 0, 0,
1534 &offset_B, NULL, NULL);
1535 stencil_address.offset += offset_B;
1536 #endif
1537
1538 info.stencil_address =
1539 blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
1540 stencil_address, 0);
1541 }
1542
1543 isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);
1544 }
1545
#if GEN_GEN >= 8
/* Emits the Optimized HiZ sequence specified in the BDW+ PRMs. The
 * depth/stencil buffer extents are ignored to handle APIs which perform
 * clearing operations without such information.
 *
 * The sequence is: 3DSTATE_MULTISAMPLE, the depth/stencil buffer packets
 * (unless the batch forbids emitting them), 3DSTATE_WM_HZ_OP describing the
 * operation, a PIPE_CONTROL with a "Write Immediate Data" post-sync
 * operation, and a final default 3DSTATE_WM_HZ_OP.
 */
static void
blorp_emit_gen8_hiz_op(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   /* We should be performing an operation on a depth or stencil buffer. */
   assert(params->depth.enabled || params->stencil.enabled);

   /* The stencil buffer should only be enabled if a fast clear operation is
    * requested.
    */
   assert(!params->stencil.enabled ||
          params->hiz_op == ISL_AUX_OP_FAST_CLEAR);

   /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
    *
    * 3DSTATE_MULTISAMPLE packet must be used prior to this packet to change
    * the Number of Multisamples. This packet must not be used to change
    * Number of Multisamples in a rendering sequence.
    *
    * Since HIZ may be the first thing in a batch buffer, play safe and always
    * emit 3DSTATE_MULTISAMPLE.
    */
   blorp_emit_3dstate_multisample(batch, params);

   /* If we can't alter the depth stencil config and multiple layers are
    * involved, the HiZ op will fail. This is because the op requires that a
    * new config is emitted for each additional layer.
    */
   const bool may_emit_depth_stencil =
      !(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL);
   if (may_emit_depth_stencil) {
      blorp_emit_depth_stencil_config(batch, params);
   } else {
      assert(params->num_layers <= 1);
   }

   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hz_op) {
      switch (params->hiz_op) {
      case ISL_AUX_OP_FAST_CLEAR:
         hz_op.StencilBufferClearEnable = params->stencil.enabled;
         hz_op.DepthBufferClearEnable = params->depth.enabled;
         hz_op.StencilClearValue = params->stencil_ref;
         hz_op.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         assert(params->full_surface_hiz_op);
         hz_op.DepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_AMBIGUATE:
         assert(params->full_surface_hiz_op);
         hz_op.HierarchicalDepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_PARTIAL_RESOLVE:
      case ISL_AUX_OP_NONE:
         unreachable("Invalid HIZ op");
      }

      hz_op.NumberofMultisamples = ffs(params->num_samples) - 1;
      hz_op.SampleMask = 0xFFFF;

      /* Due to a hardware issue, this bit MBZ */
      assert(hz_op.ScissorRectangleEnable == false);

      /* Contrary to the HW docs both fields are inclusive */
      hz_op.ClearRectangleXMin = params->x0;
      hz_op.ClearRectangleYMin = params->y0;

      /* Contrary to the HW docs both fields are exclusive */
      hz_op.ClearRectangleXMax = params->x1;
      hz_op.ClearRectangleYMax = params->y1;
   }

   /* PIPE_CONTROL w/ all bits clear except for “Post-Sync Operation” must set
    * to “Write Immediate Data” enabled.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), post_sync) {
      post_sync.PostSyncOperation = WriteImmediateData;
      post_sync.Address = blorp_get_workaround_page(batch);
   }

   /* A second, default-initialized 3DSTATE_WM_HZ_OP ends the sequence. */
   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hz_done);
}
#endif
1633
1634 /**
1635 * \brief Execute a blit or render pass operation.
1636 *
1637 * To execute the operation, this function manually constructs and emits a
1638 * batch to draw a rectangle primitive. The batchbuffer is flushed before
1639 * constructing and after emitting the batch.
1640 *
1641 * This function alters no GL state.
1642 */
1643 static void
1644 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
1645 {
1646 #if GEN_GEN >= 8
1647 if (params->hiz_op != ISL_AUX_OP_NONE) {
1648 blorp_emit_gen8_hiz_op(batch, params);
1649 return;
1650 }
1651 #endif
1652
1653 blorp_emit_vertex_buffers(batch, params);
1654 blorp_emit_vertex_elements(batch, params);
1655
1656 blorp_emit_pipeline(batch, params);
1657
1658 blorp_emit_surface_states(batch, params);
1659
1660 if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
1661 blorp_emit_depth_stencil_config(batch, params);
1662
1663 blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
1664 prim.VertexAccessType = SEQUENTIAL;
1665 prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
1666 #if GEN_GEN >= 7
1667 prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
1668 #endif
1669 prim.VertexCountPerInstance = 3;
1670 prim.InstanceCount = params->num_layers;
1671 }
1672 }
1673
1674 #endif /* BLORP_GENX_EXEC_H */