src/gallium/drivers/ilo/core/ilo_state_compute.c
/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 2012-2015 LunarG, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chia-I Wu <olv@lunarg.com>
 */

#include "ilo_debug.h"
#include "ilo_state_compute.h"

struct compute_urb_configuration {
   int idrt_entry_count;
   int curbe_entry_count;

   int urb_entry_count;
   /* in 256-bit register increments */
   int urb_entry_size;
};

static int
get_gen6_rob_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
    *
    *     "ROB has 64KB of storage; 2048 entries."
    *
    * From the valid ranges of "CURBE Allocation Size", we can also conclude
    * that interface entries and CURBE data must be in ROB. And that ROB
    * should be 16KB, or 512 entries, on Gen7 GT1.
    */
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      return 2048;
   else if (ilo_dev_gen(dev) >= ILO_GEN(7))
      return (dev->gt == 2) ? 2048 : 512;
   else
      return (dev->gt == 2) ? 2048 : 1024;
}

static int
get_gen6_idrt_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "The first 32 URB entries are reserved for the interface
    *      descriptor..."
    *
    * From the Haswell PRM, volume 7, page 836:
    *
    *     "The first 64 URB entries are reserved for the interface
    *      description..."
    */
   return (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 64 : 32;
}

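/*
 * Convert the CURBE allocation size from bytes to 256-bit (32-byte) URB
 * entries, rounding up.  Worked example (value chosen for illustration): a
 * curbe_size of 224 bytes gives (224 + 31) / 32 = 7 entries.
 */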
static int
get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size)
{
   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "(CURBE Allocation Size) Specifies the total length allocated for
    *      CURBE, in 256-bit register increments."
    */
   const int entry_count = (curbe_size + 31) / 32;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(get_gen6_idrt_entry_count(dev) + entry_count <=
          get_gen6_rob_entry_count(dev));

   return entry_count;
}

static bool
compute_get_gen6_urb_configuration(const struct ilo_dev *dev,
                                   const struct ilo_state_compute_info *info,
                                   struct compute_urb_configuration *urb)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   urb->idrt_entry_count = get_gen6_idrt_entry_count(dev);
   urb->curbe_entry_count =
      get_gen6_curbe_entry_count(dev, info->curbe_alloc_size);

   /*
    * From the Broadwell PRM, volume 2b, page 451:
    *
    *     "Please note that 0 is not allowed for this field (Number of URB
    *      Entries)."
    */
   urb->urb_entry_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 1 : 0;

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 52:
    *
    *     "(URB Entry Allocation Size) Specifies the length of each URB entry
    *      used by the unit, in 256-bit register increments - 1."
    */
   urb->urb_entry_size = 1;

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 22:
    *
    *     "MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle
    *      size and the number of URB handles. The driver must ensure that
    *      ((URB_handle_size * URB_num_handle) - CURBE - 32) <=
    *      URB_allocation_in_L3."
    */
   assert(urb->idrt_entry_count + urb->curbe_entry_count +
          urb->urb_entry_count * urb->urb_entry_size <=
          info->cv_urb_alloc_size / 32);

   return true;
}

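/*
 * Return the last CURBE entry (in 256-bit units) read by this interface,
 * counting both cross-thread and per-thread constant data.  Illustrative
 * example (numbers are hypothetical): with curbe_read_offset 0, one
 * cross-thread entry, two per-thread entries, and a thread group size of 8,
 * the read end is 0 + 1 + 2 * 8 = 17 entries.
 */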
static int
compute_interface_get_gen6_read_end(const struct ilo_dev *dev,
                                    const struct ilo_state_compute_interface_info *interface)
{
   const int per_thread_read = (interface->curbe_read_length + 31) / 32;
   const int cross_thread_read =
      (interface->cross_thread_curbe_read_length + 31) / 32;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(interface->curbe_read_offset % 32 == 0);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
    *
    *     "(Constant URB Entry Read Length) [0,63]"
    */
   assert(per_thread_read <= 63);

   /*
    * From the Haswell PRM, volume 2d, page 199:
    *
    *     "(Cross-Thread Constant Data Read Length) [0,127]"
    */
   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
      assert(cross_thread_read <= 127);
   else
      assert(!cross_thread_read);

   if (per_thread_read || cross_thread_read) {
      return interface->curbe_read_offset / 32 + cross_thread_read +
         per_thread_read * interface->thread_group_size;
   } else {
      return 0;
   }
}

static bool
compute_validate_gen6(const struct ilo_dev *dev,
                      const struct ilo_state_compute_info *info,
                      const struct compute_urb_configuration *urb)
{
   int min_curbe_entry_count;
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(info->interface_count <= urb->idrt_entry_count);

   min_curbe_entry_count = 0;
   for (i = 0; i < info->interface_count; i++) {
      const int read_end =
         compute_interface_get_gen6_read_end(dev, &info->interfaces[i]);

      if (min_curbe_entry_count < read_end)
         min_curbe_entry_count = read_end;
   }

   assert(min_curbe_entry_count <= urb->curbe_entry_count);

   /*
    * From the Broadwell PRM, volume 2b, page 452:
    *
    *     "CURBE Allocation Size should be 0 for GPGPU workloads that uses
    *      indirect instead of CURBE."
    */
   if (!min_curbe_entry_count)
      assert(!urb->curbe_entry_count);

   return true;
}

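/*
 * Encode the per-thread scratch size for Gen6/Gen7, where the field counts
 * 1KB steps starting at 1KB.  Worked example (illustrative value): a request
 * of 5000 bytes gives (5000 - 1) / 1024 = 4, which selects
 * 1024 * (1 + 4) = 5KB of scratch per thread.
 */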
static uint32_t
compute_get_gen6_per_thread_scratch_size(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_info *info,
                                         uint8_t *per_thread_space)
{
   ILO_DEV_ASSERT(dev, 6, 7);

   /*
    * From the Sandy Bridge PRM, volume 2 part 2, page 30:
    *
    *     "(Per Thread Scratch Space)
    *      Range = [0,11] indicating [1k bytes, 12k bytes] [DevSNB]"
    */
   assert(info->per_thread_scratch_size <= 12 * 1024);

   if (!info->per_thread_scratch_size) {
      *per_thread_space = 0;
      return 0;
   }

   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
      (info->per_thread_scratch_size - 1) / 1024 : 0;

   return 1024 * (1 + *per_thread_space);
}

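/*
 * Encode the per-thread scratch size for Gen7.5+, where the field selects a
 * power-of-two size.  Worked example (illustrative value): a request of 12KB
 * gives util_last_bit(12288 - 1) - 10 = 4, for which the function returns
 * 1 << (10 + 4) = 16KB of scratch per thread.
 */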
static uint32_t
compute_get_gen75_per_thread_scratch_size(const struct ilo_dev *dev,
                                          const struct ilo_state_compute_info *info,
                                          uint8_t *per_thread_space)
{
   ILO_DEV_ASSERT(dev, 7.5, 8);

   /*
    * From the Haswell PRM, volume 2b, page 407:
    *
    *     "(Per Thread Scratch Space)
    *      [0,10] Indicating [2k bytes, 2 Mbytes]"
    *
    *     "Note: The scratch space should be declared as 2x the desired
    *      scratch space. The stack will start at the half-way point instead
    *      of the end. The upper half of scratch space will not be accessed
    *      and so does not have to be allocated in memory."
    *
    * From the Broadwell PRM, volume 2a, page 450:
    *
    *     "(Per Thread Scratch Space)
    *      [0,11] indicating [1k bytes, 2 Mbytes]"
    */
   assert(info->per_thread_scratch_size <=
          ((ilo_dev_gen(dev) >= ILO_GEN(8)) ? 2 : 1) * 1024 * 1024);

   if (!info->per_thread_scratch_size) {
      *per_thread_space = 0;
      return 0;
   }

   /* next power of two, starting from 1KB */
   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
      (util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0;

   return 1 << (10 + *per_thread_space);
}

static bool
compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
                                 const struct ilo_dev *dev,
                                 const struct ilo_state_compute_info *info)
{
   struct compute_urb_configuration urb;
   uint32_t per_thread_size;
   uint8_t per_thread_space;

   uint32_t dw1, dw2, dw4;

   ILO_DEV_ASSERT(dev, 6, 8);

   if (!compute_get_gen6_urb_configuration(dev, info, &urb) ||
       !compute_validate_gen6(dev, info, &urb))
      return false;

   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
      per_thread_size = compute_get_gen75_per_thread_scratch_size(dev,
            info, &per_thread_space);
   } else {
      per_thread_size = compute_get_gen6_per_thread_scratch_size(dev,
            info, &per_thread_space);
   }

   dw1 = per_thread_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;

   dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
         urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
         GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
         GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL;

   if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5))
      dw2 |= GEN7_VFE_DW2_GPGPU_MODE;

   assert(urb.urb_entry_size);

   dw4 = (urb.urb_entry_size - 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT |
         urb.curbe_entry_count << GEN6_VFE_DW4_CURBE_SIZE__SHIFT;

   STATIC_ASSERT(ARRAY_SIZE(compute->vfe) >= 3);
   compute->vfe[0] = dw1;
   compute->vfe[1] = dw2;
   compute->vfe[2] = dw4;

   compute->scratch_size = per_thread_size * dev->thread_count;

   return true;
}

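/*
 * Pack the sampler count into the coarse encoding used by the interface
 * descriptor: samplers are counted in groups of four (0 = none, 1 = 1-4,
 * 2 = 5-8, 3 = 9-12, 4 = 13-16), and larger counts are clamped to 4.
 */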
static uint8_t
compute_interface_get_gen6_sampler_count(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 6, 8);
   return (interface->sampler_count <= 12) ?
      (interface->sampler_count + 3) / 4 : 4;
}

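/*
 * Clamp the binding table size to what the interface descriptor field can
 * hold: up to 31 surfaces are reported as-is, anything larger is clamped.
 */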
static uint8_t
compute_interface_get_gen6_surface_count(const struct ilo_dev *dev,
                                         const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 6, 8);
   return (interface->surface_count <= 31) ? interface->surface_count : 31;
}

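/*
 * Convert the SLM size to a number of 4KB blocks, rounded up to a power of
 * two as the hardware requires.  Worked example (illustrative value): a
 * 20KB request occupies 5 blocks, which rounds up to 8 blocks (32KB).
 */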
static uint8_t
compute_interface_get_gen7_slm_size(const struct ilo_dev *dev,
                                    const struct ilo_state_compute_interface_info *interface)
{
   ILO_DEV_ASSERT(dev, 7, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 61:
    *
    *     "The amount is specified in 4k blocks, but only powers of 2 are
    *      allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice."
    */
   assert(interface->slm_size <= 64 * 1024);

   return util_next_power_of_two((interface->slm_size + 4095) / 4096);
}

static bool
compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute *compute,
                                           const struct ilo_dev *dev,
                                           const struct ilo_state_compute_info *info)
{
   uint8_t i;

   ILO_DEV_ASSERT(dev, 6, 8);

   for (i = 0; i < info->interface_count; i++) {
      const struct ilo_state_compute_interface_info *interface =
         &info->interfaces[i];
      uint16_t read_offset, per_thread_read_len, cross_thread_read_len;
      uint8_t sampler_count, surface_count;
      uint32_t dw0, dw2, dw3, dw4, dw5, dw6;

      assert(interface->kernel_offset % 64 == 0);
      assert(interface->thread_group_size);

      read_offset = interface->curbe_read_offset / 32;
      per_thread_read_len = (interface->curbe_read_length + 31) / 32;
      cross_thread_read_len =
         (interface->cross_thread_curbe_read_length + 31) / 32;

      sampler_count =
         compute_interface_get_gen6_sampler_count(dev, interface);
      surface_count =
         compute_interface_get_gen6_surface_count(dev, interface);

      dw0 = interface->kernel_offset;
      dw2 = sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
      dw3 = surface_count << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
      dw4 = per_thread_read_len << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
            read_offset << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;

      dw5 = 0;
      dw6 = 0;
      if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
         const uint8_t slm_size =
            compute_interface_get_gen7_slm_size(dev, interface);

         dw5 |= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;

         if (slm_size) {
            dw5 |= GEN7_IDRT_DW5_BARRIER_ENABLE |
                   slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT;
         }

         /*
          * From the Haswell PRM, volume 2d, page 199:
          *
          *     "(Number of Threads in GPGPU Thread Group) Specifies the
          *      number of threads that are in this thread group. Used to
          *      program the barrier for the number of messages to expect. The
          *      minimum value is 0 (which will disable the barrier), while
          *      the maximum value is the number of threads in a subslice for
          *      local barriers."
          *
          * From the Broadwell PRM, volume 2d, page 183:
          *
          *     "(Number of Threads in GPGPU Thread Group) Specifies the
          *      number of threads that are in this thread group. The minimum
          *      value is 1, while the maximum value is the number of threads
          *      in a subslice for local barriers. See vol1b Configurations
          *      for the number of threads per subslice for different
          *      products. The maximum value for global barriers is limited
          *      by the number of threads in the system, or by 511, whichever
          *      is lower. This field should not be set to 0 even if the
          *      barrier is disabled, since an accurate value is needed for
          *      proper pre-emption."
          */
         if (slm_size || ilo_dev_gen(dev) >= ILO_GEN(8)) {
            dw5 |= interface->thread_group_size <<
               GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
         }

         if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
            dw6 |= cross_thread_read_len <<
               GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT;
         }
      }

      STATIC_ASSERT(ARRAY_SIZE(compute->idrt[i]) >= 6);
      compute->idrt[i][0] = dw0;
      compute->idrt[i][1] = dw2;
      compute->idrt[i][2] = dw3;
      compute->idrt[i][3] = dw4;
      compute->idrt[i][4] = dw5;
      compute->idrt[i][5] = dw6;
   }

   return true;
}

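/*
 * Validate the info and pack the MEDIA_VFE_STATE and
 * INTERFACE_DESCRIPTOR_DATA payloads into the compute state.  A rough caller
 * sketch (illustrative only; the local names and values below are
 * hypothetical, not taken from an actual caller):
 *
 *    struct ilo_state_compute compute;
 *    struct ilo_state_compute_info info;
 *
 *    memset(&compute, 0, sizeof(compute));
 *    memset(&info, 0, sizeof(info));
 *
 *    info.data_size = ilo_state_compute_data_size(dev, interface_count);
 *    info.data = calloc(1, info.data_size);
 *    info.interfaces = interfaces;
 *    info.interface_count = interface_count;
 *    info.cv_urb_alloc_size = cv_urb_alloc_size;
 *    info.curbe_alloc_size = curbe_alloc_size;
 *    info.per_thread_scratch_size = per_thread_scratch_size;
 *
 *    if (!ilo_state_compute_init(&compute, dev, &info))
 *       return false;
 */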
bool
ilo_state_compute_init(struct ilo_state_compute *compute,
                       const struct ilo_dev *dev,
                       const struct ilo_state_compute_info *info)
{
   bool ret = true;

   assert(ilo_is_zeroed(compute, sizeof(*compute)));
   assert(ilo_is_zeroed(info->data, info->data_size));

   assert(ilo_state_compute_data_size(dev, info->interface_count) <=
          info->data_size);
   compute->idrt = (uint32_t (*)[6]) info->data;

   ret &= compute_set_gen6_MEDIA_VFE_STATE(compute, dev, info);
   ret &= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute, dev, info);

   assert(ret);

   return ret;
}