/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 2012-2015 LunarG, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chia-I Wu <olv@lunarg.com>
 */

28 #include "ilo_debug.h"
29 #include "ilo_state_compute.h"
/**
 * URB partitioning computed for MEDIA_VFE_STATE.
 *
 * NOTE(review): only the declaration of curbe_entry_count survived in the
 * garbled text.  The remaining members are reconstructed from their uses
 * elsewhere in this file and are assumed to be plain ints -- confirm.
 */
struct compute_urb_configuration {
   /* number of entries reserved for interface descriptors */
   int idrt_entry_count;
   /* number of entries allocated to CURBE data */
   int curbe_entry_count;

   int urb_entry_count;
   /* in 256-bit register increments */
   int urb_entry_size;
};

41 get_gen6_rob_entry_count(const struct ilo_dev
*dev
)
43 ILO_DEV_ASSERT(dev
, 6, 8);
46 * From the Ivy Bridge PRM, volume 2 part 2, page 60:
48 * "ROB has 64KB of storage; 2048 entries."
50 * From the valid ranges of "CURBE Allocation Size", we can also conclude
51 * that interface entries and CURBE data must be in ROB. And that ROB
52 * should be 16KB, or 512 entries, on Gen7 GT1.
54 if (ilo_dev_gen(dev
) >= ILO_GEN(7.5))
56 else if (ilo_dev_gen(dev
) >= ILO_GEN(7))
57 return (dev
->gt
== 2) ? 2048 : 512;
59 return (dev
->gt
== 2) ? 2048 : 1024;
/**
 * Return the number of URB entries reserved for interface descriptors:
 * 64 on Gen7.5 and later, 32 otherwise.
 */
static int
get_gen6_idrt_entry_count(const struct ilo_dev *dev)
{
   ILO_DEV_ASSERT(dev, 6, 8);

   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "The first 32 URB entries are reserved for the interface [...]"
    *
    * From the Haswell PRM, volume 7, page 836:
    *
    *     "The first 64 URB entries are reserved for the interface [...]"
    */
   return (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 64 : 32;
}

/**
 * Convert \p curbe_size into a CURBE entry count by rounding it up to a
 * multiple of 32 and dividing by 32.
 *
 * Asserts that the interface descriptor entries plus the CURBE entries fit
 * in the ROB.
 */
static int
get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size)
{
   /*
    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
    *
    *     "(CURBE Allocation Size) Specifies the total length allocated for
    *      CURBE, in 256-bit register increments. [...]"
    */
   const int entry_count = (curbe_size + 31) / 32;

   ILO_DEV_ASSERT(dev, 6, 8);

   assert(get_gen6_idrt_entry_count(dev) + entry_count <=
          get_gen6_rob_entry_count(dev));

   /* NOTE(review): return statement reconstructed; not visible in source */
   return entry_count;
}

101 compute_get_gen6_urb_configuration(const struct ilo_dev
*dev
,
102 const struct ilo_state_compute_info
*info
,
103 struct compute_urb_configuration
*urb
)
105 ILO_DEV_ASSERT(dev
, 6, 8);
107 urb
->idrt_entry_count
= get_gen6_idrt_entry_count(dev
);
108 urb
->curbe_entry_count
=
109 get_gen6_curbe_entry_count(dev
, info
->curbe_alloc_size
);
112 * From the Broadwell PRM, volume 2b, page 451:
114 * "Please note that 0 is not allowed for this field (Number of URB
117 urb
->urb_entry_count
= (ilo_dev_gen(dev
) >= ILO_GEN(8)) ? 1 : 0;
120 * From the Ivy Bridge PRM, volume 2 part 2, page 52:
122 * "(URB Entry Allocation Size) Specifies the length of each URB entry
123 * used by the unit, in 256-bit register increments - 1."
125 urb
->urb_entry_size
= 1;
128 * From the Ivy Bridge PRM, volume 2 part 2, page 22:
130 * MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle
131 * size and the number of URB handles. The driver must ensure that
132 * ((URB_handle_size * URB_num_handle) - CURBE - 32) <=
133 * URB_allocation_in_L3."
135 assert(urb
->idrt_entry_count
+ urb
->curbe_entry_count
+
136 urb
->urb_entry_count
* urb
->urb_entry_size
<=
137 info
->cv_urb_alloc_size
/ 32);
143 compute_interface_get_gen6_read_end(const struct ilo_dev
*dev
,
144 const struct ilo_state_compute_interface_info
*interface
)
146 const int per_thread_read
= (interface
->curbe_read_length
+ 31) / 32;
147 const int cross_thread_read
=
148 (interface
->cross_thread_curbe_read_length
+ 31) / 32;
150 ILO_DEV_ASSERT(dev
, 6, 8);
152 assert(interface
->curbe_read_offset
% 32 == 0);
155 * From the Ivy Bridge PRM, volume 2 part 2, page 60:
157 * "(Constant URB Entry Read Length) [0,63]"
159 assert(per_thread_read
<= 63);
162 * From the Haswell PRM, volume 2d, page 199:
164 * "(Cross-Thread Constant Data Read Length) [0,127]"
166 if (ilo_dev_gen(dev
) >= ILO_GEN(7.5))
167 assert(cross_thread_read
<= 127);
169 assert(!cross_thread_read
);
171 if (per_thread_read
|| cross_thread_read
) {
172 return interface
->curbe_read_offset
/ 32 + cross_thread_read
+
173 per_thread_read
* interface
->thread_group_size
;
180 compute_validate_gen6(const struct ilo_dev
*dev
,
181 const struct ilo_state_compute_info
*info
,
182 const struct compute_urb_configuration
*urb
)
184 int min_curbe_entry_count
;
187 ILO_DEV_ASSERT(dev
, 6, 8);
189 assert(info
->interface_count
<= urb
->idrt_entry_count
);
191 min_curbe_entry_count
= 0;
192 for (i
= 0; i
< info
->interface_count
; i
++) {
194 compute_interface_get_gen6_read_end(dev
, &info
->interfaces
[i
]);
196 if (min_curbe_entry_count
< read_end
)
197 min_curbe_entry_count
= read_end
;
200 assert(min_curbe_entry_count
<= urb
->curbe_entry_count
);
203 * From the Broadwell PRM, volume 2b, page 452:
205 * "CURBE Allocation Size should be 0 for GPGPU workloads that uses
206 * indirect instead of CURBE."
208 if (!min_curbe_entry_count
)
209 assert(!urb
->curbe_entry_count
);
215 compute_get_gen6_per_thread_scratch_size(const struct ilo_dev
*dev
,
216 const struct ilo_state_compute_info
*info
,
217 uint8_t *per_thread_space
)
219 ILO_DEV_ASSERT(dev
, 6, 7);
222 * From the Sandy Bridge PRM, volume 2 part 2, page 30:
224 * "(Per Thread Scratch Space)
225 * Range = [0,11] indicating [1k bytes, 12k bytes] [DevSNB]"
227 assert(info
->per_thread_scratch_size
<= 12 * 1024);
229 if (!info
->per_thread_scratch_size
) {
230 *per_thread_space
= 0;
234 *per_thread_space
= (info
->per_thread_scratch_size
> 1024) ?
235 (info
->per_thread_scratch_size
- 1) / 1024 : 0;
237 return 1024 * (1 + *per_thread_space
);
241 compute_get_gen75_per_thread_scratch_size(const struct ilo_dev
*dev
,
242 const struct ilo_state_compute_info
*info
,
243 uint8_t *per_thread_space
)
245 ILO_DEV_ASSERT(dev
, 7.5, 8);
248 * From the Haswell PRM, volume 2b, page 407:
250 * "(Per Thread Scratch Space)
251 * [0,10] Indicating [2k bytes, 2 Mbytes]"
253 * "Note: The scratch space should be declared as 2x the desired
254 * scratch space. The stack will start at the half-way point instead
255 * of the end. The upper half of scratch space will not be accessed
256 * and so does not have to be allocated in memory."
258 * From the Broadwell PRM, volume 2a, page 450:
260 * "(Per Thread Scratch Space)
261 * [0,11] indicating [1k bytes, 2 Mbytes]"
263 assert(info
->per_thread_scratch_size
<=
264 ((ilo_dev_gen(dev
) >= ILO_GEN(8)) ? 2 : 1) * 1024 * 1024);
266 if (!info
->per_thread_scratch_size
) {
267 *per_thread_space
= 0;
271 /* next power of two, starting from 1KB */
272 *per_thread_space
= (info
->per_thread_scratch_size
> 1024) ?
273 (util_last_bit(info
->per_thread_scratch_size
- 1) - 10) : 0;
275 return 1 << (10 + *per_thread_space
);
279 compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute
*compute
,
280 const struct ilo_dev
*dev
,
281 const struct ilo_state_compute_info
*info
)
283 struct compute_urb_configuration urb
;
284 uint32_t per_thread_size
;
285 uint8_t per_thread_space
;
287 uint32_t dw1
, dw2
, dw4
;
289 ILO_DEV_ASSERT(dev
, 6, 8);
291 if (!compute_get_gen6_urb_configuration(dev
, info
, &urb
) ||
292 !compute_validate_gen6(dev
, info
, &urb
))
295 if (ilo_dev_gen(dev
) >= ILO_GEN(7.5)) {
296 per_thread_size
= compute_get_gen75_per_thread_scratch_size(dev
,
297 info
, &per_thread_space
);
299 per_thread_size
= compute_get_gen6_per_thread_scratch_size(dev
,
300 info
, &per_thread_space
);
303 dw1
= per_thread_space
<< GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT
;
305 dw2
= (dev
->thread_count
- 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT
|
306 urb
.urb_entry_count
<< GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT
|
307 GEN6_VFE_DW2_RESET_GATEWAY_TIMER
|
308 GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL
;
310 if (ilo_dev_gen(dev
) >= ILO_GEN(7) && ilo_dev_gen(dev
) <= ILO_GEN(7.5))
311 dw2
|= GEN7_VFE_DW2_GPGPU_MODE
;
313 assert(urb
.urb_entry_size
);
315 dw4
= (urb
.urb_entry_size
- 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT
|
316 urb
.curbe_entry_count
<< GEN6_VFE_DW4_CURBE_SIZE__SHIFT
;
318 STATIC_ASSERT(ARRAY_SIZE(compute
->vfe
) >= 3);
319 compute
->vfe
[0] = dw1
;
320 compute
->vfe
[1] = dw2
;
321 compute
->vfe
[2] = dw4
;
323 compute
->scratch_size
= per_thread_size
* dev
->thread_count
;
329 compute_interface_get_gen6_sampler_count(const struct ilo_dev
*dev
,
330 const struct ilo_state_compute_interface_info
*interface
)
332 ILO_DEV_ASSERT(dev
, 6, 8);
333 return (interface
->sampler_count
<= 12) ?
334 (interface
->sampler_count
+ 3) / 4 : 4;
338 compute_interface_get_gen6_surface_count(const struct ilo_dev
*dev
,
339 const struct ilo_state_compute_interface_info
*interface
)
341 ILO_DEV_ASSERT(dev
, 6, 8);
342 return (interface
->surface_count
<= 31) ? interface
->surface_count
: 31;
346 compute_interface_get_gen7_slm_size(const struct ilo_dev
*dev
,
347 const struct ilo_state_compute_interface_info
*interface
)
349 ILO_DEV_ASSERT(dev
, 7, 8);
352 * From the Ivy Bridge PRM, volume 2 part 2, page 61:
354 * "The amount is specified in 4k blocks, but only powers of 2 are
355 * allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice."
357 assert(interface
->slm_size
<= 64 * 1024);
359 return util_next_power_of_two((interface
->slm_size
+ 4095) / 4096);
363 compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute
*compute
,
364 const struct ilo_dev
*dev
,
365 const struct ilo_state_compute_info
*info
)
369 ILO_DEV_ASSERT(dev
, 6, 8);
371 for (i
= 0; i
< info
->interface_count
; i
++) {
372 const struct ilo_state_compute_interface_info
*interface
=
373 &info
->interfaces
[i
];
374 uint16_t read_offset
, per_thread_read_len
, cross_thread_read_len
;
375 uint8_t sampler_count
, surface_count
;
376 uint32_t dw0
, dw2
, dw3
, dw4
, dw5
, dw6
;
378 assert(interface
->kernel_offset
% 64 == 0);
379 assert(interface
->thread_group_size
);
381 read_offset
= interface
->curbe_read_offset
/ 32;
382 per_thread_read_len
= (interface
->curbe_read_length
+ 31) / 32;
383 cross_thread_read_len
=
384 (interface
->cross_thread_curbe_read_length
+ 31) / 32;
387 compute_interface_get_gen6_sampler_count(dev
, interface
);
389 compute_interface_get_gen6_surface_count(dev
, interface
);
391 dw0
= interface
->kernel_offset
;
392 dw2
= sampler_count
<< GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT
;
393 dw3
= surface_count
<< GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT
;
394 dw4
= per_thread_read_len
<< GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT
|
395 read_offset
<< GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT
;
399 if (ilo_dev_gen(dev
) >= ILO_GEN(7)) {
400 const uint8_t slm_size
=
401 compute_interface_get_gen7_slm_size(dev
, interface
);
403 dw5
|= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE
;
406 dw5
|= GEN7_IDRT_DW5_BARRIER_ENABLE
|
407 slm_size
<< GEN7_IDRT_DW5_SLM_SIZE__SHIFT
;
411 * From the Haswell PRM, volume 2d, page 199:
413 * "(Number of Threads in GPGPU Thread Group) Specifies the
414 * number of threads that are in this thread group. Used to
415 * program the barrier for the number of messages to expect. The
416 * minimum value is 0 (which will disable the barrier), while
417 * the maximum value is the number of threads in a subslice for
420 * From the Broadwell PRM, volume 2d, page 183:
422 * "(Number of Threads in GPGPU Thread Group) Specifies the
423 * number of threads that are in this thread group. The minimum
424 * value is 1, while the maximum value is the number of threads
425 * in a subslice for local barriers. See vol1b Configurations
426 * for the number of threads per subslice for different
427 * products. The maximum value for global barriers is limited
428 * by the number of threads in the system, or by 511, whichever
429 * is lower. This field should not be set to 0 even if the
430 * barrier is disabled, since an accurate value is needed for
431 * proper pre-emption."
433 if (slm_size
|| ilo_dev_gen(dev
) >= ILO_GEN(8)) {
434 dw5
|= interface
->thread_group_size
<<
435 GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT
;
438 if (ilo_dev_gen(dev
) >= ILO_GEN(7.5)) {
439 dw6
|= cross_thread_read_len
<<
440 GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT
;
444 STATIC_ASSERT(ARRAY_SIZE(compute
->idrt
[i
]) >= 6);
445 compute
->idrt
[i
][0] = dw0
;
446 compute
->idrt
[i
][1] = dw2
;
447 compute
->idrt
[i
][2] = dw3
;
448 compute
->idrt
[i
][3] = dw4
;
449 compute
->idrt
[i
][4] = dw5
;
450 compute
->idrt
[i
][5] = dw6
;
457 ilo_state_compute_init(struct ilo_state_compute
*compute
,
458 const struct ilo_dev
*dev
,
459 const struct ilo_state_compute_info
*info
)
463 assert(ilo_is_zeroed(compute
, sizeof(*compute
)));
464 assert(ilo_is_zeroed(info
->data
, info
->data_size
));
466 assert(ilo_state_compute_data_size(dev
, info
->interface_count
) <=
468 compute
->idrt
= (uint32_t (*)[6]) info
->data
;
470 ret
&= compute_set_gen6_MEDIA_VFE_STATE(compute
, dev
, info
);
471 ret
&= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute
, dev
, info
);