2 // Copyright 2012 Francisco Jerez
4 // Permission is hereby granted, free of charge, to any person obtaining a
5 // copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following conditions:
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 // OTHER DEALINGS IN THE SOFTWARE.
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
29 using namespace clover
;
31 kernel::kernel(clover::program
&prog
, const std::string
&name
,
32 const std::vector
<module::argument
> &margs
) :
33 program(prog
), _name(name
), exec(*this),
34 program_ref(prog
._kernel_ref_counter
) {
35 for (auto &marg
: margs
) {
36 if (marg
.semantic
== module::argument::general
)
37 _args
.emplace_back(argument::create(marg
));
42 static inline std::vector
<uint
>
43 pad_vector(command_queue
&q
, const V
&v
, uint x
) {
44 std::vector
<uint
> w
{ v
.begin(), v
.end() };
45 w
.resize(q
.device().max_block_size().size(), x
);
50 kernel::launch(command_queue
&q
,
51 const std::vector
<size_t> &grid_offset
,
52 const std::vector
<size_t> &grid_size
,
53 const std::vector
<size_t> &block_size
) {
54 const auto m
= program().binary(q
.device());
55 const auto reduced_grid_size
=
56 map(divides(), grid_size
, block_size
);
57 void *st
= exec
.bind(&q
, grid_offset
);
58 struct pipe_grid_info info
= {};
60 // The handles are created during exec_context::bind(), so we need make
61 // sure to call exec_context::bind() before retrieving them.
62 std::vector
<uint32_t *> g_handles
= map([&](size_t h
) {
63 return (uint32_t *)&exec
.input
[h
];
66 q
.pipe
->bind_compute_state(q
.pipe
, st
);
67 q
.pipe
->bind_sampler_states(q
.pipe
, PIPE_SHADER_COMPUTE
,
68 0, exec
.samplers
.size(),
69 exec
.samplers
.data());
71 q
.pipe
->set_sampler_views(q
.pipe
, PIPE_SHADER_COMPUTE
, 0,
72 exec
.sviews
.size(), exec
.sviews
.data());
73 q
.pipe
->set_compute_resources(q
.pipe
, 0, exec
.resources
.size(),
74 exec
.resources
.data());
75 q
.pipe
->set_global_binding(q
.pipe
, 0, exec
.g_buffers
.size(),
76 exec
.g_buffers
.data(), g_handles
.data());
78 // Fill information for the launch_grid() call.
79 info
.work_dim
= grid_size
.size();
80 copy(pad_vector(q
, block_size
, 1), info
.block
);
81 copy(pad_vector(q
, reduced_grid_size
, 1), info
.grid
);
82 info
.pc
= find(name_equals(_name
), m
.syms
).offset
;
83 info
.input
= exec
.input
.data();
85 q
.pipe
->launch_grid(q
.pipe
, &info
);
87 q
.pipe
->set_global_binding(q
.pipe
, 0, exec
.g_buffers
.size(), NULL
, NULL
);
88 q
.pipe
->set_compute_resources(q
.pipe
, 0, exec
.resources
.size(), NULL
);
89 q
.pipe
->set_sampler_views(q
.pipe
, PIPE_SHADER_COMPUTE
, 0,
90 exec
.sviews
.size(), NULL
);
91 q
.pipe
->bind_sampler_states(q
.pipe
, PIPE_SHADER_COMPUTE
, 0,
92 exec
.samplers
.size(), NULL
);
94 q
.pipe
->memory_barrier(q
.pipe
, PIPE_BARRIER_GLOBAL_BUFFER
);
99 kernel::mem_local() const {
102 for (auto &arg
: args()) {
103 if (dynamic_cast<local_argument
*>(&arg
))
// NOTE(review): only the first line of this definition is visible in this
// excerpt; the return type and the body are missing.  Presumably it reports
// the private memory used by the kernel -- confirm against the full file.
111 kernel::mem_private() const {
// NOTE(review): only the first line of this definition is visible in this
// excerpt; the return type and the body are missing.  Presumably it returns
// the _name member initialized by the constructor -- confirm against the
// full file.
116 kernel::name() const {
121 kernel::optimal_block_size(const command_queue
&q
,
122 const std::vector
<size_t> &grid_size
) const {
123 return factor::find_grid_optimal_factor
<size_t>(
124 q
.device().max_threads_per_block(), q
.device().max_block_size(),
// NOTE(review): only the first line of this definition is visible in this
// excerpt; the return type and the body are missing.  Presumably it reports
// a block size the kernel must be launched with -- confirm against the
// full file.
129 kernel::required_block_size() const {
133 kernel::argument_range
135 return map(derefs(), _args
);
138 kernel::const_argument_range
139 kernel::args() const {
140 return map(derefs(), _args
);
144 kernel::module(const command_queue
&q
) const {
145 return program().binary(q
.device());
148 kernel::exec_context::exec_context(kernel
&kern
) :
149 kern(kern
), q(NULL
), mem_local(0), st(NULL
), cs() {
152 kernel::exec_context::~exec_context() {
154 q
->pipe
->delete_compute_state(q
->pipe
, st
);
158 kernel::exec_context::bind(intrusive_ptr
<command_queue
> _q
,
159 const std::vector
<size_t> &grid_offset
) {
162 // Bind kernel arguments.
163 auto &m
= kern
.program().binary(q
->device());
164 auto margs
= find(name_equals(kern
.name()), m
.syms
).args
;
165 auto msec
= find(type_equals(module::section::text
), m
.secs
);
166 auto explicit_arg
= kern
._args
.begin();
168 for (auto &marg
: margs
) {
169 switch (marg
.semantic
) {
170 case module::argument::general
:
171 (*(explicit_arg
++))->bind(*this, marg
);
174 case module::argument::grid_dimension
: {
175 const cl_uint dimension
= grid_offset
.size();
176 auto arg
= argument::create(marg
);
178 arg
->set(sizeof(dimension
), &dimension
);
179 arg
->bind(*this, marg
);
182 case module::argument::grid_offset
: {
183 for (cl_uint x
: pad_vector(*q
, grid_offset
, 0)) {
184 auto arg
= argument::create(marg
);
186 arg
->set(sizeof(x
), &x
);
187 arg
->bind(*this, marg
);
191 case module::argument::image_size
: {
192 auto img
= dynamic_cast<image_argument
&>(**(explicit_arg
- 1)).get();
193 std::vector
<cl_uint
> image_size
{
194 static_cast<cl_uint
>(img
->width()),
195 static_cast<cl_uint
>(img
->height()),
196 static_cast<cl_uint
>(img
->depth())};
197 for (auto x
: image_size
) {
198 auto arg
= argument::create(marg
);
200 arg
->set(sizeof(x
), &x
);
201 arg
->bind(*this, marg
);
205 case module::argument::image_format
: {
206 auto img
= dynamic_cast<image_argument
&>(**(explicit_arg
- 1)).get();
207 cl_image_format fmt
= img
->format();
208 std::vector
<cl_uint
> image_format
{
209 static_cast<cl_uint
>(fmt
.image_channel_data_type
),
210 static_cast<cl_uint
>(fmt
.image_channel_order
)};
211 for (auto x
: image_format
) {
212 auto arg
= argument::create(marg
);
214 arg
->set(sizeof(x
), &x
);
215 arg
->bind(*this, marg
);
222 // Create a new compute state if anything changed.
223 if (!st
|| q
!= _q
||
224 cs
.req_local_mem
!= mem_local
||
225 cs
.req_input_mem
!= input
.size()) {
227 _q
->pipe
->delete_compute_state(_q
->pipe
, st
);
229 cs
.ir_type
= q
->device().ir_format();
230 cs
.prog
= &(msec
.data
[0]);
231 cs
.req_local_mem
= mem_local
;
232 cs
.req_input_mem
= input
.size();
233 st
= q
->pipe
->create_compute_state(q
->pipe
, &cs
);
240 kernel::exec_context::unbind() {
241 for (auto &arg
: kern
.args())
namespace {
   ///
   /// Return a copy of the object representation of \a x, i.e. its
   /// sizeof(x) bytes in memory order.
   ///
   template<typename T>
   std::vector<uint8_t>
   bytes(const T &x) {
      // Keep the const qualification of \a x instead of casting it away.
      const uint8_t *first = reinterpret_cast<const uint8_t *>(&x);

      return std::vector<uint8_t>(first, first + sizeof(x));
   }
}
261 /// Transform buffer \a v from the native byte order into the byte
262 /// order specified by \a e.
266 byteswap(T
&v
, pipe_endian e
) {
267 if (PIPE_ENDIAN_NATIVE
!= e
)
268 std::reverse(v
.begin(), v
.end());
namespace {
   ///
   /// Grow buffer \a v so that its size becomes the result of
   /// util_align_npot() applied to its current size and \a n.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      const size_t padded_size = util_align_npot(v.size(), n);

      v.resize(padded_size);
   }
}
281 msb(const std::vector
<uint8_t> &s
) {
282 if (PIPE_ENDIAN_NATIVE
== PIPE_ENDIAN_LITTLE
)
283 return s
.back() & 0x80;
285 return s
.front() & 0x80;
289 /// Resize buffer \a v to size \a n using sign or zero extension
290 /// according to \a ext.
294 extend(T
&v
, enum module::argument::ext_type ext
, size_t n
) {
295 const size_t m
= std::min(v
.size(), n
);
296 const bool sign_ext
= (ext
== module::argument::sign_ext
);
297 const uint8_t fill
= (sign_ext
&& msb(v
) ? ~0 : 0);
300 if (PIPE_ENDIAN_NATIVE
== PIPE_ENDIAN_LITTLE
)
301 std::copy_n(v
.begin(), m
, w
.begin());
303 std::copy_n(v
.end() - m
, m
, w
.end() - m
);
namespace {
   ///
   /// Append the contents of buffer \a w to the end of buffer \a v.
   /// \a w may be the same object as \a v, in which case its contents
   /// are duplicated.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      if (&v == &w) {
         // insert() must not be handed iterators into the destination
         // itself, so append from a snapshot instead.
         const T snapshot(w);

         v.insert(v.end(), snapshot.begin(), snapshot.end());
         return;
      }

      v.insert(v.end(), w.begin(), w.end());
   }
}
namespace {
   ///
   /// Append \a n value-initialized elements to the end of buffer \a v.
   ///
   /// \return The index of the first appended element, i.e. the size
   ///         of \a v before the call.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      const size_t pos = v.size();

      v.resize(pos + n);
      return pos;
   }
}
329 std::unique_ptr
<kernel::argument
>
330 kernel::argument::create(const module::argument
&marg
) {
332 case module::argument::scalar
:
333 return std::unique_ptr
<kernel::argument
>(new scalar_argument(marg
.size
));
335 case module::argument::global
:
336 return std::unique_ptr
<kernel::argument
>(new global_argument
);
338 case module::argument::local
:
339 return std::unique_ptr
<kernel::argument
>(new local_argument
);
341 case module::argument::constant
:
342 return std::unique_ptr
<kernel::argument
>(new constant_argument
);
344 case module::argument::image2d_rd
:
345 case module::argument::image3d_rd
:
346 return std::unique_ptr
<kernel::argument
>(new image_rd_argument
);
348 case module::argument::image2d_wr
:
349 case module::argument::image3d_wr
:
350 return std::unique_ptr
<kernel::argument
>(new image_wr_argument
);
352 case module::argument::sampler
:
353 return std::unique_ptr
<kernel::argument
>(new sampler_argument
);
356 throw error(CL_INVALID_KERNEL_DEFINITION
);
359 kernel::argument::argument() : _set(false) {
// NOTE(review): only the first line of this definition is visible in this
// excerpt; the return type and the body are missing.  Presumably it reports
// the _set flag initialized by the constructor -- confirm against the full
// file.
363 kernel::argument::set() const {
// NOTE(review): only the first line of this definition is visible in this
// excerpt; the return type and the body are missing.  Presumably this is
// the default storage size that local_argument::storage() overrides --
// confirm against the full file.
368 kernel::argument::storage() const {
372 kernel::scalar_argument::scalar_argument(size_t size
) : size(size
) {
376 kernel::scalar_argument::set(size_t size
, const void *value
) {
378 throw error(CL_INVALID_ARG_VALUE
);
380 if (size
!= this->size
)
381 throw error(CL_INVALID_ARG_SIZE
);
383 v
= { (uint8_t *)value
, (uint8_t *)value
+ size
};
388 kernel::scalar_argument::bind(exec_context
&ctx
,
389 const module::argument
&marg
) {
392 extend(w
, marg
.ext_type
, marg
.target_size
);
393 byteswap(w
, ctx
.q
->device().endianness());
394 align(ctx
.input
, marg
.target_align
);
395 insert(ctx
.input
, w
);
399 kernel::scalar_argument::unbind(exec_context
&ctx
) {
403 kernel::global_argument::set(size_t size
, const void *value
) {
404 if (size
!= sizeof(cl_mem
))
405 throw error(CL_INVALID_ARG_SIZE
);
407 buf
= pobj
<buffer
>(value
? *(cl_mem
*)value
: NULL
);
412 kernel::global_argument::bind(exec_context
&ctx
,
413 const module::argument
&marg
) {
414 align(ctx
.input
, marg
.target_align
);
417 const resource
&r
= buf
->resource(*ctx
.q
);
418 ctx
.g_handles
.push_back(ctx
.input
.size());
419 ctx
.g_buffers
.push_back(r
.pipe
);
421 // How to handle multi-demensional offsets?
422 // We don't need to. Buffer offsets are always
424 auto v
= bytes(r
.offset
[0]);
425 extend(v
, marg
.ext_type
, marg
.target_size
);
426 byteswap(v
, ctx
.q
->device().endianness());
427 insert(ctx
.input
, v
);
430 allocate(ctx
.input
, marg
.target_size
);
435 kernel::global_argument::unbind(exec_context
&ctx
) {
// NOTE(review): only the first line of this definition is visible in this
// excerpt; the return type and the body are missing.  Presumably it returns
// the _storage member that bind() adds to the local memory total -- confirm
// against the full file.
439 kernel::local_argument::storage() const {
444 kernel::local_argument::set(size_t size
, const void *value
) {
446 throw error(CL_INVALID_ARG_VALUE
);
449 throw error(CL_INVALID_ARG_SIZE
);
456 kernel::local_argument::bind(exec_context
&ctx
,
457 const module::argument
&marg
) {
458 auto v
= bytes(ctx
.mem_local
);
460 extend(v
, module::argument::zero_ext
, marg
.target_size
);
461 byteswap(v
, ctx
.q
->device().endianness());
462 align(ctx
.input
, marg
.target_align
);
463 insert(ctx
.input
, v
);
465 ctx
.mem_local
+= _storage
;
469 kernel::local_argument::unbind(exec_context
&ctx
) {
473 kernel::constant_argument::set(size_t size
, const void *value
) {
474 if (size
!= sizeof(cl_mem
))
475 throw error(CL_INVALID_ARG_SIZE
);
477 buf
= pobj
<buffer
>(value
? *(cl_mem
*)value
: NULL
);
482 kernel::constant_argument::bind(exec_context
&ctx
,
483 const module::argument
&marg
) {
484 align(ctx
.input
, marg
.target_align
);
487 resource
&r
= buf
->resource(*ctx
.q
);
488 auto v
= bytes(ctx
.resources
.size() << 24 | r
.offset
[0]);
490 extend(v
, module::argument::zero_ext
, marg
.target_size
);
491 byteswap(v
, ctx
.q
->device().endianness());
492 insert(ctx
.input
, v
);
494 st
= r
.bind_surface(*ctx
.q
, false);
495 ctx
.resources
.push_back(st
);
498 allocate(ctx
.input
, marg
.target_size
);
503 kernel::constant_argument::unbind(exec_context
&ctx
) {
505 buf
->resource(*ctx
.q
).unbind_surface(*ctx
.q
, st
);
509 kernel::image_rd_argument::set(size_t size
, const void *value
) {
511 throw error(CL_INVALID_ARG_VALUE
);
513 if (size
!= sizeof(cl_mem
))
514 throw error(CL_INVALID_ARG_SIZE
);
516 img
= &obj
<image
>(*(cl_mem
*)value
);
521 kernel::image_rd_argument::bind(exec_context
&ctx
,
522 const module::argument
&marg
) {
523 auto v
= bytes(ctx
.sviews
.size());
525 extend(v
, module::argument::zero_ext
, marg
.target_size
);
526 byteswap(v
, ctx
.q
->device().endianness());
527 align(ctx
.input
, marg
.target_align
);
528 insert(ctx
.input
, v
);
530 st
= img
->resource(*ctx
.q
).bind_sampler_view(*ctx
.q
);
531 ctx
.sviews
.push_back(st
);
535 kernel::image_rd_argument::unbind(exec_context
&ctx
) {
536 img
->resource(*ctx
.q
).unbind_sampler_view(*ctx
.q
, st
);
540 kernel::image_wr_argument::set(size_t size
, const void *value
) {
542 throw error(CL_INVALID_ARG_VALUE
);
544 if (size
!= sizeof(cl_mem
))
545 throw error(CL_INVALID_ARG_SIZE
);
547 img
= &obj
<image
>(*(cl_mem
*)value
);
552 kernel::image_wr_argument::bind(exec_context
&ctx
,
553 const module::argument
&marg
) {
554 auto v
= bytes(ctx
.resources
.size());
556 extend(v
, module::argument::zero_ext
, marg
.target_size
);
557 byteswap(v
, ctx
.q
->device().endianness());
558 align(ctx
.input
, marg
.target_align
);
559 insert(ctx
.input
, v
);
561 st
= img
->resource(*ctx
.q
).bind_surface(*ctx
.q
, true);
562 ctx
.resources
.push_back(st
);
566 kernel::image_wr_argument::unbind(exec_context
&ctx
) {
567 img
->resource(*ctx
.q
).unbind_surface(*ctx
.q
, st
);
571 kernel::sampler_argument::set(size_t size
, const void *value
) {
573 throw error(CL_INVALID_SAMPLER
);
575 if (size
!= sizeof(cl_sampler
))
576 throw error(CL_INVALID_ARG_SIZE
);
578 s
= &obj(*(cl_sampler
*)value
);
583 kernel::sampler_argument::bind(exec_context
&ctx
,
584 const module::argument
&marg
) {
585 st
= s
->bind(*ctx
.q
);
586 ctx
.samplers
.push_back(st
);
590 kernel::sampler_argument::unbind(exec_context
&ctx
) {
591 s
->unbind(*ctx
.q
, st
);