// Copyright 2012 Francisco Jerez
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include "core/kernel.hpp"
#include "core/resource.hpp"
#include "util/u_math.h"
#include "pipe/p_context.h"
28 using namespace clover
;
30 kernel::kernel(program
&prog
, const std::string
&name
,
31 const std::vector
<module::argument
> &margs
) :
32 prog(prog
), _name(name
), exec(*this) {
33 for (auto &marg
: margs
) {
34 if (marg
.type
== module::argument::scalar
)
35 _args
.emplace_back(new scalar_argument(marg
.size
));
36 else if (marg
.type
== module::argument::global
)
37 _args
.emplace_back(new global_argument
);
38 else if (marg
.type
== module::argument::local
)
39 _args
.emplace_back(new local_argument
);
40 else if (marg
.type
== module::argument::constant
)
41 _args
.emplace_back(new constant_argument
);
42 else if (marg
.type
== module::argument::image2d_rd
||
43 marg
.type
== module::argument::image3d_rd
)
44 _args
.emplace_back(new image_rd_argument
);
45 else if (marg
.type
== module::argument::image2d_wr
||
46 marg
.type
== module::argument::image3d_wr
)
47 _args
.emplace_back(new image_wr_argument
);
48 else if (marg
.type
== module::argument::sampler
)
49 _args
.emplace_back(new sampler_argument
);
51 throw error(CL_INVALID_KERNEL_DEFINITION
);
56 static inline std::vector
<uint
>
57 pad_vector(command_queue
&q
, const V
&v
, uint x
) {
58 std::vector
<uint
> w
{ v
.begin(), v
.end() };
59 w
.resize(q
.dev
.max_block_size().size(), x
);
64 kernel::launch(command_queue
&q
,
65 const std::vector
<size_t> &grid_offset
,
66 const std::vector
<size_t> &grid_size
,
67 const std::vector
<size_t> &block_size
) {
68 const auto m
= prog
.binary(q
.dev
);
69 const auto reduced_grid_size
=
70 map(divides(), grid_size
, block_size
);
71 void *st
= exec
.bind(&q
);
73 // The handles are created during exec_context::bind(), so we need make
74 // sure to call exec_context::bind() before retrieving them.
75 std::vector
<uint32_t *> g_handles
= map([&](size_t h
) {
76 return (uint32_t *)&exec
.input
[h
];
79 q
.pipe
->bind_compute_state(q
.pipe
, st
);
80 q
.pipe
->bind_sampler_states(q
.pipe
, PIPE_SHADER_COMPUTE
,
81 0, exec
.samplers
.size(),
82 exec
.samplers
.data());
84 q
.pipe
->set_sampler_views(q
.pipe
, PIPE_SHADER_COMPUTE
, 0,
85 exec
.sviews
.size(), exec
.sviews
.data());
86 q
.pipe
->set_compute_resources(q
.pipe
, 0, exec
.resources
.size(),
87 exec
.resources
.data());
88 q
.pipe
->set_global_binding(q
.pipe
, 0, exec
.g_buffers
.size(),
89 exec
.g_buffers
.data(), g_handles
.data());
91 q
.pipe
->launch_grid(q
.pipe
,
92 pad_vector(q
, block_size
, 1).data(),
93 pad_vector(q
, reduced_grid_size
, 1).data(),
94 find(name_equals(_name
), m
.syms
).offset
,
97 q
.pipe
->set_global_binding(q
.pipe
, 0, exec
.g_buffers
.size(), NULL
, NULL
);
98 q
.pipe
->set_compute_resources(q
.pipe
, 0, exec
.resources
.size(), NULL
);
99 q
.pipe
->set_sampler_views(q
.pipe
, PIPE_SHADER_COMPUTE
, 0,
100 exec
.sviews
.size(), NULL
);
101 q
.pipe
->bind_sampler_states(q
.pipe
, PIPE_SHADER_COMPUTE
, 0,
102 exec
.samplers
.size(), NULL
);
107 kernel::mem_local() const {
110 for (auto &arg
: args()) {
111 if (dynamic_cast<local_argument
*>(&arg
))
119 kernel::mem_private() const {
124 kernel::max_block_size() const {
125 return std::numeric_limits
<std::size_t>::max();
129 kernel::name() const {
134 kernel::block_size() const {
138 kernel::argument_range
140 return map(derefs(), _args
);
143 kernel::const_argument_range
144 kernel::args() const {
145 return map(derefs(), _args
);
149 kernel::module(const command_queue
&q
) const {
150 return prog
.binary(q
.dev
);
153 kernel::exec_context::exec_context(kernel
&kern
) :
154 kern(kern
), q(NULL
), mem_local(0), st(NULL
), cs() {
157 kernel::exec_context::~exec_context() {
159 q
->pipe
->delete_compute_state(q
->pipe
, st
);
163 kernel::exec_context::bind(command_queue
*_q
) {
166 // Bind kernel arguments.
167 auto &m
= kern
.prog
.binary(q
->dev
);
168 auto margs
= find(name_equals(kern
.name()), m
.syms
).args
;
169 auto msec
= find(type_equals(module::section::text
), m
.secs
);
171 for_each([=](kernel::argument
&karg
, const module::argument
&marg
) {
172 karg
.bind(*this, marg
);
173 }, kern
.args(), margs
);
175 // Create a new compute state if anything changed.
176 if (!st
|| q
!= _q
||
177 cs
.req_local_mem
!= mem_local
||
178 cs
.req_input_mem
!= input
.size()) {
180 _q
->pipe
->delete_compute_state(_q
->pipe
, st
);
182 cs
.prog
= msec
.data
.begin();
183 cs
.req_local_mem
= mem_local
;
184 cs
.req_input_mem
= input
.size();
185 st
= q
->pipe
->create_compute_state(q
->pipe
, &cs
);
192 kernel::exec_context::unbind() {
193 for (auto &arg
: kern
.args())
///
/// Convert object \a x to the vector of bytes of its in-memory
/// representation.
///
/// NOTE(review): the template header and signature were missing from
/// the garbled dump and have been restored.
///
template<typename T>
std::vector<uint8_t>
bytes(const T &x) {
   return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
}
213 /// Transform buffer \a v from the native byte order into the byte
214 /// order specified by \a e.
218 byteswap(T
&v
, pipe_endian e
) {
219 if (PIPE_ENDIAN_NATIVE
!= e
)
220 std::reverse(v
.begin(), v
.end());
///
/// Pad buffer \a v to the next multiple of \a n.
///
template<typename T>
void
align(T &v, size_t n) {
   v.resize(util_align_npot(v.size(), n));
}
233 msb(const std::vector
<uint8_t> &s
) {
234 if (PIPE_ENDIAN_NATIVE
== PIPE_ENDIAN_LITTLE
)
235 return s
.back() & 0x80;
237 return s
.front() & 0x80;
241 /// Resize buffer \a v to size \a n using sign or zero extension
242 /// according to \a ext.
246 extend(T
&v
, enum module::argument::ext_type ext
, size_t n
) {
247 const size_t m
= std::min(v
.size(), n
);
248 const bool sign_ext
= (ext
== module::argument::sign_ext
);
249 const uint8_t fill
= (sign_ext
&& msb(v
) ? ~0 : 0);
252 if (PIPE_ENDIAN_NATIVE
== PIPE_ENDIAN_LITTLE
)
253 std::copy_n(v
.begin(), m
, w
.begin());
255 std::copy_n(v
.end() - m
, m
, w
.end() - m
);
///
/// Append buffer \a w to \a v.
///
template<typename T>
void
insert(T &v, const T &w) {
   v.insert(v.end(), w.begin(), w.end());
}
///
/// Append \a n elements to the end of buffer \a v.  Returns the
/// offset at which the new elements begin.
///
/// NOTE(review): the resize and return were missing from the garbled
/// dump and have been restored.
///
template<typename T>
size_t
allocate(T &v, size_t n) {
   size_t pos = v.size();
   v.resize(pos + n);
   return pos;
}
281 kernel::argument::argument() : _set(false) {
285 kernel::argument::set() const {
290 kernel::argument::storage() const {
294 kernel::scalar_argument::scalar_argument(size_t size
) : size(size
) {
298 kernel::scalar_argument::set(size_t size
, const void *value
) {
299 if (size
!= this->size
)
300 throw error(CL_INVALID_ARG_SIZE
);
302 v
= { (uint8_t *)value
, (uint8_t *)value
+ size
};
307 kernel::scalar_argument::bind(exec_context
&ctx
,
308 const module::argument
&marg
) {
311 extend(w
, marg
.ext_type
, marg
.target_size
);
312 byteswap(w
, ctx
.q
->dev
.endianness());
313 align(ctx
.input
, marg
.target_align
);
314 insert(ctx
.input
, w
);
318 kernel::scalar_argument::unbind(exec_context
&ctx
) {
322 kernel::global_argument::set(size_t size
, const void *value
) {
323 if (size
!= sizeof(cl_mem
))
324 throw error(CL_INVALID_ARG_SIZE
);
326 buf
= &obj
<buffer
>(*(cl_mem
*)value
);
331 kernel::global_argument::bind(exec_context
&ctx
,
332 const module::argument
&marg
) {
333 align(ctx
.input
, marg
.target_align
);
334 ctx
.g_handles
.push_back(allocate(ctx
.input
, marg
.target_size
));
335 ctx
.g_buffers
.push_back(buf
->resource(*ctx
.q
).pipe
);
339 kernel::global_argument::unbind(exec_context
&ctx
) {
343 kernel::local_argument::storage() const {
348 kernel::local_argument::set(size_t size
, const void *value
) {
350 throw error(CL_INVALID_ARG_VALUE
);
357 kernel::local_argument::bind(exec_context
&ctx
,
358 const module::argument
&marg
) {
359 auto v
= bytes(ctx
.mem_local
);
361 extend(v
, module::argument::zero_ext
, marg
.target_size
);
362 byteswap(v
, ctx
.q
->dev
.endianness());
363 align(ctx
.input
, marg
.target_align
);
364 insert(ctx
.input
, v
);
366 ctx
.mem_local
+= _storage
;
370 kernel::local_argument::unbind(exec_context
&ctx
) {
374 kernel::constant_argument::set(size_t size
, const void *value
) {
375 if (size
!= sizeof(cl_mem
))
376 throw error(CL_INVALID_ARG_SIZE
);
378 buf
= &obj
<buffer
>(*(cl_mem
*)value
);
383 kernel::constant_argument::bind(exec_context
&ctx
,
384 const module::argument
&marg
) {
385 auto v
= bytes(ctx
.resources
.size() << 24);
387 extend(v
, module::argument::zero_ext
, marg
.target_size
);
388 byteswap(v
, ctx
.q
->dev
.endianness());
389 align(ctx
.input
, marg
.target_align
);
390 insert(ctx
.input
, v
);
392 st
= buf
->resource(*ctx
.q
).bind_surface(*ctx
.q
, false);
393 ctx
.resources
.push_back(st
);
397 kernel::constant_argument::unbind(exec_context
&ctx
) {
398 buf
->resource(*ctx
.q
).unbind_surface(*ctx
.q
, st
);
402 kernel::image_rd_argument::set(size_t size
, const void *value
) {
403 if (size
!= sizeof(cl_mem
))
404 throw error(CL_INVALID_ARG_SIZE
);
406 img
= &obj
<image
>(*(cl_mem
*)value
);
411 kernel::image_rd_argument::bind(exec_context
&ctx
,
412 const module::argument
&marg
) {
413 auto v
= bytes(ctx
.sviews
.size());
415 extend(v
, module::argument::zero_ext
, marg
.target_size
);
416 byteswap(v
, ctx
.q
->dev
.endianness());
417 align(ctx
.input
, marg
.target_align
);
418 insert(ctx
.input
, v
);
420 st
= img
->resource(*ctx
.q
).bind_sampler_view(*ctx
.q
);
421 ctx
.sviews
.push_back(st
);
425 kernel::image_rd_argument::unbind(exec_context
&ctx
) {
426 img
->resource(*ctx
.q
).unbind_sampler_view(*ctx
.q
, st
);
430 kernel::image_wr_argument::set(size_t size
, const void *value
) {
431 if (size
!= sizeof(cl_mem
))
432 throw error(CL_INVALID_ARG_SIZE
);
434 img
= &obj
<image
>(*(cl_mem
*)value
);
439 kernel::image_wr_argument::bind(exec_context
&ctx
,
440 const module::argument
&marg
) {
441 auto v
= bytes(ctx
.resources
.size());
443 extend(v
, module::argument::zero_ext
, marg
.target_size
);
444 byteswap(v
, ctx
.q
->dev
.endianness());
445 align(ctx
.input
, marg
.target_align
);
446 insert(ctx
.input
, v
);
448 st
= img
->resource(*ctx
.q
).bind_surface(*ctx
.q
, true);
449 ctx
.resources
.push_back(st
);
453 kernel::image_wr_argument::unbind(exec_context
&ctx
) {
454 img
->resource(*ctx
.q
).unbind_surface(*ctx
.q
, st
);
458 kernel::sampler_argument::set(size_t size
, const void *value
) {
459 if (size
!= sizeof(cl_sampler
))
460 throw error(CL_INVALID_ARG_SIZE
);
462 s
= &obj(*(cl_sampler
*)value
);
467 kernel::sampler_argument::bind(exec_context
&ctx
,
468 const module::argument
&marg
) {
469 st
= s
->bind(*ctx
.q
);
470 ctx
.samplers
.push_back(st
);
474 kernel::sampler_argument::unbind(exec_context
&ctx
) {
475 s
->unbind(*ctx
.q
, st
);