From bf045bf9b409c47019fa7d9c859eaf8d50dd7032 Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Mon, 4 Nov 2013 11:26:13 -0800 Subject: [PATCH] clover: Calculate optimal work group size when it's not specified by the user. Inspired by a patch sent to the mailing list by Tom Stellard, but using a different algorithm to calculate the optimal block size that has been found to be considerably more effective. Reviewed-by: Tom Stellard --- .../state_trackers/clover/Makefile.sources | 1 + .../state_trackers/clover/api/kernel.cpp | 41 ++++-- .../state_trackers/clover/core/kernel.cpp | 11 +- .../state_trackers/clover/core/kernel.hpp | 7 +- .../state_trackers/clover/util/factor.hpp | 131 ++++++++++++++++++ 5 files changed, 176 insertions(+), 15 deletions(-) create mode 100644 src/gallium/state_trackers/clover/util/factor.hpp diff --git a/src/gallium/state_trackers/clover/Makefile.sources b/src/gallium/state_trackers/clover/Makefile.sources index e55167733a4..520f52f133d 100644 --- a/src/gallium/state_trackers/clover/Makefile.sources +++ b/src/gallium/state_trackers/clover/Makefile.sources @@ -4,6 +4,7 @@ CPP_SOURCES := \ util/algorithm.hpp \ util/compat.cpp \ util/compat.hpp \ + util/factor.hpp \ util/functional.hpp \ util/lazy.hpp \ util/pointer.hpp \ diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp index f2f19690de7..33673633c6d 100644 --- a/src/gallium/state_trackers/clover/api/kernel.cpp +++ b/src/gallium/state_trackers/clover/api/kernel.cpp @@ -159,7 +159,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev, break; case CL_KERNEL_COMPILE_WORK_GROUP_SIZE: - buf.as_vector() = kern.block_size(); + buf.as_vector() = kern.required_block_size(); break; case CL_KERNEL_LOCAL_MEM_SIZE: @@ -220,6 +220,24 @@ namespace { if (!d_grid_size || any_of(is_zero(), grid_size)) throw error(CL_INVALID_GLOBAL_WORK_SIZE); + return grid_size; + } + + std::vector + validate_grid_offset(const command_queue &q, cl_uint dims, + const size_t *d_grid_offset) { + if (d_grid_offset) + return range(d_grid_offset, dims); + else + return std::vector(dims, 0); + } + + std::vector + validate_block_size(const command_queue &q, const kernel &kern, + cl_uint dims, const size_t *d_grid_size, + const size_t *d_block_size) { + auto grid_size = range(d_grid_size, dims); + if (d_block_size) { auto block_size = range(d_block_size, dims); @@ -233,15 +251,12 @@ namespace { if (fold(multiplies(), 1u, block_size) > q.dev.max_threads_per_block()) throw error(CL_INVALID_WORK_GROUP_SIZE); - } - } - std::vector - pad_vector(const size_t *p, unsigned n, size_t x) { - if (p) - return { p, p + n }; - else - return { n, x }; + return block_size; + + } else { + return kern.optimal_block_size(q, grid_size); + } } } @@ -254,13 +269,13 @@ clEnqueueNDRangeKernel(cl_command_queue d_q, cl_kernel d_kern, auto &q = obj(d_q); auto &kern = obj(d_kern); auto deps = objs(d_deps, num_deps); + auto grid_size = validate_grid_size(q, dims, d_grid_size); + auto grid_offset = validate_grid_offset(q, dims, d_grid_offset); + auto block_size = validate_block_size(q, kern, dims, + d_grid_size, d_block_size); validate_common(q, kern, deps); - validate_grid(q, dims, d_grid_size, d_block_size); - auto grid_offset = pad_vector(d_grid_offset, dims, 0); - auto grid_size = pad_vector(d_grid_size, dims, 1); - auto block_size = pad_vector(d_block_size, dims, 1); hard_event *hev = new hard_event( q, CL_COMMAND_NDRANGE_KERNEL, deps, [=, &kern, &q](event &) { diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp index 4670046ef93..58780d6d03c 100644 --- a/src/gallium/state_trackers/clover/core/kernel.cpp +++ b/src/gallium/state_trackers/clover/core/kernel.cpp @@ -22,6 +22,7 @@ #include "core/kernel.hpp" #include "core/resource.hpp" +#include "util/factor.hpp" #include "util/u_math.h" #include "pipe/p_context.h" @@ -126,7 +127,15 @@ kernel::name() const { } std::vector -kernel::block_size() const { +kernel::optimal_block_size(const command_queue &q, + const std::vector &grid_size) const { + return factor::find_grid_optimal_factor( + q.dev.max_threads_per_block(), q.dev.max_block_size(), + grid_size); +} + +std::vector +kernel::required_block_size() const { return { 0, 0, 0 }; } diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp index 4bcc3c76891..f42e199fb13 100644 --- a/src/gallium/state_trackers/clover/core/kernel.hpp +++ b/src/gallium/state_trackers/clover/core/kernel.hpp @@ -121,7 +121,12 @@ namespace clover { size_t mem_private() const; const std::string &name() const; - std::vector block_size() const; + + std::vector + optimal_block_size(const command_queue &q, + const std::vector &grid_size) const; + std::vector + required_block_size() const; argument_range args(); const_argument_range args() const; diff --git a/src/gallium/state_trackers/clover/util/factor.hpp b/src/gallium/state_trackers/clover/util/factor.hpp new file mode 100644 index 00000000000..76d3bfe343f --- /dev/null +++ b/src/gallium/state_trackers/clover/util/factor.hpp @@ -0,0 +1,131 @@ +// +// Copyright 2013 Francisco Jerez +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// + +#ifndef CLOVER_UTIL_FACTOR_HPP +#define CLOVER_UTIL_FACTOR_HPP + +#include "util/range.hpp" + +namespace clover { + namespace factor { + /// + /// Calculate all prime integer factors of \p x. + /// + /// If \p limit is non-zero, terminate early as soon as enough + /// factors have been collected to reach the product \p limit. + /// + template + std::vector + find_integer_prime_factors(T x, T limit = 0) + { + const T max_d = (limit > 0 && limit < x ? limit : x); + const T min_x = x / max_d; + std::vector factors; + + for (T d = 2; d <= max_d && x > min_x; d++) { + if (x % d == 0) { + for (; x % d == 0; x /= d); + factors.push_back(d); + } + } + + return factors; + } + + namespace detail { + /// + /// Walk the power set of prime factors of the n-dimensional + /// integer array \p grid subject to the constraints given by + /// \p limits. + /// + template + std::pair> + next_grid_factor(const std::pair> &limits, + const std::vector &grid, + const std::vector> &factors, + std::pair> block, + unsigned d = 0, unsigned i = 0) { + if (d >= factors.size()) { + // We're done. + return {}; + + } else if (i >= factors[d].size()) { + // We're done with this grid dimension, try the next. + return next_grid_factor(limits, grid, factors, + std::move(block), d + 1, 0); + + } else { + T f = factors[d][i]; + + // Try the next power of this factor. + block.first *= f; + block.second[d] *= f; + + if (block.first <= limits.first && + block.second[d] <= limits.second[d] && + grid[d] % block.second[d] == 0) { + // We've found a valid grid divisor. + return block; + + } else { + // Overflow, back off to the zeroth power, + while (block.second[d] % f == 0) { + block.second[d] /= f; + block.first /= f; + } + + // ...and carry to the next factor. + return next_grid_factor(limits, grid, factors, + std::move(block), d, i + 1); + } + } + } + } + + /// + /// Find the divisor of the integer array \p grid that gives the + /// highest possible product not greater than \p product_limit + /// subject to the constraints given by \p coord_limit. + /// + template + std::vector + find_grid_optimal_factor(T product_limit, + const std::vector &coord_limit, + const std::vector &grid) { + const std::vector> factors = + map(find_integer_prime_factors, grid, coord_limit); + const auto limits = std::make_pair(product_limit, coord_limit); + auto best = std::make_pair(T(1), std::vector(grid.size(), T(1))); + + for (auto block = best; + block.first != 0 && best.first != product_limit; + block = detail::next_grid_factor(limits, grid, factors, block)) { + if (block.first > best.first) + best = block; + } + + return best.second; + } + } +} + +#endif -- 2.30.2