// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialisation of the reduction operation for the SYCL
 *  backend.
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template <typename CoeffReturnType, typename KernelName>
struct syclGenericBufferReducer {
  template <typename BufferTOut, typename BufferTIn>
  static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local) {
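    /* This routine performs a tree reduction entirely on the device: each
     * pass of the loop below launches one kernel that reduces `length`
     * elements down to `length / local` per-work-group partial sums, written
     * back into bufI, and the loop repeats on the shrunken buffer until a
     * single value remains. An illustrative trace (not from the source):
     * length = 65536 with local = 256 finishes in two passes,
     * 65536 -> 256 -> 1. */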
    do {
      auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
        cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                                cl::sycl::range<1>{std::min(length, local)}};
        /* Two accessors are used: one to the buffer that is being reduced,
         * and a second to local memory, used to store intermediate data. */
        auto aI =
            bufI.template get_access<cl::sycl::access::mode::read_write>(h);
        auto aOut =
            bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
        cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                           cl::sycl::access::target::local>
            scratch(cl::sycl::range<1>(local), h);

        /* The parallel_for invocation chosen is the variant with an nd_item
         * parameter, since the code requires barriers for correctness. */
        h.parallel_for<KernelName>(
            r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
              size_t globalid = id.get_global(0);
              size_t localid = id.get_local(0);
              /* All work-items collectively read from global memory into
               * local memory. The barrier ensures all work-items' IO is
               * resolved before execution continues (strictly speaking, all
               * work-items within a single work-group - there is no
               * co-ordination between work-groups, only between work-items). */
              if (globalid < length) {
                scratch[localid] = aI[globalid];
              }
              id.barrier(cl::sycl::access::fence_space::local_space);

              /* Apply the reduction operation between the current local
               * id and the one on the other half of the vector. */
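              /* An illustrative trace (not from the source): a work-group of
               * 8 items holding {1,2,3,4,5,6,7,8} uses offsets 4, 2, 1:
               * {6,8,10,12,...} -> {16,20,...} -> {36,...}, so scratch[0]
               * ends up holding the work-group's sum of 36. */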
              if (globalid < length) {
                size_t min = (length < local) ? length : local;
                for (size_t offset = min / 2; offset > 0; offset /= 2) {
                  if (localid < offset) {
                    scratch[localid] += scratch[localid + offset];
                  }
                  id.barrier(cl::sycl::access::fence_space::local_space);
                }
                /* The final result will be stored in local id 0. */
                if (localid == 0) {
                  aI[id.get_group(0)] = scratch[localid];
                  if ((length <= local) && globalid == 0) {
                    aOut[globalid] = scratch[localid];
                  }
                }
              }
            });
      };
      dev.m_queue.submit(f);
      dev.m_queue.throw_asynchronous();

      /* At this point, you could queue::wait_and_throw() to ensure that
       * errors are caught quickly. However, this would likely impact
       * performance negatively. */
      length = length / local;

    } while (length > 1);
  }
};

/// For now, let's start with a full reducer.
/// Self is not used here because during expression construction we treat the reduction as a leaf node:
/// we take the child of the reduction, build an expression from it, and apply the full reducer function to it.
/// FullReducer applies the reduction operation to the child of the reduction; once that is done, the reduction
/// itself is an empty shell and can be discarded and treated as a leaf node.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; /// initial reduction factor; if the input is smaller than red_factor we create only one work-item
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // the initial number of work-items: the input size divided by the reduction factor
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
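    // An illustrative trace (not from the source): inputSize = 1000 gives
    // rng = 1000 / 256 = 3 and remaining = 232; three work-items each reduce
    // 256 consecutive inputs, and work-item 0 later folds in the 232 leftovers.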
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max(static_cast<size_t>(1), rng);

    // round the global range up to the next power of two for the recursive reduction
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
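    // The decrement/smear/increment sequence above is the classic
    // round-up-to-power-of-two bit trick: the shifts set every bit below the
    // highest set bit, so the final increment carries up to the next power of
    // two, while the initial decrement keeps exact powers of two unchanged.
    // An illustrative trace (not from the source): GRange = 100 -> 99
    // (0b1100011) -> 127 (0b1111111) -> 128.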
    size_t outTileSize = tileSize;
    /// If GRange is smaller than the work-group size, shrink the work-group size to GRange; in that case a single
    /// kernel pass of the recursion suffices to reduce everything down to one value.
    if (GRange < outTileSize) outTileSize = GRange;
    // getting the final output buffer; the created buffer wraps the output pointer directly, so there is no need
    // for an assign
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    /// Create the temporary global buffer used by the reduction. It collects the per-work-group partial results,
    /// since we do not have a global barrier on the GPU; once they are saved we can recursively apply the
    /// reduction to this buffer in order to reduce the whole input.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// A reduction cannot be captured automatically by our device conversion recursion, because a reduction
        /// has two behaviours: the first is when it acts as the root that launches the sub-kernel; the second is
        /// when it is treated as a leaf node that passes its computed result up to its parent kernel. The latter
        /// is detected automatically by our device expression generator; the former is created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is exactly like the `self` that was passed to run(),
        /// except that this evaluator is detectable and recognisable on the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// the const_cast below is a naive workaround for the qualifier-drop error
        auto globalid = itemID.get_global_linear_id();

        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

        if (remaining != 0 && globalid == 0)
          // fold in the leftover elements when the input size is not divisible by red_factor
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    /// recursively reduce the GRange partial results in the temporary buffer down to a single element
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }

};

template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      // round GRange up to the next multiple of tileSize so that the nd_range is valid
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
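    // An illustrative trace (not from the source): num_coeffs_to_preserve = 1000
    // with tileSize = 256 gives xMode = 1000 % 256 = 232, so GRange becomes
    // 1000 + (256 - 232) = 1024, i.e. four full work-groups; the guard inside
    // the kernel keeps the 24 excess work-items from writing.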
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler& cgh) {
      // create a tuple of accessors from the evaluator
      auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      // accessor over the final output; discard_write, since every preserved coefficient is overwritten
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

      cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// A reduction cannot be captured automatically by our device conversion recursion, because a reduction
        /// has two behaviours: the first is when it acts as the root that launches the sub-kernel; the second is
        /// when it is treated as a leaf node that passes its computed result up to its parent kernel. The latter
        /// is detected automatically by our device expression generator; the former is created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is exactly like the `self` that was passed to run(),
        /// except that this evaluator is detectable and recognisable on the device.
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = DeviceSelf(device_self_expr, Eigen::DefaultDevice());
        /// the const_cast below is a naive workaround for the qualifier-drop error
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};

} // end namespace internal
} // namespace Eigen

#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP