From 3503d5838ca43ca87921c807da9843ffdbeab768 Mon Sep 17 00:00:00 2001
From: Wolf Vollprecht
Date: Thu, 5 Apr 2018 08:59:57 +0200
Subject: [PATCH 1/3] fix benchmarks for new slice vector syntax

---
 CMakeLists.txt | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3813489..c81c1e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,9 +150,10 @@ endif()
 message("Found xsimd : ${xsimd_INCLUDE_DIRS}\n\n")

 if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/profile_snip.cpp)
-    add_executable(profile_snip
-        src/profile_snip.cpp
-        ${XTENSOR_HEADERS})
+    add_executable(profile_snip
+        EXCLUDE_FROM_ALL
+        src/profile_snip.cpp
+        ${XTENSOR_HEADERS})
 endif()

 add_custom_target(xbenchmark

From 5cc89c1c61197b0e075aaab6da59b28db58e3718 Mon Sep 17 00:00:00 2001
From: Wolf Vollprecht
Date: Thu, 5 Apr 2018 09:01:15 +0200
Subject: [PATCH 2/3] include 15.05 in range

---
 src/benchmark_views.hpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/benchmark_views.hpp b/src/benchmark_views.hpp
index a362ec7..7c2aea5 100644
--- a/src/benchmark_views.hpp
+++ b/src/benchmark_views.hpp
@@ -33,6 +33,10 @@
 #define RANGE 128, 128
 #define MULTIPLIER 8

+#define XTENSOR_VERSION (XTENSOR_VERSION_MAJOR * 10000 \
+                       + XTENSOR_VERSION_MINOR * 100 \
+                       + XTENSOR_VERSION_PATCH)
+
 namespace xt
 {
 	void xtensor_view(benchmark::State& state)
@@ -64,7 +68,11 @@
 		tensor a = random::rand<double>({state.range(0), state.range(0)});
 		tensor b = random::rand<double>({state.range(0), state.range(0)});

-        auto sv = xt::slice_vector(a, range(0, 5), range(0, 5));
+        #if XTENSOR_VERSION > 1505
+        auto sv = xt::slice_vector{range(0, 5), range(0, 5)};
+        #else
+        auto sv = xt::slice_vector(a, {range(0, 5), range(0, 5)});
+        #endif
         auto av = xt::dynamic_view(a, sv);
         auto bv = xt::dynamic_view(b, sv);

@@ -123,7 +131,11 @@
 		tensor a = random::rand<double>({state.range(0)});
 		tensor b = random::rand<double>({state.range(0)});

+        #if XTENSOR_VERSION > 1505
+        auto sv = xt::slice_vector{range(0, state.range(0), 2)};
+        #else
         auto sv = xt::slice_vector(a, range(0, state.range(0), 2));
+        #endif
         auto av = xt::dynamic_view(a, sv);
         auto bv = xt::dynamic_view(b, sv);


From 6bd6b96368b8d87b14b353bebe1a4b7469692824 Mon Sep 17 00:00:00 2001
From: Wolf Vollprecht
Date: Thu, 5 Apr 2018 12:54:52 +0200
Subject: [PATCH 3/3] add manual broadcast loops, remove tabs, fix benchmarks

---
 CMakeLists.txt                 |   2 +-
 src/benchmark_broadcasting.hpp | 118 ++++++++++++---
 src/benchmark_views.hpp        | 260 ++++++++++++++++-----------------
 3 files changed, 225 insertions(+), 155 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c81c1e2..16efc41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" U_CMAKE_BUILD_TYPE)
 if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
     CHECK_CXX_COMPILER_FLAG("-std=c++14" HAS_CPP14_FLAG)
     if (HAS_CPP14_FLAG)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -ffast-math -march=native -std=c++14 -pthread")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Ofast -ffast-math -march=native -std=c++14 -pthread -Wno-narrowing")
     else()
         message(FATAL_ERROR "Unsupported compiler -- xtensor requires C++14 support!")
     endif()
diff --git a/src/benchmark_broadcasting.hpp b/src/benchmark_broadcasting.hpp
index 4aa7f92..448c16d 100644
--- a/src/benchmark_broadcasting.hpp
+++ b/src/benchmark_broadcasting.hpp
@@ -13,6 +13,7 @@
 #include "xtensor/xrandom.hpp"
 #include "xtensor/xtensor.hpp"
#include "xtensor/xarray.hpp" +#include "xtensor/xfixed.hpp" #ifdef HAS_PYTHONIC #include @@ -27,37 +28,106 @@ namespace xt { - void xtensor_broadcasting(benchmark::State& state) - { - using namespace xt; - using allocator = xsimd::aligned_allocator; - using tensor3 = xtensor_container, 3, layout_type::row_major>; - using tensor2 = xtensor_container, 2, layout_type::row_major>; + void xtensor_broadcasting(benchmark::State& state) + { + using namespace xt; + using allocator = xsimd::aligned_allocator; + using tensor3 = xtensor_container, 3, layout_type::row_major>; + using tensor2 = xtensor_container, 2, layout_type::row_major>; - tensor3 a = random::rand({state.range(0), state.range(0), state.range(0)}); - tensor2 b = random::rand({state.range(0), state.range(0)}); + tensor3 a = random::rand({state.range(0), state.range(0), state.range(0)}); + tensor2 b = random::rand({state.range(0), state.range(0)}); for (auto _ : state) - { - tensor3 res(a + b); - benchmark::DoNotOptimize(res.raw_data()); - } - } - BENCHMARK(xtensor_broadcasting)->RangeMultiplier(MULTIPLIER)->Range(RANGE); + { + tensor3 res(a + b); + benchmark::DoNotOptimize(res.raw_data()); + } + } + BENCHMARK(xtensor_broadcasting)->RangeMultiplier(MULTIPLIER)->Range(RANGE); + + void xarray_broadcasting(benchmark::State& state) + { + using namespace xt; + using allocator = xsimd::aligned_allocator; + using tensor3 = xarray_container, layout_type::row_major>; + using tensor2 = xarray_container, layout_type::row_major>; + + tensor3 a = random::rand({state.range(0), state.range(0), state.range(0)}); + tensor2 b = random::rand({state.range(0), state.range(0)}); + + for (auto _ : state) + { + tensor3 res(a + b); + benchmark::DoNotOptimize(res.raw_data()); + } + } + BENCHMARK(xarray_broadcasting)->RangeMultiplier(MULTIPLIER)->Range(RANGE); + + template + void manual_broadcast_xtensorf(benchmark::State& state) + { + auto a = xt::xtensorf>(); + auto b = xt::xtensorf>(); + for (auto _ : state) + { + auto c = xt::xtensorf>(); + for (std::size_t i = 0; i < a.shape()[0]; ++i) + for (std::size_t j = 0; j < a.shape()[1]; ++j) + for (std::size_t k = 0; k < a.shape()[2]; ++k) + c(i, j, k) = a(i, j, k) + b(i, j, k); + benchmark::DoNotOptimize(c.raw_data()); + } + } + BENCHMARK_TEMPLATE(manual_broadcast_xtensorf, 3); + BENCHMARK_TEMPLATE(manual_broadcast_xtensorf, 8); + BENCHMARK_TEMPLATE(manual_broadcast_xtensorf, 64); + + void manual_broadcast_xtensor(benchmark::State& state) + { + auto a = xt::xtensor::from_shape({state.range(0), state.range(0), state.range(0)}); + auto b = xt::xtensor::from_shape({state.range(0), state.range(0)}); + for (auto _ : state) + { + xt::xtensor c = xt::xtensor::from_shape({state.range(0), state.range(0), state.range(0)}); + for (std::size_t i = 0; i < a.shape()[0]; ++i) + for (std::size_t j = 0; j < a.shape()[1]; ++j) + for (std::size_t k = 0; k < a.shape()[2]; ++k) + c(i, j, k) = a(i, j, k) + b(i, j, k); + benchmark::DoNotOptimize(c.raw_data()); + } + } + BENCHMARK(manual_broadcast_xtensor)->RangeMultiplier(MULTIPLIER)->Range(RANGE); + + void manual_broadcast_xarray(benchmark::State& state) + { + auto a = xt::xarray::from_shape({state.range(0), state.range(0), state.range(0)}); + auto b = xt::xarray::from_shape({state.range(0), state.range(0)}); + for (auto _ : state) + { + xt::xarray c = xt::xarray::from_shape({state.range(0), state.range(0), state.range(0)}); + for (std::size_t i = 0; i < a.shape()[0]; ++i) + for (std::size_t j = 0; j < a.shape()[1]; ++j) + for (std::size_t k = 0; k < a.shape()[2]; ++k) + c(i, j, k) = 
+            benchmark::DoNotOptimize(c.raw_data());
+        }
+    }
+    BENCHMARK(manual_broadcast_xarray)->RangeMultiplier(MULTIPLIER)->Range(RANGE);

 #ifdef HAS_PYTHONIC
-	void pythonic_broadcasting(benchmark::State& state)
-	{
-		auto x = pythonic::numpy::random::rand(state.range(0), state.range(0), state.range(0));
-		auto y = pythonic::numpy::random::rand(state.range(0), state.range(0));
+    void pythonic_broadcasting(benchmark::State& state)
+    {
+        auto x = pythonic::numpy::random::rand(state.range(0), state.range(0), state.range(0));
+        auto y = pythonic::numpy::random::rand(state.range(0), state.range(0));

         for (auto _ : state)
-		{
-			pythonic::types::ndarray<double, 3> z = x + y;
-			benchmark::DoNotOptimize(z.fbegin());
-		}
-	}
-	BENCHMARK(pythonic_broadcasting)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+        {
+            pythonic::types::ndarray<double, 3> z = x + y;
+            benchmark::DoNotOptimize(z.fbegin());
+        }
+    }
+    BENCHMARK(pythonic_broadcasting)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
 #endif
 }
diff --git a/src/benchmark_views.hpp b/src/benchmark_views.hpp
index 7c2aea5..ef5c9b7 100644
--- a/src/benchmark_views.hpp
+++ b/src/benchmark_views.hpp
@@ -39,34 +39,34 @@
 namespace xt
 {
-	void xtensor_view(benchmark::State& state)
-	{
-		using namespace xt;
-		using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
-		using tensor = xtensor_container<xt::uvector<double, allocator>, 2, layout_type::row_major>;
+    void xtensor_view(benchmark::State& state)
+    {
+        using namespace xt;
+        using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
+        using tensor = xtensor_container<xt::uvector<double, allocator>, 2, layout_type::row_major>;

-		tensor a = random::rand<double>({state.range(0), state.range(0)});
-		tensor b = random::rand<double>({state.range(0), state.range(0)});
+        tensor a = random::rand<double>({state.range(0), state.range(0)});
+        tensor b = random::rand<double>({state.range(0), state.range(0)});

         auto av = xt::view(a, range(0, 5), range(0, 5));
         auto bv = xt::view(b, range(0, 5), range(0, 5));

-        for (auto _ : state)
-		{
-			tensor res(av + bv);
-			benchmark::DoNotOptimize(res.raw_data());
-		}
-	}
-	BENCHMARK(xtensor_view)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+        for (auto _ : state)
+        {
+            tensor res(av + bv);
+            benchmark::DoNotOptimize(res.raw_data());
+        }
+    }
+    BENCHMARK(xtensor_view)->RangeMultiplier(MULTIPLIER)->Range(RANGE);

-	void xtensor_dynamicview(benchmark::State& state)
-	{
-		using namespace xt;
-		using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
-		using tensor = xtensor_container<xt::uvector<double, allocator>, 2, layout_type::row_major>;
+    void xtensor_dynamicview(benchmark::State& state)
+    {
+        using namespace xt;
+        using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
+        using tensor = xtensor_container<xt::uvector<double, allocator>, 2, layout_type::row_major>;

-		tensor a = random::rand<double>({state.range(0), state.range(0)});
-		tensor b = random::rand<double>({state.range(0), state.range(0)});
+        tensor a = random::rand<double>({state.range(0), state.range(0)});
+        tensor b = random::rand<double>({state.range(0), state.range(0)});

 #if XTENSOR_VERSION > 1505
         auto sv = xt::slice_vector{range(0, 5), range(0, 5)};
@@ -77,59 +77,59 @@ namespace xt
         auto av = xt::dynamic_view(a, sv);
         auto bv = xt::dynamic_view(b, sv);

-        for (auto _ : state)
-		{
-			tensor res(av + bv);
-			benchmark::DoNotOptimize(res.raw_data());
-		}
-	}
-	BENCHMARK(xtensor_dynamicview)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+        for (auto _ : state)
+        {
+            tensor res(av + bv);
+            benchmark::DoNotOptimize(res.raw_data());
+        }
+    }
+    BENCHMARK(xtensor_dynamicview)->RangeMultiplier(MULTIPLIER)->Range(RANGE);

 #ifdef HAS_EIGEN
-	void eigen_view(benchmark::State& state)
-	{
-		using namespace Eigen;
-		MatrixXd a = MatrixXd::Random(state.range(0), state.range(0));
-		MatrixXd b = MatrixXd::Random(state.range(0), state.range(0));
+    void eigen_view(benchmark::State& state)
+    {
+        using namespace Eigen;
+        MatrixXd a = MatrixXd::Random(state.range(0), state.range(0));
+        MatrixXd b = MatrixXd::Random(state.range(0), state.range(0));

         auto av = a.topLeftCorner(5, 5);
         auto bv = b.topLeftCorner(5, 5);

-        for (auto _ : state)
-		{
-			MatrixXd res(5, 5);
-			res.noalias() = av + bv;
-			benchmark::DoNotOptimize(res.data());
-		}
-	}
-	BENCHMARK(eigen_view)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
-
-	void eigen_map(benchmark::State& state)
-	{
-		using namespace Eigen;
-		MatrixXd a = VectorXd::Random(state.range(0));
-		MatrixXd b = VectorXd::Random(state.range(0));
+        for (auto _ : state)
+        {
+            MatrixXd res(5, 5);
+            res.noalias() = av + bv;
+            benchmark::DoNotOptimize(res.data());
+        }
+    }
+    BENCHMARK(eigen_view)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+
+    void eigen_map(benchmark::State& state)
+    {
+        using namespace Eigen;
+        MatrixXd a = VectorXd::Random(state.range(0));
+        MatrixXd b = VectorXd::Random(state.range(0));

         auto av = Map<VectorXd, 0, InnerStride<2>>(a.data(), a.size() / 2);
         auto bv = Map<VectorXd, 0, InnerStride<2>>(b.data(), b.size() / 2);

-        for (auto _ : state)
-		{
-			VectorXd res(av + bv);
-			benchmark::DoNotOptimize(res.data());
-		}
-	}
-	BENCHMARK(eigen_map)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+        for (auto _ : state)
+        {
+            VectorXd res(av + bv);
+            benchmark::DoNotOptimize(res.data());
+        }
+    }
+    BENCHMARK(eigen_map)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
 #endif

-	void xtensor_stride_2(benchmark::State& state)
-	{
-		using namespace xt;
-		using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
-		using tensor = xtensor_container<xt::uvector<double, allocator>, 1, layout_type::row_major>;
+    void xtensor_stride_2(benchmark::State& state)
+    {
+        using namespace xt;
+        using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
+        using tensor = xtensor_container<xt::uvector<double, allocator>, 1, layout_type::row_major>;

-		tensor a = random::rand<double>({state.range(0)});
-		tensor b = random::rand<double>({state.range(0)});
+        tensor a = random::rand<double>({state.range(0)});
+        tensor b = random::rand<double>({state.range(0)});

 #if XTENSOR_VERSION > 1505
         auto sv = xt::slice_vector{range(0, state.range(0), 2)};
@@ -140,75 +140,75 @@ namespace xt
         auto av = xt::dynamic_view(a, sv);
         auto bv = xt::dynamic_view(b, sv);

-        for (auto _ : state)
-		{
-			tensor res(av + bv);
-			benchmark::DoNotOptimize(res.data());
-		}
-	}
-	BENCHMARK(xtensor_stride_2)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
-
-	void xtensor_max_speed(benchmark::State& state)
-	{
-		using namespace xt;
-		using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
-		using tensor = xtensor_container<xt::uvector<double, allocator>, 1, layout_type::row_major>;
-
-		tensor a = random::rand<double>({state.range(0) / 2});
-		tensor b = random::rand<double>({state.range(0) / 2});
-
-		for (auto _ : state)
-		{
-			tensor res(a + b);
-			benchmark::DoNotOptimize(res.data());
-		}
-	}
-	BENCHMARK(xtensor_max_speed)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
-
-	void xtensor_adapt_view(benchmark::State& state)
-	{
-		using namespace xt;
-		using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
-		using tensor = xtensor_container<xt::uvector<double, allocator>, 1, layout_type::row_major>;
-
-		tensor a = random::rand<double>({state.range(0)});
-		tensor b = random::rand<double>({state.range(0)});
-		std::size_t range_arg = static_cast<std::size_t>(state.range(0));
-		std::array<std::size_t, 1> shape = {range_arg / 2};
-		std::array<std::size_t, 1> stride = {2};
-		auto av = xt::adapt(std::move(a.data()), shape, stride);
-		auto bv = xt::adapt(std::move(b.data()), shape, stride);
-
-		for (auto _ : state)
-		{
-			tensor res(av + bv);
-			benchmark::DoNotOptimize(res.data());
-		}
-	}
-	BENCHMARK(xtensor_adapt_view)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
-
-	void xtensor_hand_loop(benchmark::State& state)
-	{
-		using namespace xt;
-		using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
-		using tensor = xtensor_container<xt::uvector<double, allocator>, 1, layout_type::row_major>;
-
-		tensor a = random::rand<double>({state.range(0)});
-		tensor b = random::rand<double>({state.range(0)});
-		std::array<std::size_t, 1> shape = {static_cast<std::size_t>(state.range(0)) / 2};
-		for (auto _ : state)
-		{
-			tensor res(shape);
-			std::size_t j = 0;
-			for (std::size_t i = 0; i < state.range(0); i += 2)
-			{
-				res(j) = a(i) + b(i);
-				++j;
-			}
-			benchmark::DoNotOptimize(res.data());
-		}
-	}
-	BENCHMARK(xtensor_hand_loop)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+        for (auto _ : state)
+        {
+            tensor res(av + bv);
+            benchmark::DoNotOptimize(res.data());
+        }
+    }
+    BENCHMARK(xtensor_stride_2)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+
+    void xtensor_max_speed(benchmark::State& state)
+    {
+        using namespace xt;
+        using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
+        using tensor = xtensor_container<xt::uvector<double, allocator>, 1, layout_type::row_major>;
+
+        tensor a = random::rand<double>({state.range(0) / 2});
+        tensor b = random::rand<double>({state.range(0) / 2});
+
+        for (auto _ : state)
+        {
+            tensor res(a + b);
+            benchmark::DoNotOptimize(res.data());
+        }
+    }
+    BENCHMARK(xtensor_max_speed)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+
+    void xtensor_adapt_view(benchmark::State& state)
+    {
+        using namespace xt;
+        using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
+        using tensor = xtensor_container<xt::uvector<double, allocator>, 1, layout_type::row_major>;
+
+        tensor a = random::rand<double>({state.range(0)});
+        tensor b = random::rand<double>({state.range(0)});
+        std::size_t range_arg = static_cast<std::size_t>(state.range(0));
+        std::array<std::size_t, 1> shape = {range_arg / 2};
+        std::array<std::size_t, 1> stride = {2};
+        auto av = xt::adapt(std::move(a.data()), shape, stride);
+        auto bv = xt::adapt(std::move(b.data()), shape, stride);
+
+        for (auto _ : state)
+        {
+            tensor res(av + bv);
+            benchmark::DoNotOptimize(res.data());
+        }
+    }
+    BENCHMARK(xtensor_adapt_view)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
+
+    void xtensor_hand_loop(benchmark::State& state)
+    {
+        using namespace xt;
+        using allocator = xsimd::aligned_allocator<double, XSIMD_DEFAULT_ALIGNMENT>;
+        using tensor = xtensor_container<xt::uvector<double, allocator>, 1, layout_type::row_major>;
+
+        tensor a = random::rand<double>({state.range(0)});
+        tensor b = random::rand<double>({state.range(0)});
+        std::array<std::size_t, 1> shape = {static_cast<std::size_t>(state.range(0)) / 2};
+        for (auto _ : state)
+        {
+            tensor res(shape);
+            std::size_t j = 0;
+            for (std::size_t i = 0; i < state.range(0); i += 2)
+            {
+                res(j) = a(i) + b(i);
+                ++j;
+            }
+            benchmark::DoNotOptimize(res.data());
+        }
+    }
+    BENCHMARK(xtensor_hand_loop)->RangeMultiplier(MULTIPLIER)->Range(RANGE);
 }