commit 8ecf1014665dd83abec7c6ff0395df98926221b3
parent 3a93ad1530eda7d01df6d8d68cd26b185ab12b69
Author: Vincent Forest <vincent.forest@meso-star.com>
Date: Thu, 29 Apr 2021 14:34:43 +0200
Optimise the ssf_phase_rdgfa_setup function with simd<4|8>
Diffstat:
5 files changed, 367 insertions(+), 27 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -13,7 +13,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-cmake_minimum_required(VERSION 2.8)
+cmake_minimum_required(VERSION 3.1)
project(ssf C)
enable_testing()
@@ -27,13 +27,61 @@ set(SSF_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../src)
find_package(RCMake 0.4 REQUIRED)
find_package(RSys 0.6 REQUIRED)
find_package(StarSP 0.5 REQUIRED)
-include_directories(${RSys_INCLUDE_DIR} ${StarSP_INCLUDE_DIR})
+find_package(RSIMD 0.3)
+if(RSIMD_FOUND)
+ option(USE_SIMD "Use SIMD instruction sets" ON)
+endif()
set(CMAKE_MODULE_PATH ${RCMAKE_SOURCE_DIR})
include(rcmake)
include(rcmake_runtime)
-rcmake_append_runtime_dirs(_runtime_dirs RSys StarSP)
+include_directories(${RSys_INCLUDE_DIR} ${StarSP_INCLUDE_DIR})
+
+set(_dependencies RSys StarSP)
+
+if(USE_SIMD)
+ include_directories(${RSIMD_INCLUDE_DIR})
+ set(_dependencies ${_dependencies} RSIMD)
+endif()
+
+rcmake_append_runtime_dirs(_runtime_dirs ${_dependencies})
+
+################################################################################
+# Check SIMD support
+################################################################################
+if(NOT USE_SIMD)
+ message(STATUS "Do not use the SIMD instruction sets")
+else()
+ message(STATUS "Use the SIMD instruction sets")
+
+ option(SSF_USE_SIMD_128 "Enable the SIMD-128 instruction sets" ON)
+ option(SSF_USE_SIMD_256 "Enable the SIMD-256 instruction sets" ON)
+
+ if(CMAKE_COMPILER_IS_GNUCC)
+ include(CheckCCompilerFlag)
+ CHECK_C_COMPILER_FLAG("-msse2" SSE2)
+ CHECK_C_COMPILER_FLAG("-msse4.1" SSE4_1)
+ CHECK_C_COMPILER_FLAG("-mavx" AVX)
+ endif()
+
+ if(SSF_USE_SIMD_128 AND NOT SSE2)
+ get_property(_docstring CACHE SSF_USE_SIMD_128 PROPERTY HELPSTRING)
+ set(SSF_USE_SIMD_128 OFF CACHE BOOL ${_docstring} FORCE)
+ endif()
+ if(SSF_USE_SIMD_256 AND NOT AVX)
+ get_property(_docstring CACHE SSF_USE_SIMD_256 PROPERTY HELPSTRING)
+ set(SSF_USE_SIMD_128 OFF CACHE BOOL ${_docstring} FORCE)
+ endif()
+
+ if(SSF_USE_SIMD_128)
+ set(_simd_size "${_simd_size};SSF_USE_SIMD_128")
+ endif()
+
+ if(SSF_USE_SIMD_256)
+ set(_simd_size "${_simd_size};SSF_USE_SIMD_256")
+ endif()
+endif()
################################################################################
# Define targets
@@ -77,20 +125,16 @@ rcmake_prepend_path(SSF_FILES_INC_API ${SSF_SOURCE_DIR})
rcmake_prepend_path(SSF_FILES_DOC ${PROJECT_SOURCE_DIR}/../)
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_GNUCC)
- set(MATH_LIB m)
+ set(_dependencies ${_dependencies} m)
endif()
-if(BUILD_STATIC)
- add_library(ssf STATIC ${SSF_FILES_INC} ${SSF_FILES_SRC} ${SSF_FILES_INC_API})
- set_target_properties(ssf PROPERTIES COMPILE_DEFINITIONS SSF_STATIC_BUILD)
-else()
- add_library(ssf SHARED ${SSF_FILES_INC} ${SSF_FILES_SRC})
- target_link_libraries(ssf RSys StarSP ${MATH_LIB})
- set_target_properties(ssf PROPERTIES
- DEFINE_SYMBOL SSF_SHARED_BUILD
- VERSION ${VERSION}
- SOVERSION ${VERSION_MAJOR})
-endif()
+add_library(ssf SHARED ${SSF_FILES_INC} ${SSF_FILES_SRC})
+target_link_libraries(ssf ${_dependencies})
+set_target_properties(ssf PROPERTIES
+ DEFINE_SYMBOL SSF_SHARED_BUILD
+ COMPILE_DEFINITIONS "${_simd_size}"
+ VERSION ${VERSION}
+ SOVERSION ${VERSION_MAJOR})
rcmake_setup_devel(ssf StarSF ${VERSION} star/ssf_version.h)
@@ -99,10 +143,18 @@ rcmake_setup_devel(ssf StarSF ${VERSION} star/ssf_version.h)
################################################################################
if(NOT NO_TEST)
- function(new_test _name)
+ function(build_test _name)
add_executable(${_name} ${SSF_SOURCE_DIR}/${_name}.c)
target_link_libraries(${_name} ssf RSys StarSP ${MATH_LIB})
- add_test(${_name} ${_name})
+ endfunction()
+
+ function(register_test _name)
+ add_test(${_name} ${ARGN})
+ endfunction()
+
+ function(new_test _name)
+ build_test(${_name} ${ARGN})
+ register_test(${_name} ${_name})
endfunction()
new_test(test_ssf_beckmann_distribution)
@@ -120,11 +172,19 @@ if(NOT NO_TEST)
new_test(test_ssf_phase)
new_test(test_ssf_phase_hg)
new_test(test_ssf_phase_rayleigh)
- new_test(test_ssf_phase_rdgfa)
new_test(test_ssf_specular_dielectric_dielectric_reflection)
new_test(test_ssf_specular_reflection)
new_test(test_ssf_thin_specular_dielectric)
+ build_test(test_ssf_phase_rdgfa)
+ register_test(test_ssf_phase_rdgfa_simd_none test_ssf_phase_rdgfa simd_none)
+ if(SSF_USE_SIMD_128)
+ register_test(test_ssf_phase_rdgfa_simd_128 test_ssf_phase_rdgfa simd_128)
+ endif()
+ if(SSF_USE_SIMD_256)
+ register_test(test_ssf_phase_rdgfa_simd_256 test_ssf_phase_rdgfa simd_256)
+ endif()
+
rcmake_copy_runtime_libraries(test_ssf_beckmann_distribution)
endif()
diff --git a/src/ssf.h b/src/ssf.h
@@ -57,6 +57,12 @@ enum ssf_bxdf_flag {
SSF_GLOSSY = BIT(4)
};
+enum ssf_simd {
+ SSF_SIMD_NONE,
+ SSF_SIMD_128,
+ SSF_SIMD_256
+};
+
/* Generic BSDF type descriptor. Note that by convention the outgoing direction
* `wo' and the incoming direction `wi' point outward the surface. Furthermore,
* `wo' and the normal N must point on the same side of the surface. As a
@@ -192,10 +198,12 @@ struct ssf_phase_rdgfa_setup_args {
double fractal_dimension; /* No unit */
double gyration_radius; /* In nm */
- /* #intervals used to discretized the angular domain */
+ /* Number of #intervals to use to discretize the angular domain */
size_t nintervals;
+
+ enum ssf_simd simd; /* SIMD instruction sets to use */
};
-#define SSF_PHASE_RDGFA_SETUP_ARGS_DEFAULT__ {0,0,0,1000}
+#define SSF_PHASE_RDGFA_SETUP_ARGS_DEFAULT__ {0,0,0,1000,SSF_SIMD_NONE}
static const struct ssf_phase_rdgfa_setup_args
SSF_PHASE_RDGFA_SETUP_ARGS_DEFAULT = SSF_PHASE_RDGFA_SETUP_ARGS_DEFAULT__;
diff --git a/src/ssf_phase_rdgfa.c b/src/ssf_phase_rdgfa.c
@@ -21,6 +21,26 @@
#include <rsys/algorithm.h>
#include <rsys/dynamic_array_double.h>
+#if defined(SSF_USE_SIMD_128) || defined(SSF_USE_SIMD_256)
+ /* Helper macro */
+ #define USE_SIMD
+#endif
+
+#ifdef USE_SIMD
+ #include <rsimd/math.h>
+ #include <rsimd/rsimd.h>
+
+ /* Generate the struct darray_simdf dynamic array */
+ #define DARRAY_NAME simdf
+ #define DARRAY_DATA float
+ #ifdef SSF_USE_SIMD_256
+ #define DARRAY_ALIGNMENT 64
+ #else
+ #define DARRAY_ALIGNMENT 16
+ #endif
+ #include <rsys/dynamic_array.h>
+#endif
+
#define EXP1 2.7182818284590452354
struct rdgfa {
@@ -30,9 +50,14 @@ struct rdgfa {
double rcp_normalize_factor; /* Reciprocal normalization factor of the CDF */
- /* Discretized cumulative (#entries = nintervals[0] + nintervals[1]) */
+ /* Discretized cumulative (#entries = nintervals) */
struct darray_double cdf;
+#ifdef USE_SIMD
+ struct darray_simdf f_list;
+ struct darray_simdf d_omega_list;
+#endif
+
/* Precomputed values */
double Rg2; /* gyration_radius^2 */
double cst_4pi_div_lambda; /* (4*PI)/wavelength */
@@ -115,7 +140,6 @@ eval2
return phase;
}
-
/* Not normalized */
static FINLINE double
eval(struct rdgfa* rdgfa, const double theta)
@@ -123,7 +147,7 @@ eval(struct rdgfa* rdgfa, const double theta)
return eval2(rdgfa, theta, cos(theta));
}
-static res_T
+static INLINE res_T
compute_cumulative(struct rdgfa* rdgfa)
{
double* cdf = NULL;
@@ -131,6 +155,7 @@ compute_cumulative(struct rdgfa* rdgfa)
double theta1;
size_t i;
res_T res = RES_OK;
+ ASSERT(rdgfa);
/* Allocate the cumulative array */
res = darray_double_resize(&rdgfa->cdf, rdgfa->nintervals);
@@ -172,6 +197,16 @@ error:
goto exit;
}
+/* Generate the simd functions if required */
+#ifdef SSF_USE_SIMD_128
+ #define SIMD_WIDTH__ 4
+ #include "ssf_phase_rdgfa_simdX.h"
+#endif
+#ifdef SSF_USE_SIMD_256
+ #define SIMD_WIDTH__ 8
+ #include "ssf_phase_rdgfa_simdX.h"
+#endif
+
/*******************************************************************************
* Private functions
******************************************************************************/
@@ -182,6 +217,10 @@ rdgfa_init(struct mem_allocator* allocator, void* phase)
ASSERT(phase);
memset(rdgfa, 0, sizeof(*rdgfa));
darray_double_init(allocator, &rdgfa->cdf);
+#ifdef USE_SIMD
+ darray_simdf_init(allocator, &rdgfa->f_list);
+ darray_simdf_init(allocator, &rdgfa->d_omega_list);
+#endif /* USE_SIMD */
return RES_OK;
}
@@ -191,6 +230,10 @@ rdgfa_release(void* phase)
struct rdgfa* rdgfa = phase;
ASSERT(phase);
darray_double_release(&rdgfa->cdf);
+#ifdef USE_SIMD
+ darray_simdf_release(&rdgfa->f_list);
+ darray_simdf_release(&rdgfa->d_omega_list);
+#endif /* USE_SIMD */
}
static double
@@ -325,8 +368,31 @@ ssf_phase_rdgfa_setup
rdgfa->g = pow(1 + 4*k2*rdgfa->Rg2/(3*Df), -Df*0.5);
/* Precompute the phase function cumulative */
- res = compute_cumulative(rdgfa);
- if(res != RES_OK) goto error;
+ switch(args->simd) {
+ case SSF_SIMD_NONE:
+ res = compute_cumulative(rdgfa);
+ if(res != RES_OK) goto error;
+ break;
+ case SSF_SIMD_128:
+ #ifdef SSF_USE_SIMD_128
+ res = compute_cumulative_simd4(rdgfa);
+ if(res != RES_OK) goto error;
+ #else
+ res = RES_BAD_OP;
+ goto error;
+ #endif
+ break;
+ case SSF_SIMD_256:
+ #ifdef SSF_USE_SIMD_256
+ res = compute_cumulative_simd8(rdgfa);
+ if(res != RES_OK) goto error;
+ #else
+ res = RES_BAD_OP;
+ goto error;
+ #endif
+ break;
+ default: FATAL("Unreachable code.\n"); break;
+ }
exit:
return res;
diff --git a/src/ssf_phase_rdgfa_simdX.h b/src/ssf_phase_rdgfa_simdX.h
@@ -0,0 +1,183 @@
+/* Copyright (C) 2016-2018, 2021 |Meso|Star> (contact@meso-star.com)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#if !defined(SIMD_WIDTH__)
+ #error "Undefined SIMD_WIDTH__ macro"
+#endif
+#if SIMD_WIDTH__ != 4 && SIMD_WIDTH__ != 8
+ #error "Unexpected SIMD_WIDTH__ value of "STR(RSIMD_WIDTH__)
+#endif
+
+/* Check that internal macros are not already defined */
+#if defined(vXf__) \
+ || defined(vXf_T__) \
+ || defined(SIMD_FUNC__)
+ #error "Unexpected macro definition"
+#endif
+
+/* Macros generic to the SIMD_WIDTH__ */
+#define vXf_T CONCAT(CONCAT(v, SIMD_WIDTH__), f_T)
+#define vXf(Func) CONCAT(CONCAT(CONCAT(v, SIMD_WIDTH__), f_), Func)
+#define SIMD_FUNC(Func) CONCAT(CONCAT(Func, _simd), SIMD_WIDTH__)
+
+static INLINE vXf_T
+SIMD_FUNC(eval_f)(struct rdgfa* rdgfa, const vXf_T theta)
+{
+ /* Input arguments */
+ const vXf_T Df = vXf(set1)((float)rdgfa->fractal_dimension);
+
+ /* Precompute constants */
+ const vXf_T Rg2 = vXf(set1)((float)rdgfa->Rg2);
+ const vXf_T half_theta = vXf(mul)(theta, vXf(set1)(0.5f));
+
+ /* Precompute values */
+ const vXf_T sin_half_theta = vXf(sin)(half_theta);
+ const vXf_T q = vXf(mul)(vXf(set1)((float)rdgfa->cst_4pi_div_lambda), sin_half_theta);
+ const vXf_T q2Rg2 = vXf(mul)(vXf(mul)(q, q), Rg2);
+
+ /* Evaluate f(theta) when q2Rg2 < 1.5*Df */
+ const vXf_T val0 = vXf(exp)(vXf(mul)(vXf(set1)(-1.f/3.f), q2Rg2));
+
+ /* Evaluate f(theta) when q2Rg2 >= 1.5*Df */
+ const vXf_T tmp0 = vXf(div)(vXf(set1)((float)rdgfa->cst_3Df_div_2E), q2Rg2);
+ const vXf_T half_Df = vXf(mul)(Df, vXf(set1)(0.5f));
+ const vXf_T val1 = vXf(pow)(tmp0, half_Df);
+
+ /* Setup f */
+ const vXf_T mask = vXf(lt)(q2Rg2, vXf(mul)(Df, vXf(set1)(1.5f)));
+ const vXf_T f = vXf(sel)(val1, val0, mask);
+ return f;
+}
+
+static INLINE vXf_T
+SIMD_FUNC(eval2)
+ (struct rdgfa* rdgfa,
+ const vXf_T theta,
+ const vXf_T cos_theta)
+{
+ const vXf_T f = SIMD_FUNC(eval_f)(rdgfa, theta);
+ const vXf_T g = vXf(set1)((float)rdgfa->g);
+ const vXf_T cos2_theta = vXf(mul)(cos_theta, cos_theta);
+ const vXf_T cst0 = vXf(set1)(3.f/(16.f*(float)PI));
+ const vXf_T tmp0 = vXf(div)(f, g);
+ const vXf_T tmp1 = vXf(add)(vXf(set1)(1), cos2_theta);
+ const vXf_T phase = vXf(mul)(vXf(mul)(cst0, tmp0), tmp1);
+ return phase;
+}
+
+static INLINE vXf_T
+SIMD_FUNC(eval)(struct rdgfa* rdgfa, const vXf_T theta)
+{
+ return SIMD_FUNC(eval2)(rdgfa, theta, vXf(cos)(theta));
+}
+
+static INLINE res_T
+SIMD_FUNC(compute_cumulative)(struct rdgfa* rdgfa)
+{
+ vXf_T dtheta;
+ vXf_T theta1;
+ vXf_T step;
+ vXf_T two_PI;
+ float* f_list = NULL;
+ float* d_omega_list = NULL;
+ double* cdf = NULL;
+ size_t nangles;
+ size_t i;
+ res_T res = RES_OK;
+ ASSERT(rdgfa);
+
+ /* Force the number of angles to be a multiple of the SIMD width */
+ nangles = rdgfa->nintervals + 1;
+ nangles = (nangles + SIMD_WIDTH__-1)/ SIMD_WIDTH__ * SIMD_WIDTH__;
+
+ /* Allocate the cumulative array */
+ res = darray_double_resize(&rdgfa->cdf, rdgfa->nintervals);
+ if(res != RES_OK) goto error;
+
+ /* Allocate temporaries arrays */
+ res = darray_simdf_resize(&rdgfa->f_list, nangles);
+ if(res != RES_OK) goto error;
+ res = darray_simdf_resize(&rdgfa->d_omega_list, nangles);
+ if(res != RES_OK) goto error;
+
+ /* Fetch data */
+ cdf = darray_double_data_get(&rdgfa->cdf);
+ f_list = darray_simdf_data_get(&rdgfa->f_list);
+ d_omega_list = darray_simdf_data_get(&rdgfa->d_omega_list);
+
+ /* Compute the angular step for the angular domain */
+ rdgfa->dtheta = PI / (double)rdgfa->nintervals;
+
+ step = vXf(set1)((float)rdgfa->dtheta*(float)SIMD_WIDTH__);
+ dtheta = vXf(set1)((float)rdgfa->dtheta);
+ two_PI = vXf(set1)((float)(2*PI));
+#if SIMD_WIDTH__ == 4
+ theta1 = vXf(mul)(dtheta, vXf(set)(0, 1, 2, 3));
+#elif SIMD_WIDTH__ == 8
+ theta1 = vXf(mul)(dtheta, vXf(set)(0, 1, 2, 3, 4, 5, 6, 7));
+#endif
+
+ /* Compute f and d_omaga */
+ FOR_EACH(i, 0, nangles/SIMD_WIDTH__) {
+ /* Compute f */
+ const vXf_T f = SIMD_FUNC(eval)(rdgfa, theta1);
+
+ /* Compute d_omega */
+ const vXf_T theta2 = vXf(add)(theta1, dtheta);
+ const vXf_T tmp0 = vXf(mul)(vXf(add)(theta1, theta2), vXf(set1)(0.5f));
+ const vXf_T d_omega = vXf(mul)(vXf(mul)(two_PI, vXf(sin)(tmp0)), dtheta);
+
+ /* Store the result */
+ vXf(store)(&f_list[i*SIMD_WIDTH__], f);
+ vXf(store)(&d_omega_list[i*SIMD_WIDTH__], d_omega);
+
+ /* Go to the next angles */
+ theta1 = vXf(add)(theta1, step);
+ }
+
+ /* Compute the (unormalized) cumulative */
+ FOR_EACH(i, 0, rdgfa->nintervals) {
+ const double f1 = f_list[i+0];
+ const double f2 = f_list[i+1];
+ const double d_omega = d_omega_list[i];
+ const double tmp = (f1 + f2) * 0.5 * d_omega;
+ cdf[i] = (i == 0 ? tmp : tmp + cdf[i-1]);
+ }
+
+ /* Save the normalization factor */
+ rdgfa->rcp_normalize_factor = 1.0 / cdf[rdgfa->nintervals-1];
+
+ /* Finally normalize the CDF */
+ FOR_EACH(i, 0, rdgfa->nintervals) {
+ cdf[i] *= rdgfa->rcp_normalize_factor;
+ }
+
+exit:
+ return res;
+error:
+ darray_double_clear(&rdgfa->cdf);
+ darray_simdf_clear(&rdgfa->f_list);
+ darray_simdf_clear(&rdgfa->d_omega_list);
+ goto exit;
+}
+
+/* Undef generic macros */
+#undef vXf_T
+#undef vXf
+#undef SIMD_FUNC
+
+/* Undef parameter */
+#undef SIMD_WIDTH__
+
diff --git a/src/test_ssf_phase_rdgfa.c b/src/test_ssf_phase_rdgfa.c
@@ -16,6 +16,8 @@
#include "ssf.h"
#include "test_ssf_utils.h"
+#include <string.h>
+
int
main(int argc, char** argv)
{
@@ -30,9 +32,26 @@ main(int argc, char** argv)
struct ssp_rng* rng;
double cumulative_prev;
double wo[3];
+ int err = 0;
size_t i;
(void)argc, (void)argv;
+ if(argc <= 1) {
+ fprintf(stderr, "Usage: %s <simd_none|simd_128|simd_256>\n", argv[0]);
+ goto error;
+ }
+
+ if(!strcmp(argv[1], "simd_none")) {
+ args.simd = SSF_SIMD_NONE;
+ } else if(!strcmp(argv[1], "simd_128")) {
+ args.simd = SSF_SIMD_128;
+ } else if(!strcmp(argv[1], "simd_256")) {
+ args.simd = SSF_SIMD_256;
+ } else {
+ fprintf(stderr, "Invalid argument '%s'.\n", argv[1]);
+ goto error;
+ }
+
mem_init_proxy_allocator(&allocator, &mem_default_allocator);
CHK(ssp_rng_create(&allocator, &ssp_rng_mt19937_64, &rng) == RES_OK);
@@ -87,7 +106,7 @@ main(int argc, char** argv)
CHK(eq_eps(pdf, ssf_phase_eval(phase, wo, wi), fabs(pdf*1.e-6)));
CHK(d3_is_normalized(wi));
-#if 0
+#if 1
fprintf(stderr, "v %g %g %g\n", wi[0]*pdf, wi[1]*pdf, wi[2]*pdf);
fprintf(stderr, "p %lu\n", (unsigned long)(i+1));
#endif
@@ -100,6 +119,10 @@ main(int argc, char** argv)
check_memory_allocator(&allocator);
mem_shutdown_proxy_allocator(&allocator);
CHK(mem_allocated_size() == 0);
- return 0;
+exit:
+ return err;
+error:
+ err = -1;
+ goto exit;
}