commit 185312a43b7666353e43fb14f8e13c49b3c38122
parent 6930b5c8dd2215a5b32eae85fe1a6c0457431df0
Author: Vincent Forest <vincent.forest@meso-star.com>
Date: Wed, 24 Aug 2022 10:35:20 +0200
Rewrite the build_octrees parallelisation
We need to synchronize construction threads when a batch of voxels has
been fully consumed. In the previous implementation, it was impossible
to add a barrier in the parallel loop. In this commit, we rewrite the
build_octrees function to use a parallel block instead of a parallel
loop.
Diffstat:
| M | src/rnatm_octree.c | | | 100 | ++++++++++++++++++++++++++++++++++++++++++++++--------------------------------- |
1 file changed, 58 insertions(+), 42 deletions(-)
diff --git a/src/rnatm_octree.c b/src/rnatm_octree.c
@@ -804,15 +804,18 @@ voxelize_atmosphere
batch_args.accel_structs = accel_structs + item_range[0];
batch_args.batch_size = item_range[1] - item_range[0];
- /* Ensure that the build thread has finished to consume the previous batch */
+ /* Wait for the building thread to finish consuming the previous batch */
mutex_lock(sync->mutex);
if(sync->ibatch != ibatch) {
ASSERT(sync->ibatch == ibatch - 1);
cond_wait(sync->cond, sync->mutex);
- ASSERT(sync->ibatch == ibatch);
+ /* An error occurred in the building thread */
+ if(sync->ibatch != ibatch) res = RES_BAD_ARG;
}
mutex_unlock(sync->mutex);
+ if(res != RES_OK) goto error;
+ /* Generate the voxels of the current batch */
res = voxelize_batch(atm, &batch_args);
if(res != RES_OK) goto error;
}
@@ -946,12 +949,12 @@ build_octrees
struct build_sync* sync)
{
struct svx_device* svx = NULL;
+ struct accel_struct* accel_structs = NULL;
double low[3], upp[3];
size_t def[3];
size_t istruct;
size_t naccel_structs;
size_t voxel_width;
- ATOMIC nbuilt_structs = 0;
ATOMIC res = RES_OK;
ASSERT(atm && args && pool);
@@ -969,56 +972,69 @@ build_octrees
def[1] = (size_t)atm->grid_definition[1];
def[2] = (size_t)atm->grid_definition[2];
+ accel_structs = darray_accel_struct_data_get(&atm->accel_structs);
naccel_structs = darray_accel_struct_size_get(&atm->accel_structs);
voxel_width = pool_get_voxel_width(pool);
- /* Build the octrees. Each thread consumes an element of a partition. So, we
- * set the number of threads to the voxel width */
- omp_set_num_threads((int)voxel_width);
- #pragma omp parallel for schedule(static, 1/*chunk size*/)
- for(istruct = 0; istruct < naccel_structs; ++istruct) {
- struct build_octree_context ctx = BUILD_OCTREE_CONTEXT_NULL;
- struct svx_voxel_desc vx_desc = SVX_VOXEL_DESC_NULL;
- struct svx_tree* octree = NULL;
- res_T res_local = RES_OK;
-
- if(ATOMIC_GET(&res) != RES_OK) continue;
-
- /* Setup the build context */
- ctx.pool = pool;
- ctx.part = NULL;
- ctx.iitem = istruct % voxel_width;
- ctx.tau_threshold = args->optical_thickness;
-
- /* Setup the voxel descriptor */
- vx_desc.get = vx_get;
- vx_desc.merge = vx_merge;
- vx_desc.challenge_merge = vx_challenge_merge;
- vx_desc.context = &ctx;
- vx_desc.size = NFLOATS_PER_VOXEL * sizeof(float);
-
- res_local = svx_octree_create(svx, low, upp, def, &vx_desc, &octree);
- if(ctx.part) partition_free(ctx.part);
- if(res_local != RES_OK) { ATOMIC_SET(&res, res_local); continue; };
-
- if((size_t)ATOMIC_INCR(&nbuilt_structs) % voxel_width == 0) {
- /* Notify the thread voxelizing the atmospheric meshes that we are
- * consuming for the next batch */
- mutex_lock(sync->mutex);
- sync->ibatch += 1;
- mutex_unlock(sync->mutex);
- cond_signal(sync->cond);
+ /* Build the octrees. Each thread consumes an element of the voxels generated
+ * by the voxelization thread, each element corresponding to the voxel of an
+ * octree to be constructed. By fixing the number of threads to the width of
+ * the voxel, we therefore build `voxel_width' octrees in parallel from a
+ * single voxelization of the atmospheric meshes */
+ for(istruct = 0; istruct < naccel_structs; istruct += voxel_width) {
+ const size_t nthreads = MMIN(voxel_width, naccel_structs - istruct);
+ omp_set_num_threads((int)nthreads);
+
+ /* Note that we are using a parallel block rather than a parallel loop in
+ * order to add an implicit barrier after a batch has been fully consumed.
+ * This is necessary to prevent a thread from consuming voxels from the
+ * previous batch */
+ #pragma omp parallel
+ {
+ struct build_octree_context ctx = BUILD_OCTREE_CONTEXT_NULL;
+ struct svx_voxel_desc vx_desc = SVX_VOXEL_DESC_NULL;
+ struct svx_tree* octree = NULL;
+ const int ithread = omp_get_thread_num();
+ const size_t istruct_curr = (size_t)ithread + istruct;
+ res_T res_local = RES_OK;
+
+ /* Setup the build context */
+ ctx.pool = pool;
+ ctx.part = NULL;
+ ctx.iitem = (size_t)ithread;
+ ctx.tau_threshold = args->optical_thickness;
+
+ /* Setup the voxel descriptor */
+ vx_desc.get = vx_get;
+ vx_desc.merge = vx_merge;
+ vx_desc.challenge_merge = vx_challenge_merge;
+ vx_desc.context = &ctx;
+ vx_desc.size = NFLOATS_PER_VOXEL * sizeof(float);
+
+ res_local = svx_octree_create(svx, low, upp, def, &vx_desc, &octree);
+ if(ctx.part) partition_free(ctx.part);
+ if(res_local != RES_OK) {
+ ATOMIC_SET(&res, res_local);
+ } else { /* Register the built octree */
+ accel_structs[istruct_curr].octree = octree;
+ }
}
+ if(res != RES_OK) goto error;
- /* Register the built octree */
- darray_accel_struct_data_get(&atm->accel_structs)[istruct].octree = octree;
+ /* Signal the voxelization thread to generate the next batch */
+ mutex_lock(sync->mutex);
+ sync->ibatch += 1;
+ mutex_unlock(sync->mutex);
+ cond_signal(sync->cond);
}
- if(res != RES_OK) goto error;
exit:
if(svx) SVX(device_ref_put(svx));
return (res_T)res;
error:
+ /* Signal to the voxelization thread that there is no need to wait for the
+ * build thread */
+ cond_signal(sync->cond);
darray_accel_struct_clear(&atm->accel_structs);
goto exit;
}