rnatm

Load and structure data describing an atmosphere
git clone git://git.meso-star.fr/rnatm.git

commit 185312a43b7666353e43fb14f8e13c49b3c38122
parent 6930b5c8dd2215a5b32eae85fe1a6c0457431df0
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Wed, 24 Aug 2022 10:35:20 +0200

Rewrite the build_octree parallelisation

We need to synchronize the construction threads once a batch of voxels has
been fully consumed. In the previous implementation, it was impossible to
add a barrier inside the parallel loop. This commit therefore rewrites the
build_octrees function to use a parallel block instead of a parallel loop.
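
The heart of the change: a `#pragma omp parallel' block ends with an implicit
barrier, whereas the former `parallel for' with schedule(static, 1) offered no
point at which a whole batch is known to be consumed. The program below is a
minimal standalone sketch of that pattern, not rnatm code; consume_item,
BATCH_SIZE and NITEMS are hypothetical names introduced for illustration.

/* Sketch of per-batch consumption with an implicit barrier.
 * Build with: cc -fopenmp batch_demo.c */
#include <omp.h>
#include <stdio.h>

#define BATCH_SIZE 4
#define NITEMS 10

static void consume_item(const int* batch, int i)
{
  printf("thread %d consumes item %d\n", omp_get_thread_num(), batch[i]);
}

int main(void)
{
  int items[NITEMS];
  int i;
  for(i = 0; i < NITEMS; ++i) items[i] = i;

  for(i = 0; i < NITEMS; i += BATCH_SIZE) {
    const int n = NITEMS - i < BATCH_SIZE ? NITEMS - i : BATCH_SIZE;
    omp_set_num_threads(n);

    /* Parallel block rather than parallel loop: each thread consumes exactly
     * one item, and the implicit barrier at the end of the block guarantees
     * that the whole batch is done before the code below runs */
    #pragma omp parallel
    consume_item(items + i, omp_get_thread_num());

    /* Safe point: this is where the real code bumps sync->ibatch and signals
     * the voxelization thread to generate the next batch */
    printf("batch starting at item %d fully consumed\n", i);
  }
  return 0;
}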

Diffstat:
M src/rnatm_octree.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------
1 file changed, 58 insertions(+), 42 deletions(-)

diff --git a/src/rnatm_octree.c b/src/rnatm_octree.c
@@ -804,15 +804,18 @@ voxelize_atmosphere
     batch_args.accel_structs = accel_structs + item_range[0];
     batch_args.batch_size = item_range[1] - item_range[0];
 
-    /* Ensure that the build thread has finished to consume the previous batch */
+    /* Wait for the building thread to finish consuming the previous batch */
     mutex_lock(sync->mutex);
     if(sync->ibatch != ibatch) {
       ASSERT(sync->ibatch == ibatch - 1);
       cond_wait(sync->cond, sync->mutex);
-      ASSERT(sync->ibatch == ibatch);
+      /* An error occured in the building thread */
+      if(sync->ibatch != ibatch) res = RES_BAD_ARG;
     }
     mutex_unlock(sync->mutex);
+    if(res != RES_OK) goto error;
+
     /* Generate the voxels of the current batch */
     res = voxelize_batch(atm, &batch_args);
     if(res != RES_OK) goto error;
   }
@@ -946,12 +949,12 @@ build_octrees
   struct build_sync* sync)
 {
   struct svx_device* svx = NULL;
+  struct accel_struct* accel_structs = NULL;
   double low[3], upp[3];
   size_t def[3];
   size_t istruct;
   size_t naccel_structs;
   size_t voxel_width;
-  ATOMIC nbuilt_structs = 0;
   ATOMIC res = RES_OK;
 
   ASSERT(atm && args && pool);
@@ -969,56 +972,69 @@ build_octrees
   def[1] = (size_t)atm->grid_definition[1];
   def[2] = (size_t)atm->grid_definition[2];
 
+  accel_structs = darray_accel_struct_data_get(&atm->accel_structs);
   naccel_structs = darray_accel_struct_size_get(&atm->accel_structs);
   voxel_width = pool_get_voxel_width(pool);
 
-  /* Build the octrees. Each thread consumes an element of a partition. So, we
-   * set the number of threads to the voxel width */
-  omp_set_num_threads((int)voxel_width);
-  #pragma omp parallel for schedule(static, 1/*chunk size*/)
-  for(istruct = 0; istruct < naccel_structs; ++istruct) {
-    struct build_octree_context ctx = BUILD_OCTREE_CONTEXT_NULL;
-    struct svx_voxel_desc vx_desc = SVX_VOXEL_DESC_NULL;
-    struct svx_tree* octree = NULL;
-    res_T res_local = RES_OK;
-
-    if(ATOMIC_GET(&res) != RES_OK) continue;
-
-    /* Setup the build context */
-    ctx.pool = pool;
-    ctx.part = NULL;
-    ctx.iitem = istruct % voxel_width;
-    ctx.tau_threshold = args->optical_thickness;
-
-    /* Setup the voxel descriptor */
-    vx_desc.get = vx_get;
-    vx_desc.merge = vx_merge;
-    vx_desc.challenge_merge = vx_challenge_merge;
-    vx_desc.context = &ctx;
-    vx_desc.size = NFLOATS_PER_VOXEL * sizeof(float);
-
-    res_local = svx_octree_create(svx, low, upp, def, &vx_desc, &octree);
-    if(ctx.part) partition_free(ctx.part);
-    if(res_local != RES_OK) { ATOMIC_SET(&res, res_local); continue; };
-
-    if((size_t)ATOMIC_INCR(&nbuilt_structs) % voxel_width == 0) {
-      /* Notify the thread voxelizing the atmospheric meshes that we are
-       * consuming for the next batch */
-      mutex_lock(sync->mutex);
-      sync->ibatch += 1;
-      mutex_unlock(sync->mutex);
-      cond_signal(sync->cond);
+  /* Build the octrees. Each thread consumes an element of the voxels generated
+   * by the voxelization thread, each element corresponding to the voxel of an
+   * octree to be constructed. By fixing the number of threads to the width of
+   * the voxel, we therefore build `voxel_width' octrees in parallel from a
+   * single voxelization of the atmospheric meshes */
+  for(istruct = 0; istruct < naccel_structs; istruct += voxel_width) {
+    const size_t nthreads = MMIN(voxel_width, naccel_structs - istruct);
+    omp_set_num_threads((int)nthreads);
+
+    /* Note that we are using a parallel block rather than a parallel loop in
+     * order to add an implicit barrier after a batch has been fully consumed.
+     * This is necessary to prevent a thread from consuming voxels from the
+     * previous batch */
+    #pragma omp parallel
+    {
+      struct build_octree_context ctx = BUILD_OCTREE_CONTEXT_NULL;
+      struct svx_voxel_desc vx_desc = SVX_VOXEL_DESC_NULL;
+      struct svx_tree* octree = NULL;
+      const int ithread = omp_get_thread_num();
+      const size_t istruct_curr = (size_t)ithread + istruct;
+      res_T res_local = RES_OK;
+
+      /* Setup the build context */
+      ctx.pool = pool;
+      ctx.part = NULL;
+      ctx.iitem = (size_t)ithread;
+      ctx.tau_threshold = args->optical_thickness;
+
+      /* Setup the voxel descriptor */
+      vx_desc.get = vx_get;
+      vx_desc.merge = vx_merge;
+      vx_desc.challenge_merge = vx_challenge_merge;
+      vx_desc.context = &ctx;
+      vx_desc.size = NFLOATS_PER_VOXEL * sizeof(float);
+
+      res_local = svx_octree_create(svx, low, upp, def, &vx_desc, &octree);
+      if(ctx.part) partition_free(ctx.part);
+      if(res_local != RES_OK) {
+        ATOMIC_SET(&res, res_local);
+      } else { /* Register the built octree */
+        accel_structs[istruct_curr].octree = octree;
+      }
    }
+    if(res != RES_OK) goto error;
 
-    /* Register the built octree */
-    darray_accel_struct_data_get(&atm->accel_structs)[istruct].octree = octree;
+    /* Signal the voxelization thread to generate the next batch */
+    mutex_lock(sync->mutex);
+    sync->ibatch += 1;
+    mutex_unlock(sync->mutex);
+    cond_signal(sync->cond);
   }
-  if(res != RES_OK) goto error;
 
 exit:
   if(svx) SVX(device_ref_put(svx));
   return (res_T)res;
 error:
+  /* Signal to the voxelization thread that there is no need to wait for the
+   * build thread */
+  cond_signal(sync->cond);
   darray_accel_struct_clear(&atm->accel_structs);
   goto exit;
 }
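
For reference, the handshake implemented on the two sides of this diff can be
sketched with POSIX threads as below. This is a simplified illustration under
an assumed build_sync layout, not the actual rnatm wrappers (mutex_lock,
cond_wait, ...). Production code usually re-checks the predicate in a loop to
absorb spurious wakeups; this commit instead treats a wakeup without progress
on ibatch as an error report from the other thread, and the sketch mirrors
that choice.

#include <pthread.h>
#include <stddef.h>

struct build_sync {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  size_t ibatch; /* last batch fully consumed by the build threads */
};

/* Voxelization side: wait until `ibatch' is the current batch. Returns 0 if
 * the build side signalled without bumping ibatch, i.e. on error */
static int wait_batch_consumed(struct build_sync* sync, size_t ibatch)
{
  int ok;
  pthread_mutex_lock(&sync->mutex);
  if(sync->ibatch != ibatch)
    pthread_cond_wait(&sync->cond, &sync->mutex);
  ok = (sync->ibatch == ibatch);
  pthread_mutex_unlock(&sync->mutex);
  return ok;
}

/* Build side: mark the current batch as consumed and wake the voxelizer */
static void notify_batch_consumed(struct build_sync* sync)
{
  pthread_mutex_lock(&sync->mutex);
  sync->ibatch += 1;
  pthread_mutex_unlock(&sync->mutex);
  pthread_cond_signal(&sync->cond);
}

/* Build side, error path: wake the voxelizer without bumping ibatch so that
 * it does not block forever and can detect the failure */
static void notify_build_error(struct build_sync* sync)
{
  pthread_cond_signal(&sync->cond);
}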