Improve work stealing strategy - htrdr - Solving radiative transfer in heterogeneous media

commit de9190697bfec46def5015569d0a2e9b29154ae1
parent 10fe56bef42a403625762f1ff1045cdbbd36b2a6
Author: Vincent Forest <vincent.forest@meso-star.com>
Date:   Wed, 31 Oct 2018 10:50:32 +0100

Improve work stealing strategy

Previously, the draw function was waiting for the completion of the
rendering threads and then submitted a steal request to the concurrent
processes. Some rendering threads were thus waiting for the completion
of the other rendering threads without performing any computation.

In this commit a thread with no more work does not wait anymore and
directly try to steal tiles to concurrent processes.

Diffstat:
M src/htrdr_draw_radiance_sw.c  | 144 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------

1 file changed, 76 insertions(+), 68 deletions(-)
diff --git a/src/htrdr_draw_radiance_sw.c b/src/htrdr_draw_radiance_sw.c
@@ -242,7 +242,7 @@ mpi_steal_work
   size_t nthieves = 0;
   uint32_t tiles[UINT8_MAX]; /* Morton code of the stolen tile */
   int proc_to_steal; /* Process to steal */
-  uint8_t ntiles_to_steal = (uint8_t)(htrdr->nthreads*2);
+  uint8_t ntiles_to_steal = MMIN((uint8_t)(htrdr->nthreads*2), 16);
   ASSERT(htrdr && rng && work && htrdr->nthreads < UINT8_MAX);
 
   /* Protect MPI calls of multiple invocations from concurrent threads */
@@ -294,6 +294,11 @@ mpi_gather_buffer
   res_T res = RES_OK;
   ASSERT(htrdr && buf);
 
+  /* The process is alone. There is nothing to gather since all work was done
+   * by it */
+  if(htrdr->mpi_nprocs == 1)
+    goto exit;
+
   /* Fetch the memory layout of the submitted buffer */
   htrdr_buffer_get_layout(buf, &layout);
   ASSERT(layout.elmt_size == sizeof(struct htrdr_accum) * 3/*#channels*/);
@@ -474,86 +479,89 @@ draw_image
   pix_sz[0] = 1.0 / (double)layout.width;
   pix_sz[1] = 1.0 / (double)layout.height;
 
-  nthreads = htrdr->nthreads;
   proc_ntiles = proc_work_get_ntiles(work);
+  nthreads = MMIN(htrdr->nthreads, proc_ntiles);
 
   /* The process is not considered as a working process for himself */
   htrdr->mpi_working_procs[htrdr->mpi_rank] = 0;
   --htrdr->mpi_nworking_procs;
 
-  do {
-    omp_set_num_threads((int)nthreads);
-
-    #pragma omp parallel
-    for(;;) {
-      const int ithread = omp_get_thread_num();
-      struct ssp_rng_proxy* rng_proxy = NULL;
-      struct ssp_rng* rng;
-      uint32_t mcode;
-      size_t tile_org[2];
-      size_t tile_sz[2];
-      size_t n;
-      res_T res_local = RES_OK;
-      int32_t pcent;
-
-      if(ATOMIC_GET(&res) != RES_OK) continue;
-
+  omp_set_num_threads((int)nthreads);
+  #pragma omp parallel
+  for(;;) {
+    const int ithread = omp_get_thread_num();
+    struct ssp_rng_proxy* rng_proxy = NULL;
+    struct ssp_rng* rng;
+    uint32_t mcode;
+    size_t tile_org[2];
+    size_t tile_sz[2];
+    size_t n;
+    res_T res_local = RES_OK;
+    int32_t pcent;
+
+    /* Get a tile to draw */
+    #pragma omp critical 
+    { 
       mcode = proc_work_get_tile(work);
-      if(mcode == TILE_MCODE_NULL) break;
-
-      /* Decode the morton code to retrieve the tile index  */
-      tile_org[0] = morton2D_decode((uint32_t)(mcode>>0));
-      tile_org[1] = morton2D_decode((uint32_t)(mcode>>1));
-      ASSERT(tile_org[0] < ntiles_x && tile_org[1] < ntiles_y);
-
-      /* Define the tile origin in pixel space */
-      tile_org[0] *= TILE_SIZE;
-      tile_org[1] *= TILE_SIZE;
-
-      /* Compute the size of the tile clamped by the borders of the buffer */
-      tile_sz[0] = MMIN(TILE_SIZE, layout.width - tile_org[0]);
-      tile_sz[1] = MMIN(TILE_SIZE, layout.height - tile_org[1]);
-
-      SSP(rng_proxy_create2
-        (&htrdr->lifo_allocators[ithread],
-         &ssp_rng_threefry,
-         RNG_SEQUENCE_SIZE * (size_t)mcode, /* Offset */
-         RNG_SEQUENCE_SIZE, /* Size */
-         RNG_SEQUENCE_SIZE * (size_t)ntiles_adjusted, /* Pitch */
-         1, &rng_proxy));
-      SSP(rng_proxy_create_rng(rng_proxy, 0, &rng));
-
-      res_local = draw_tile(htrdr, (size_t)ithread, mcode, tile_org, tile_sz,
-        pix_sz, cam, spp, rng, buf);
-
-      SSP(rng_proxy_ref_put(rng_proxy));
-      SSP(rng_ref_put(rng));
-
-      if(res_local != RES_OK) {
-        ATOMIC_SET(&res, res_local);
-        continue;
+      if(mcode == TILE_MCODE_NULL) { /* No more work on this process */
+        /* Try to steal works to concurrent processes */
+        proc_work_reset(work);
+        nthieves = mpi_steal_work(htrdr, rng_main, work);
+        if(nthieves != 0) {
+          mcode = proc_work_get_tile(work);
+        }
       }
+    }
+    if(mcode == TILE_MCODE_NULL) break; /* No more work */
+
+    /* Decode the morton code to retrieve the tile index  */
+    tile_org[0] = morton2D_decode((uint32_t)(mcode>>0));
+    tile_org[1] = morton2D_decode((uint32_t)(mcode>>1));
+    ASSERT(tile_org[0] < ntiles_x && tile_org[1] < ntiles_y);
+
+    /* Define the tile origin in pixel space */
+    tile_org[0] *= TILE_SIZE;
+    tile_org[1] *= TILE_SIZE;
+
+    /* Compute the size of the tile clamped by the borders of the buffer */
+    tile_sz[0] = MMIN(TILE_SIZE, layout.width - tile_org[0]);
+    tile_sz[1] = MMIN(TILE_SIZE, layout.height - tile_org[1]);
+
+    SSP(rng_proxy_create2
+      (&htrdr->lifo_allocators[ithread],
+       &ssp_rng_threefry,
+       RNG_SEQUENCE_SIZE * (size_t)mcode, /* Offset */
+       RNG_SEQUENCE_SIZE, /* Size */
+       RNG_SEQUENCE_SIZE * (size_t)ntiles_adjusted, /* Pitch */
+       1, &rng_proxy));
+    SSP(rng_proxy_create_rng(rng_proxy, 0, &rng));
+
+    res_local = draw_tile(htrdr, (size_t)ithread, mcode, tile_org, tile_sz,
+      pix_sz, cam, spp, rng, buf);
+
+    SSP(rng_proxy_ref_put(rng_proxy));
+    SSP(rng_ref_put(rng));
+
+    if(res_local != RES_OK) {
+      ATOMIC_SET(&res, res_local);
+      break;
+    }
 
-      n = (size_t)ATOMIC_INCR(&nsolved_tiles);
-      pcent = (int32_t)((double)n * 100.0 / (double)proc_ntiles + 0.5/*round*/);
+    n = (size_t)ATOMIC_INCR(&nsolved_tiles);
+    pcent = (int32_t)((double)n * 100.0 / (double)proc_ntiles + 0.5/*round*/);
 
-      #pragma omp critical
-      if(pcent > htrdr->mpi_progress_render[0]) {
-        htrdr->mpi_progress_render[0] = pcent;
-        if(htrdr->mpi_rank == 0) {
-          update_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
-        } else { /* Send the progress percentage to the master process */
-          send_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING, pcent);
-        }
+    #pragma omp critical
+    if(pcent > htrdr->mpi_progress_render[0]) {
+      htrdr->mpi_progress_render[0] = pcent;
+      if(htrdr->mpi_rank == 0) {
+        update_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING);
+      } else { /* Send the progress percentage to the master process */
+        send_mpi_progress(htrdr, HTRDR_MPI_PROGRESS_RENDERING, pcent);
       }
     }
+  }
 
-    if(ATOMIC_GET(&res) != RES_OK) goto error;
-
-    proc_work_reset(work);
-    nthieves = mpi_steal_work(htrdr, rng_main, work);
-    nthreads = MMIN(nthieves, htrdr->nthreads);
-  } while(nthieves);
+  if(ATOMIC_GET(&res) != RES_OK) goto error;
 
   /* Synchronize the process */
   mutex_lock(htrdr->mpi_mutex);

	htrdr Solving radiative transfer in heterogeneous media
	git clone git://git.meso-star.fr/htrdr.git
	Log \| Files \| Refs \| README \| LICENSE