Actual source code: ex1kok.kokkos.cxx
  1: static char help[] = "Benchmarking device kernel launch time\n"; // Help text handed to PetscInitialize() in main()
  2: /*
  3:   Running example on Summit at OLCF:
  4:   # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
  5:   $ jsrun -n1 -a1 -c7 -g1 -r1  ./ex1kok
  6:   Average asynchronous device kernel launch time = 4.86 microseconds
  7:   Average synchronous device kernel launch time  = 12.83 microseconds
  9:   Frontier@OLCF
 10:   $ srun -n1 -c32 --cpu-bind=threads --gpus-per-node=8 --gpu-bind=closest ./ex1kok
 11:   Average asynchronous device kernel launch time = 1.88 microseconds
 12:   Average synchronous device kernel launch time  = 7.78 microseconds
 14:   Aurora@ALCF
 15:   $ mpirun -n 1 ./ex1kok
 16:   Average asynchronous device kernel launch time = 3.34 microseconds
 17:   Average synchronous device kernel launch time  = 6.24 microseconds
 19:   Perlmutter@NERSC
 20:   $ srun -n 1 --gpus-per-task=1 ./ex1kok
 21:   Average asynchronous device kernel launch time = 2.31 microseconds
 22:   Average synchronous device kernel launch time  = 7.13 microseconds
 23: */
 25: #include <petscsys.h>
 26: #include <petsc_kokkos.hpp>
 28: int main(int argc, char **argv)
 29: {
 30:   PetscInt       i, n = 100000, N = 256;
 31:   PetscLogDouble tstart, tend, time;
 33:   PetscFunctionBeginUser;
 34:   PetscCall(PetscInitialize(&argc, &argv, nullptr, help));
 35:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
 36:   PetscCall(PetscKokkosInitializeCheck());
 37:   {
 38:     Kokkos::DefaultExecutionSpace                      exec = PetscGetKokkosExecutionSpace();
 39:     Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace> policy(exec, 0, N);
 41:     PetscCallCXX(exec.fence()); // Initialize device runtime to get more accurate timing below
 42:     // Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one
 43:     PetscCall(PetscTime(&tstart));
 44:     for (i = 0; i < n; i++) { PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){})); }
 45:     PetscCall(PetscTime(&tend));
 46:     PetscCallCXX(exec.fence());
 47:     time = (tend - tstart) * 1e6 / n;
 48:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous device kernel launch time = %.2f microseconds\n", time));
 50:     // Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed
 51:     PetscCall(PetscTime(&tstart));
 52:     for (i = 0; i < n; i++) {
 53:       PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
 54:       PetscCallCXX(exec.fence());
 55:     }
 56:     PetscCall(PetscTime(&tend));
 57:     time = (tend - tstart) * 1e6 / n;
 58:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous device kernel launch time  = %.2f microseconds\n", time));
 59:   }
 61:   PetscCall(PetscFinalize());
 62:   return 0;
 63: }
 65: /*TEST
 66:   test:
 67:     requires: kokkos
 68:     args: -n 2
 69:     output_file: output/empty.out
 70:     filter: grep "DOES_NOT_EXIST"
 72: TEST*/