profiler_report Subroutine

public subroutine profiler_report(comm, rank, nprocs)

Generates a collective performance report across all MPI ranks.

Arguments

Type IntentOptional Attributes Name
type(MPI_Comm), intent(in) :: comm
integer, intent(in) :: rank
integer, intent(in) :: nprocs

Calls

proc~~profiler_report~~CallsGraph proc~profiler_report mod_profiler::profiler_report mpi_allreduce mpi_allreduce proc~profiler_report->mpi_allreduce proc~check_mpi~3 mod_profiler::check_mpi proc~profiler_report->proc~check_mpi~3

Called by

proc~~profiler_report~~CalledByGraph proc~profiler_report mod_profiler::profiler_report program~lowmach_react_hex lowmach_react_hex program~lowmach_react_hex->proc~profiler_report

Source Code

   subroutine profiler_report(comm, rank, nprocs)
      type(MPI_Comm), intent(in) :: comm
      integer, intent(in) :: rank
      integer, intent(in) :: nprocs

      integer :: i, ierr
      integer :: global_ntimers, global_nedges
      integer :: local_calls, global_calls
      real(rk) :: local_time
      real(rk) :: global_min, global_max, global_sum
      real(rk) :: avg_percent
      real(rk) :: total_avg_time
      real(rk), parameter :: tiny_time = 1.0e-300_rk

      real(rk) :: timer_min(MAX_TIMERS)
      real(rk) :: timer_max(MAX_TIMERS)
      real(rk) :: timer_avg(MAX_TIMERS)
      integer :: timer_calls(MAX_TIMERS)

      real(rk) :: edge_avg(MAX_EDGES)
      integer :: edge_calls(MAX_EDGES)

      if (.not. profiling_enabled) return

      if (stack_depth /= 0) then
         if (rank == 0) then
            write(error_unit,'(a,i0)') 'profiler: warning, nonzero stack depth at report: ', stack_depth
         end if
      end if

      call MPI_Allreduce(ntimers, global_ntimers, 1, MPI_INTEGER, MPI_MAX, comm, ierr)
      call check_mpi(ierr, 'profiler ntimers')

      if (global_ntimers > MAX_TIMERS) then
         if (rank == 0) write(error_unit,'(a)') 'profiler: too many timers in report'
         error stop 1
      end if

      if (nested_enabled) then
         call MPI_Allreduce(nedges, global_nedges, 1, MPI_INTEGER, MPI_MAX, comm, ierr)
         call check_mpi(ierr, 'profiler nedges')
      else
         global_nedges = 0
      end if

      timer_min = 0.0_rk
      timer_max = 0.0_rk
      timer_avg = 0.0_rk
      timer_calls = 0
      total_avg_time = tiny_time

      do i = 1, global_ntimers
         if (i <= ntimers) then
            local_time = timers(i)%total_time
            local_calls = timers(i)%calls
         else
            local_time = 0.0_rk
            local_calls = 0
         end if

         call MPI_Allreduce(local_time, global_min, 1, MPI_DOUBLE_PRECISION, MPI_MIN, comm, ierr)
         call check_mpi(ierr, 'profiler min')

         call MPI_Allreduce(local_time, global_max, 1, MPI_DOUBLE_PRECISION, MPI_MAX, comm, ierr)
         call check_mpi(ierr, 'profiler max')

         call MPI_Allreduce(local_time, global_sum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr)
         call check_mpi(ierr, 'profiler sum')

         call MPI_Allreduce(local_calls, global_calls, 1, MPI_INTEGER, MPI_MAX, comm, ierr)
         call check_mpi(ierr, 'profiler calls')

         timer_min(i) = global_min
         timer_max(i) = global_max
         timer_avg(i) = global_sum / max(real(nprocs, rk), tiny_time)
         timer_calls(i) = global_calls

         if (i <= ntimers) then
            if (trim(timers(i)%name) == 'Total_Simulation') then
               total_avg_time = max(timer_avg(i), tiny_time)
            end if
         end if
      end do

      edge_avg = 0.0_rk
      edge_calls = 0

      if (nested_enabled) then
         do i = 1, global_nedges
            if (i <= nedges) then
               local_time = edges(i)%total_time
               local_calls = edges(i)%calls
            else
               local_time = 0.0_rk
               local_calls = 0
            end if

            call MPI_Allreduce(local_time, global_sum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr)
            call check_mpi(ierr, 'profiler edge sum')

            call MPI_Allreduce(local_calls, global_calls, 1, MPI_INTEGER, MPI_MAX, comm, ierr)
            call check_mpi(ierr, 'profiler edge calls')

            edge_avg(i) = global_sum / max(real(nprocs, rk), tiny_time)
            edge_calls(i) = global_calls
         end do
      end if

      if (rank /= 0) return

      write(output_unit,'(a)') ' ======================================================================'
      write(output_unit,'(a)') '  PERFORMANCE PROFILING REPORT'
      write(output_unit,'(a)') '  Inclusive wall time in seconds; Avg% is relative to Total_Simulation.'
      write(output_unit,'(a)') '  Flat rows are not additive when nested timers are enabled.'
      write(output_unit,'(a)') ' ======================================================================'
      write(output_unit,'(a)') '                      Kernel Name     Calls            Min            Max            Avg        Avg%'
      write(output_unit,'(a)') ' ----------------------------------------------------------------------'

      do i = 1, global_ntimers
         avg_percent = 100.0_rk * timer_avg(i) / max(total_avg_time, tiny_time)

         if (i <= ntimers) then
            write(output_unit,'(1x,a32,1x,i9,3(1x,f14.6),1x,f10.2)') &
               trim(timers(i)%name), timer_calls(i), timer_min(i), timer_max(i), timer_avg(i), avg_percent
         else
            write(output_unit,'(1x,a32,1x,i9,3(1x,f14.6),1x,f10.2)') &
               'UNKNOWN_TIMER', timer_calls(i), timer_min(i), timer_max(i), timer_avg(i), avg_percent
         end if
      end do

      if (nested_enabled) then
         write(output_unit,'(a)') ' ======================================================================'
         write(output_unit,'(a)') '  NESTED PROFILING REPORT (Inclusive Child Time, Avg Across Ranks)'
         write(output_unit,'(a)') ' ======================================================================'
         write(output_unit,'(a)') '  Region Tree                                              Calls          Avg        Avg%'
         write(output_unit,'(a)') ' ----------------------------------------------------------------------'
         call print_children(0, 0, global_nedges, edge_avg, edge_calls, total_avg_time)
      end if

      write(output_unit,'(a)') ' ======================================================================'

   contains

      recursive subroutine print_children(parent, depth, edge_count, edge_avg_in, edge_calls_in, total_time)
         integer, intent(in) :: parent
         integer, intent(in) :: depth
         integer, intent(in) :: edge_count
         real(rk), intent(in) :: edge_avg_in(:)
         integer, intent(in) :: edge_calls_in(:)
         real(rk), intent(in) :: total_time

         integer :: e, child
         real(rk) :: pct
         character(len=256) :: label

         if (depth > 32) return

         do e = 1, edge_count
            if (e > nedges) cycle
            if (edges(e)%parent /= parent) cycle

            child = edges(e)%child
            if (child <= 0 .or. child > ntimers) cycle
            if (child == parent) cycle

            label = repeat('  ', depth)//'- '//trim(timers(child)%name)
            pct = 100.0_rk * edge_avg_in(e) / max(total_time, tiny_time)

            write(output_unit,'(2x,a52,1x,i9,1x,f12.6,1x,f10.2)') &
               label, edge_calls_in(e), edge_avg_in(e), pct

            call print_children(child, depth + 1, edge_count, edge_avg_in, edge_calls_in, total_time)
         end do
      end subroutine print_children

   end subroutine profiler_report