Generates a collective performance report across all MPI ranks.
| Type | Intent | Optional | Attributes | Name | ||
|---|---|---|---|---|---|---|
| type(MPI_Comm), | intent(in) | :: | comm | |||
| integer, | intent(in) | :: | rank | |||
| integer, | intent(in) | :: | nprocs |
subroutine profiler_report(comm, rank, nprocs) type(MPI_Comm), intent(in) :: comm integer, intent(in) :: rank integer, intent(in) :: nprocs integer :: i, ierr integer :: global_ntimers, global_nedges integer :: local_calls, global_calls real(rk) :: local_time real(rk) :: global_min, global_max, global_sum real(rk) :: avg_percent real(rk) :: total_avg_time real(rk), parameter :: tiny_time = 1.0e-300_rk real(rk) :: timer_min(MAX_TIMERS) real(rk) :: timer_max(MAX_TIMERS) real(rk) :: timer_avg(MAX_TIMERS) integer :: timer_calls(MAX_TIMERS) real(rk) :: edge_avg(MAX_EDGES) integer :: edge_calls(MAX_EDGES) if (.not. profiling_enabled) return if (stack_depth /= 0) then if (rank == 0) then write(error_unit,'(a,i0)') 'profiler: warning, nonzero stack depth at report: ', stack_depth end if end if call MPI_Allreduce(ntimers, global_ntimers, 1, MPI_INTEGER, MPI_MAX, comm, ierr) call check_mpi(ierr, 'profiler ntimers') if (global_ntimers > MAX_TIMERS) then if (rank == 0) write(error_unit,'(a)') 'profiler: too many timers in report' error stop 1 end if if (nested_enabled) then call MPI_Allreduce(nedges, global_nedges, 1, MPI_INTEGER, MPI_MAX, comm, ierr) call check_mpi(ierr, 'profiler nedges') else global_nedges = 0 end if timer_min = 0.0_rk timer_max = 0.0_rk timer_avg = 0.0_rk timer_calls = 0 total_avg_time = tiny_time do i = 1, global_ntimers if (i <= ntimers) then local_time = timers(i)%total_time local_calls = timers(i)%calls else local_time = 0.0_rk local_calls = 0 end if call MPI_Allreduce(local_time, global_min, 1, MPI_DOUBLE_PRECISION, MPI_MIN, comm, ierr) call check_mpi(ierr, 'profiler min') call MPI_Allreduce(local_time, global_max, 1, MPI_DOUBLE_PRECISION, MPI_MAX, comm, ierr) call check_mpi(ierr, 'profiler max') call MPI_Allreduce(local_time, global_sum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr) call check_mpi(ierr, 'profiler sum') call MPI_Allreduce(local_calls, global_calls, 1, MPI_INTEGER, MPI_MAX, comm, ierr) call check_mpi(ierr, 'profiler calls') timer_min(i) = global_min timer_max(i) = global_max timer_avg(i) = global_sum / max(real(nprocs, rk), tiny_time) timer_calls(i) = global_calls if (i <= ntimers) then if (trim(timers(i)%name) == 'Total_Simulation') then total_avg_time = max(timer_avg(i), tiny_time) end if end if end do edge_avg = 0.0_rk edge_calls = 0 if (nested_enabled) then do i = 1, global_nedges if (i <= nedges) then local_time = edges(i)%total_time local_calls = edges(i)%calls else local_time = 0.0_rk local_calls = 0 end if call MPI_Allreduce(local_time, global_sum, 1, MPI_DOUBLE_PRECISION, MPI_SUM, comm, ierr) call check_mpi(ierr, 'profiler edge sum') call MPI_Allreduce(local_calls, global_calls, 1, MPI_INTEGER, MPI_MAX, comm, ierr) call check_mpi(ierr, 'profiler edge calls') edge_avg(i) = global_sum / max(real(nprocs, rk), tiny_time) edge_calls(i) = global_calls end do end if if (rank /= 0) return write(output_unit,'(a)') ' ======================================================================' write(output_unit,'(a)') ' PERFORMANCE PROFILING REPORT' write(output_unit,'(a)') ' Inclusive wall time in seconds; Avg% is relative to Total_Simulation.' write(output_unit,'(a)') ' Flat rows are not additive when nested timers are enabled.' write(output_unit,'(a)') ' ======================================================================' write(output_unit,'(a)') ' Kernel Name Calls Min Max Avg Avg%' write(output_unit,'(a)') ' ----------------------------------------------------------------------' do i = 1, global_ntimers avg_percent = 100.0_rk * timer_avg(i) / max(total_avg_time, tiny_time) if (i <= ntimers) then write(output_unit,'(1x,a32,1x,i9,3(1x,f14.6),1x,f10.2)') & trim(timers(i)%name), timer_calls(i), timer_min(i), timer_max(i), timer_avg(i), avg_percent else write(output_unit,'(1x,a32,1x,i9,3(1x,f14.6),1x,f10.2)') & 'UNKNOWN_TIMER', timer_calls(i), timer_min(i), timer_max(i), timer_avg(i), avg_percent end if end do if (nested_enabled) then write(output_unit,'(a)') ' ======================================================================' write(output_unit,'(a)') ' NESTED PROFILING REPORT (Inclusive Child Time, Avg Across Ranks)' write(output_unit,'(a)') ' ======================================================================' write(output_unit,'(a)') ' Region Tree Calls Avg Avg%' write(output_unit,'(a)') ' ----------------------------------------------------------------------' call print_children(0, 0, global_nedges, edge_avg, edge_calls, total_avg_time) end if write(output_unit,'(a)') ' ======================================================================' contains recursive subroutine print_children(parent, depth, edge_count, edge_avg_in, edge_calls_in, total_time) integer, intent(in) :: parent integer, intent(in) :: depth integer, intent(in) :: edge_count real(rk), intent(in) :: edge_avg_in(:) integer, intent(in) :: edge_calls_in(:) real(rk), intent(in) :: total_time integer :: e, child real(rk) :: pct character(len=256) :: label if (depth > 32) return do e = 1, edge_count if (e > nedges) cycle if (edges(e)%parent /= parent) cycle child = edges(e)%child if (child <= 0 .or. child > ntimers) cycle if (child == parent) cycle label = repeat(' ', depth)//'- '//trim(timers(child)%name) pct = 100.0_rk * edge_avg_in(e) / max(total_time, tiny_time) write(output_unit,'(2x,a52,1x,i9,1x,f12.6,1x,f10.2)') & label, edge_calls_in(e), edge_avg_in(e), pct call print_children(child, depth + 1, edge_count, edge_avg_in, edge_calls_in, total_time) end do end subroutine print_children end subroutine profiler_report