I’ve been seeing some strange OpenMP scaling behavior that I’m not sure how to explain.
I have a simple test program that scales nicely when compiled with gfortran but poorly when compiled with ifort.
My test program is the following:
!> Holds the function parser instance used by the test routines.
!! The parser is declared threadprivate, so each OpenMP thread has its own
!! copy; a thread has to initialize its copy before evaluating with it.
module parserMod
  use function_parser, only : fparser_array
  implicit none
  ! The parser object; OpenMP requires threadprivate variables to have the save attribute.
  type(fparser_array), save :: parser
  ! The directive must come after the declaration of `parser` above.
  !$omp threadprivate(parser)
end module parserMod
!-------------------------------------------------------------------------------
!> Initialize the per-thread parsers, then process every marble in parallel.
!!
!! marbles    : 6 x 200000 work array; each column is one marble. The first
!!              component of a column is reset to 1 before doWork is called
!!              on that column.
!! numThreads : number of OpenMP threads requested for both parallel regions.
!!
!! Stops the program with code 99 if any thread's parser reports an error.
subroutine parallelMarbles(marbles, numThreads)
  use parserMod, only : parser
  use iso_fortran_env, only: wp => real64
  use iso_fortran_env, only : output_unit
  implicit none
  real(wp), dimension(6,200000), intent(inout) :: marbles
  integer, intent(in)                     :: numThreads
  integer :: indx
  character(len=1), dimension(3), parameter :: parserVars = ['x', 'y', 'z']
  ! All threads initialize the parser: `parser` is threadprivate, so every
  ! thread must parse into its own copy.
  ! NOTE: the work loop below runs in a second parallel region and relies on
  ! the threadprivate copies persisting between the two regions. OpenMP only
  ! guarantees that when both regions use the same number of threads (they do)
  ! and dynamic thread adjustment is disabled.
  !$omp parallel num_threads(numThreads)
  call parser%parse(parserVars,parserVars)
  if (parser%error()) then
    call parser%print_errors(output_unit)
    stop 99
  endif
  !$omp end parallel
  !$omp parallel do default(none) &
  !$omp private(indx) &
  !$omp shared(marbles) &
  !$omp num_threads(numThreads)
  ! Query the column count directly instead of building an array section.
  do indx = 1, size(marbles, 2)
      marbles(1,indx) = 1.0_wp
      call doWork(marbles(:,indx))
  end do
  !$omp end parallel do
end subroutine parallelMarbles
!-------------------------------------------------------------------------------
!> Repeatedly update one marble using the calling thread's parser.
!!
!! marble : 6-component work vector, updated in place. On each pass,
!!          components 2-4 are set from the pass counter, the thread number
!!          and marble(1); marble(1:3) and marble(4:6) are then handed to
!!          parser%evaluate (presumably input and output respectively -
!!          confirm against fparser_array); finally marble(1) becomes the
!!          sum of components 2-6.
subroutine doWork(marble)
  use omp_lib, only : omp_get_thread_num
  use parserMod, only : parser
  use iso_fortran_env, only: wp => real64

  implicit none
  real(wp), dimension(6), intent(inout) :: marble
  integer, parameter :: numPasses = 200
  integer :: indx, threadId

  ! The thread number cannot change during this call, so query it once
  ! instead of three times per pass.
  threadId = omp_get_thread_num()

  do indx = 1, numPasses
    marble(2) = real(mod(indx, 6 + threadId), wp)*marble(1)
    marble(3) = real(mod(indx, 5 + threadId), wp)*marble(1)
    marble(4) = real(mod(indx, 4 + threadId), wp)*marble(1)
    call parser%evaluate(marble(1:3), marble(4:6))

    marble(1) = sum(marble(2:))
  end do
end subroutine doWork
!-------------------------------------------------------------------------------
!> Timing driver: runs parallelMarbles with 1 and then 4 threads, printing the
!! wall-clock time of each run and its speedup relative to the 1-thread run.
program testOMP
  use iso_fortran_env, only: wp => real64, int64
  implicit none
  ! real(wp), allocatable, dimension(:,:)    :: marbles
  real(wp), dimension(6,200000)    :: marbles
  integer                                  :: numThreads
  ! Timings are kept in working precision so the count difference is not
  ! narrowed to default real.
  real(wp)                                 :: singleTime, threadTime
  ! 64-bit counts let system_clock use a finer clock resolution where the
  ! compiler provides one.
  integer(int64) :: startTime, endTime, countRate
  character(len=25)   :: varString
  ! allocate(marbles(6,200000))
  ! The loop visits numThreads = 1 and 4; the first pass records singleTime,
  ! which the speedup line of every pass divides by threadTime.
  do numThreads = 1, 4, 3
    write(*,*) 'Calling parallel marbles with ', numThreads, ' threads.'  
    call system_clock(startTime, countRate)
    call parallelMarbles(marbles, numThreads)
    call system_clock(endTime)
    threadTime = real(endTime - startTime, wp)/real(countRate, wp)
    write (varString, '(F25.6)') threadTime
    write (*, '(A)') ' Loop time = ' // trim(adjustl(varString)) // ' seconds.'
    if (numThreads .eq. 1) then
      singleTime = threadTime
    endif
    write (varString, '(F25.6)') singleTime / threadTime
    write (*, '(A)') ' Speedup = ' // trim(adjustl(varString)) // 'x.'
    write(*,*) '------------------------------------------------------'
  end do
end program testOMP
The test uses the fortran_function_parser module (jacobwilliams/fortran_function_parser on GitHub, “Modern Fortran Function Parser”): https://github.com/jacobwilliams/fortran_function_parser
Any insight into what might be going wrong in ifort and how I might improve the performance with ifort would be greatly appreciated.
