t-nissieの日記: 【電脳】FFTWのOpenMP並列でいまいち3次元r2c, c2rの速さが出ない【その3】
$ for i in `jot 3`; do OMP_NUM_THREADS=8 ./r2c_3d_test_omp 32 32 160 100000 8; done ; for i in `jot 3`; do OMP_NUM_THREADS=8 ./r2c_3d_test_threads 32 32 160 100000 8; done
Lx = 32, Ly = 32, Lz = 160, N = 163840, M = 100000, NTHREADS = 8
Address of r = 0x0000000100400000
Address of c = 0x0000000100540000
FFT starts
FFT ends 256.37600
Lx = 32, Ly = 32, Lz = 160, N = 163840, M = 100000, NTHREADS = 8
Address of r = 0x0000000100400000
Address of c = 0x0000000100540000
FFT starts
FFT ends 254.20400
Lx = 32, Ly = 32, Lz = 160, N = 163840, M = 100000, NTHREADS = 8
Address of r = 0x0000000100400000
Address of c = 0x0000000100540000
FFT starts
FFT ends 263.79200
Lx = 32, Ly = 32, Lz = 160, N = 163840, M = 100000, NTHREADS = 8
Address of r = 0x0000000100400000
Address of c = 0x0000000100540000
FFT starts
FFT ends 239.95000
Lx = 32, Ly = 32, Lz = 160, N = 163840, M = 100000, NTHREADS = 8
Address of r = 0x0000000100400000
Address of c = 0x0000000100540000
FFT starts
FFT ends 237.44200
Lx = 32, Ly = 32, Lz = 160, N = 163840, M = 100000, NTHREADS = 8
Address of r = 0x0000000100400000
Address of c = 0x0000000100540000
FFT starts
FFT ends 236.12100
$
! r2c_3d_test.F -*-f90-*-
! Time-stamp: <2011-11-29 15:53:27 takeshi>
! Author: Takeshi NISHIMATSU
!!
#if defined(__PGI) || defined(SR11000) || defined(__sparc)
# define command_argument_count iargc
# define get_command_argument getarg
#endif
!> Benchmark for FFTW's multi-threaded 3-D real-to-complex (r2c) and
!! complex-to-real (c2r) transforms.
!!
!! Command-line arguments (all integers, in this order):
!!   Lx Ly Lz   dimensions handed to the FFTW planner
!!   M          number of r2c -> scale -> c2r round trips to time
!!   NTHREADS   number of threads passed to dfftw_plan_with_nthreads
!!
!! Prints the problem size, the addresses of the two work arrays and the
!! wall-clock time in seconds spent in the M round trips.
program r2c_3d_test
  implicit none
  integer, parameter :: dp = kind(1.0d0)            ! double precision, as the dfftw_* routines expect
  integer, parameter :: i8 = selected_int_kind(18)  ! 64-bit integer: FFTW plan handles, clock counts
  integer, parameter :: n_args = 5
  real(dp), allocatable :: r(:,:,:)
  complex(dp), allocatable :: c(:,:,:)
  ! "address" is an external integer function supplied outside this file.
  integer(i8) :: plan_r2c, plan_c2r, address, count0, count1, count_rate
  character(len=30) :: str
  integer :: Lx, Ly, Lz, M, NTHREADS, i, j, ireturn, ios
  integer :: args(n_args)
  real(dp) :: N_inv
# include "fftw3.f"

  ! Parse the five integer arguments. A missing argument leaves str blank,
  ! which makes the list-directed read fail, so iostat catches both a
  ! missing and a malformed argument.
  do i = 1, n_args
     call get_command_argument(i, str)
     read(str, *, iostat=ios) args(i)
     if (ios /= 0) then
        write(6,'(a)') 'Usage: r2c_3d_test Lx Ly Lz M NTHREADS'
        stop 1
     end if
  end do
  Lx       = args(1)
  Ly       = args(2)
  Lz       = args(3)
  M        = args(4)
  NTHREADS = args(5)

  if (min(Lx, Ly, Lz, NTHREADS) < 1 .or. M < 0) then
     write(6,'(a)') 'Error: Lx, Ly, Lz and NTHREADS must be >= 1 and M must be >= 0'
     stop 1
  end if

  write(6,'(3(a,i4),2(a,i7),a,i3)') 'Lx = ', Lx, &
       &                            ', Ly = ', Ly, &
       &                            ', Lz = ', Lz, &
       &                            ', N = ', Lx*Ly*Lz, &
       &                            ', M = ', M, &
       &                            ', NTHREADS = ', NTHREADS

  ! Normalization factor 1/(Lx*Ly*Lz), applied once per round trip below.
  N_inv = 1.0_dp / Lx / Ly / Lz

  ! dfftw_init_threads reports failure by returning zero.
  call dfftw_init_threads(ireturn)
  if (ireturn == 0) then
     write(6,'(a)') 'Error: dfftw_init_threads failed'
     stop 1
  end if
  call dfftw_plan_with_nthreads(NTHREADS)

  ! r2c output keeps only Lx/2+1 complex values along the first dimension.
  allocate(r(0:Lx-1, 0:Ly-1, 0:Lz-1), stat=ios)
  if (ios /= 0) then
     write(6,'(a)') 'Error: allocation of r failed'
     stop 1
  end if
  allocate(c(0:Lx/2, 0:Ly-1, 0:Lz-1), stat=ios)
  if (ios /= 0) then
     write(6,'(a)') 'Error: allocation of c failed'
     stop 1
  end if

  write (*,'(a,z16.16)') 'Address of r = 0x', address(r) ! Check 16-byte alignment,
  write (*,'(a,z16.16)') 'Address of c = 0x', address(c) ! or SSE2 won't be used.

  ! Create both plans before r is filled: the data is only set afterwards
  ! because planning with FFTW_PATIENT may overwrite the arrays.
  call dfftw_plan_dft_r2c_3d(plan_r2c, Lx, Ly, Lz, r, c, FFTW_PATIENT)
  call dfftw_plan_dft_c2r_3d(plan_c2r, Lx, Ly, Lz, c, r, FFTW_PATIENT)
  r(:,:,:) = 0.1_dp

  write(6,'(a)') 'FFT starts'
  call flush(6)
  call system_clock(count0)
  do i = 1, M
     call dfftw_execute(plan_r2c)
     ! Scale in Fourier space, one z-plane per loop iteration.
     !$omp parallel do
     do j = 0, Lz-1
        c(:,:,j) = c(:,:,j) * N_inv
     end do
     !$omp end parallel do
     call dfftw_execute(plan_c2r)
  end do
  call system_clock(count1, count_rate)
  write(6,'(a,f10.5)') 'FFT ends', real(count1-count0, dp)/count_rate

  ! Release the plans before the thread cleanup, then free the arrays.
  call dfftw_destroy_plan(plan_r2c)
  call dfftw_destroy_plan(plan_c2r)
  ! dfftw_cleanup_threads takes no argument in the legacy Fortran interface.
  call dfftw_cleanup_threads()
  deallocate(r, c)
end program r2c_3d_test
!Local variables:
! compile-command: "make -k && ./r2c_3d_test_omp 32 32 160 100 4"
!End:
【電脳】FFTWのOpenMP並列でいまいち3次元r2c, c2rの速さが出ない【その3】 More | Reply ログイン