module fft_work ! ! (c) Copyright 1991 to 1998 by Michael A. Beer, William D. Dorland, ! P. B. Snyder, Q. P. Liu, and Gregory W. Hammett. ALL RIGHTS RESERVED. ! ! ccfft ! UNICOS: ! table(100 + 8*n) ! work(8*n) ! UNICOS/mk: ! table(2*n) ! work(4*n) ! Origin: ! table(30 + 2*n) ! work(2*n) integer, parameter :: ccfft_table0 = 100 integer, parameter :: ccfft_table1 = 8 integer, parameter :: ccfft_work0 = 0 integer, parameter :: ccfft_work1 = 8 ! xmcfft ! UNICOS: ! table(100 + 2*n) ! work(4*lot*n) integer, parameter :: ccfftm_table0 = 100 integer, parameter :: ccfftm_table1 = 2 integer, parameter :: ccfftm_work0 = 0 integer, parameter :: ccfftm_work1 = 4 ! csfftm/scfftm ! UNICOS: ! table(100 + 2*n) ! work((2*n+4)*lot) integer, parameter :: csfftm_table0 = 100 integer, parameter :: csfftm_table1 = 2 integer, parameter :: csfftm_work1 = 2 integer, parameter :: csfftm_work2 = 4 ! csfft/scfft ! UNICOS: ! table(100 + 4*n) ! work(4 + 4*n) ! UNICOS/mk: ! table(2*n) ! work(2*n) ! Origin: ! table(15 + n) ! work(n) integer, parameter :: csfft_table0 = 100 integer, parameter :: csfft_table1 = 4 integer, parameter :: csfft_work0 = 4 integer, parameter :: csfft_work1 = 4 interface subroutine ccfft(i,n,scale,x,y,table,work,isys) integer :: i, n, isys real :: scale complex, dimension(n) :: x, y real, dimension(*) :: table real, dimension(*) :: work end subroutine ccfft end interface interface subroutine csfft(i,n,scale,x,y,table,work,isys) integer :: i, n, isys real :: scale complex, dimension(n/2+1) :: x real, dimension(n) :: y real, dimension(*) :: table real, dimension(*) :: work end subroutine csfft end interface interface subroutine scfft(i,n,scale,x,y,table,work,isys) integer :: i, n, isys real :: scale complex, dimension(n/2+1) :: y real, dimension(n) :: x real, dimension(*) :: table real, dimension(*) :: work end subroutine scfft end interface contains subroutine xmcfft(isign, n, m, scale, x, inc1x, inc2x, y,& inc1y, inc2y, table, ntable, work, nwork) ! ! multitasking complex-to-complex FFT's. ! A more efficient replacement for CRAY's MCFFT, by ! Greg Hammett, hammett@princeton.edu, 6-Oct-1993 ! Princeton University Plasma Physics Lab ! (with some help from Gary Kerbel, NERSC, gdk@kerbel.nersc.gov) ! ! This is plug-compatible with mcfft, so type "man mcfft" for help. ! ! define arguments: implicit none integer isign,n,m,inc1x,inc2x,inc1y,inc2y,ntable,nwork real x(2,inc2x,m) real y(2,inc2y,m) real work(4*n*m),scale integer table(ntable) !part of table is real, and part integer ! define local variables : (space before : to fool emacs...) integer maxcpus parameter (maxcpus=16) ! max # of parallel CPUs integer j,k,ipar,ncpus,nucpus,getenv,status integer firstcall, mstart(maxcpus), mdo(maxcpus) external getenv character*20 string real veclen save ncpus, firstcall data ncpus /-1/ data firstcall /1/ ! check that inputs are reasonable: if(ntable .lt. (100+2*n)) then write(6,*) "Error in xmcfft: ntable too small !" stop endif ! initialize trig factors if necessary: if(isign .eq. 0) then call cftfax(n,table(11),table(31)) table(4)=n firstcall=0 return endif ! reinitialize trig factors if necessary: if(table(4) .ne. n .or. firstcall .eq. 1)then call cftfax(n,table(11),table(31)) table(4)=n firstcall=0 endif if(nwork .lt. 4*n*m) then write(6,*) "Error in xmcfft: nwork too small !" stop endif ! get the NCPUS environment variable, the max # of CPUS to try to use... if(ncpus .le. 0)then string=' ' status=getenv('NCPUS',string) ncpus=1 if(status .ne. 0) then read(string,'(i10)') ncpus endif if(ncpus .le. 0) ncpus=1 endif ! ! figure out NUCPUS, the # of parallel CPUs to actually try to use: ! ! we are vectorizing and parallelizing in the same dimension (the ! second dimension of the arrays), which has a length of m. ! So m=veclen*nucpus, where veclen is the vector length, which ! needs to be greater than 16 for efficiency. nucpus=max(m/16,1) ! want vector length of at least 16 nucpus=min(nucpus,ncpus) veclen = float(m)/float(nucpus) ! average vector length do ipar=1,nucpus mstart(ipar)=1+nint((ipar-1)*veclen) enddo do ipar=1,nucpus-1 mdo(ipar)=mstart(ipar+1)-mstart(ipar) enddo mdo(nucpus)=m+1-mstart(ipar) ! write(6,*) 'mdo =',mdo ! copy to output vector, and scale (if necessary?)... if (scale .ne. 1.0) then do j=1,n do k=1,m y(1,1+(j-1)*inc1y,k) = scale*x(1,1+(j-1)*inc1x,k) y(2,1+(j-1)*inc1y,k) = scale*x(2,1+(j-1)*inc1x,k) enddo enddo else ! don't bother copying the array x to y if they share the ! same storage space. if(loc(y(1,1,1)) .ne. loc(x(1,1,1)) .or. & inc1y .ne. inc1x .or. inc2y .ne. inc2x) then do j=1,n do k=1,m y(1,1+(j-1)*inc1y,k) = x(1,1+(j-1)*inc1x,k) y(2,1+(j-1)*inc1y,k) = x(2,1+(j-1)*inc1x,k) enddo enddo endif endif ! The cncall compiler directive below for the CRAY preprocessor fpp tells ! it that all subroutines in the loop are parallel (i.e., that there ! are no conflicts in trying to write to the same data...) ! cfpp$ permutation (mstart) ! irrelelvant? !fpp$ cncall do ipar=1,nucpus call cfftmlt(y(1,1,mstart(ipar)),y(2,1,mstart(ipar)), & work(1+4*n*(mstart(ipar)-1)),table(31),table(11),& 2*inc1y,2*inc2y,n,mdo(ipar),isign) enddo end subroutine xmcfft end module fft_work