      subroutine xmcfft(isign, n, m, scale, x, inc1x, inc2x, y,
     >     inc1y, inc2y, table, ntable, work, nwork)
c
c multitasking complex-to-complex FFT's.
c A more efficient replacement for CRAY's MCFFT, by
c Greg Hammett, hammett@princeton.edu, 6-Oct-1993
c Princeton University Plasma Physics Lab
c (with some help from Gary Kerbel, NERSC, gdk@kerbel.nersc.gov)
c
c This is plug-compatible with mcfft, so type "man mcfft" for help.
c
c define arguments:
      implicit none
      integer isign,n,m,inc1x,inc2x,inc1y,inc2y,ntable,nwork
      real x(2,inc2x,m)
      real y(2,inc2y,m)
      real work(4*n*m),scale
      integer table(ntable)  !part of table is real, and part integer

c define local variables : (space before : to fool emacs...)
      integer maxcpus
      parameter (maxcpus=16)    ! max # of parallel CPUs
      integer j,k,ipar,ncpus,nucpus,getenv,status
      integer firstcall, mstart(maxcpus), mdo(maxcpus)
      external getenv
      character*20 string
      real veclen
      save ncpus, firstcall
      data ncpus /-1/
      data firstcall /1/

c check that the inputs are reasonable:
      if(ntable .lt. (100+2*n)) then
         write(6,*) "Error in xmcfft: ntable too small!"
         stop
      endif

c initialize trig factors if requested:
      if(isign .eq. 0) then
         call cftfax(n,table(11),table(31))
         table(4)=n
         firstcall=0
         return
      endif

c reinitialize trig factors if necessary:
      if(table(4) .ne. n .or. firstcall .eq. 1) then
         call cftfax(n,table(11),table(31))
         table(4)=n
         firstcall=0
      endif

      if(nwork .lt. 4*n*m) then
         write(6,*) "Error in xmcfft: nwork too small!"
         stop
      endif

c get the NCPUS environment variable, the maximum # of CPUs to use:
      if(ncpus .le. 0) then
         string=' '
         status=getenv('NCPUS',string)
         ncpus=1
         if(status .ne. 0) then
            read(string,'(i10)') ncpus
         endif
         if(ncpus .le. 0) ncpus=1
      endif
c
c figure out NUCPUS, the # of parallel CPUs to actually try to use:
c
c We are vectorizing and parallelizing in the same dimension (the
c second dimension of the arrays), which has length m.  So
c m = veclen*nucpus, where veclen is the vector length, which
c needs to be at least 16 for efficiency.
c
      nucpus=max(m/16,1)        ! want a vector length of at least 16
      nucpus=min(nucpus,ncpus)
      veclen=float(m)/float(nucpus)     ! average vector length

      do ipar=1,nucpus
         mstart(ipar)=1+nint((ipar-1)*veclen)
      enddo
      do ipar=1,nucpus-1
         mdo(ipar)=mstart(ipar+1)-mstart(ipar)
      enddo
      mdo(nucpus)=m+1-mstart(nucpus)    ! the last CPU gets the rest
c      write(6,*) 'mdo =',mdo

c copy to the output vector, scaling if necessary:
      if(scale .ne. 1.0) then
         do j=1,n
            do k=1,m
               y(1,1+(j-1)*inc1y,k) = scale*x(1,1+(j-1)*inc1x,k)
               y(2,1+(j-1)*inc1y,k) = scale*x(2,1+(j-1)*inc1x,k)
            enddo
         enddo
      else
c don't bother copying the array x to y if they share the
c same storage space:
         if(loc(y(1,1,1)) .ne. loc(x(1,1,1)) .or.
     &      inc1y .ne. inc1x .or. inc2y .ne. inc2x) then
            do j=1,n
               do k=1,m
                  y(1,1+(j-1)*inc1y,k) = x(1,1+(j-1)*inc1x,k)
                  y(2,1+(j-1)*inc1y,k) = x(2,1+(j-1)*inc1x,k)
               enddo
            enddo
         endif
      endif

c The cncall directive below tells the CRAY preprocessor fpp that all
c subroutine calls in the loop are parallel-safe (i.e., that there
c are no conflicts from writing to the same data).
c
cfpp$ permutation (mstart)    ! irrelevant?
cfpp$ cncall
      do ipar=1,nucpus
         call cfftmlt(y(1,1,mstart(ipar)),
     >        y(2,1,mstart(ipar)),
     >        work(1+4*n*(mstart(ipar)-1)),table(31),table(11),
     >        2*inc1y,2*inc2y,n,mdo(ipar),isign)
      enddo

      return
      end
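
c ----------------------------------------------------------------------
c A minimal driver sketch showing one way to call xmcfft, assuming
c CRAY's cftfax/cfftmlt library routines are available at link time.
c The sizes follow the checks above (ntable >= 100+2*n, nwork >= 4*n*m),
c and the call sequence (initialize with isign=0, then transform with
c isign=-1) mirrors the mcfft convention.  The in-place call (y is the
c same array as x) relies on the loc() test above to skip the copy.
c The program name and data choices here are illustrative only.
c
      program tstfft
      implicit none
      integer n, m, ntable, nwork
      parameter (n=64, m=32)            ! transform length, # of FFTs
      parameter (ntable=100+2*n, nwork=4*n*m)
      real x(2,n,m)                     ! m complex vectors of length n
      real work(nwork)
      integer table(ntable)
      integer j, k

c fill x with some data (here, a unit impulse in each vector):
      do k=1,m
         do j=1,n
            x(1,j,k)=0.0
            x(2,j,k)=0.0
         enddo
         x(1,1,k)=1.0
      enddo

c isign=0 initializes the trig-factor table; no transform is done.
c inc1x=1 and inc2x=n describe contiguous complex vectors in x:
      call xmcfft(0, n, m, 1.0, x, 1, n, x, 1, n,
     >     table, ntable, work, nwork)

c forward transform, in place (x passed as both input and output):
      call xmcfft(-1, n, m, 1.0, x, 1, n, x, 1, n,
     >     table, ntable, work, nwork)

      write(6,*) 'after FFT of impulse:', x(1,1,1), x(2,1,1)
      end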