! Enables complex or real versions from the same source code:
! complex_real.h: compile real or complex versions from the same source:
!-----------------------------------------------------------------------
subroutine dr_gmres(n, Ax_sub, x, b, tol, info, rinfo, nx)
!
! Solve A x = b using optimized Dynamic Relaxation,
! a.k.a. 2nd-order Richardson iteration,
! which is like GMRES(2), but with a modified restarting procedure.
!
! x on input contains the initial guess x0 (x0=0 allowed).
! x on output will contain the calculated solution.
!
! x and b are vectors of length n.
!
! Ax_sub is a subroutine that evaluates A.x, and has the form:
!
!     call Ax_sub(n, x, Ax, nx)
!
! where x is input, and on output Ax holds the value of A.x.
! n is the length of the x and Ax vectors, and nx is another argument
! passed for convenience to the subroutine (nx=sqrt(n) for 2D grids
! with an equal number of grid points in both directions).
!
! tol = relative error tolerance
!
! output:
!     info(2)  = number of iterations required to converge
!     rinfo(2) = relative residual error at convergence
!
!-----------------------------------------------------------------
! It turns out that this algorithm (if alpha_limit=.false.) is exactly
! equivalent to "Loose GMRES" LGMRES(1,1), as defined in:
!
! "A Technique for Accelerating the Convergence of Restarted GMRES",
! A. H. Baker, E. R. Jessup, T. Manteuffel, SIAM Journal on Matrix
! Analysis and Applications, Volume 26, Number 4, pp. 962-984 (2005),
! http://epubs.siam.org/sam-bin/dbq/article/
!
      implicit none

      integer n, nx
      external Ax_sub
      complex*16 b(n)
      integer info(2)
      real*8 rinfo(2), tol

      complex*16 x(n)      ! the current estimate x_{p}
      complex*16 x_pm1(n)  ! the previous estimate x_{p-1}
      complex*16 x_pp1(n)  ! the next estimate x_{p+1} (currently unused)
      complex*16 r_p(n)    ! the residual error = Ax - b
      complex*16 r_pm1(n)  ! the residual error for x_{p-1}
      complex*16 s1(n)     ! search direction 1
      complex*16 s2(n)     ! search direction 2
      complex*16 Ar(n)     ! A.r
      complex*16 As1(n)    ! A.s1
      complex*16 As2(n)    ! A.s2

      complex*16 a11, a12, a21, a22, det, alpha1, alpha2
      complex*16, parameter :: ONE  = (1.0d0, 0.0d0)
      complex*16, parameter :: ZERO = (0.0d0, 0.0d0)
      complex*16 zdot, rhs1, rhs2
      real*8 err, sin2theta, bnorm
      integer iter, niter, n_Ax_evals, maxiter, j
      logical :: alpha_limit = .false.

! n_colin = # of times that search directions were co-linear:
      integer :: n_colin

      logical, save :: first_time = .true.
      real*8, save :: epsilon, sqrt_epsilon, sin2_crit
      real*8, save :: safe_min, sqrt_safe_min
      real*8 dlamch  ! lapack routine to give machine precision parameters
      logical :: write_file = .true.  ! write some diagnostics to a file

      if(first_time) then
         epsilon  = dlamch("e")
         safe_min = dlamch("s")
         sqrt_epsilon  = sqrt(epsilon)
         sqrt_safe_min = sqrt(safe_min)
         sin2_crit = 2.0*sqrt_epsilon
!        sin2_crit = 0.1
         first_time = .false.

!        print *, "epsilon =", epsilon, "sqrt(epsilon) =", sqrt_epsilon
!        print *, "safe_min =", safe_min
! dlamch gives safe_min = 2.22507e-308 (same as matlab's realmin).
! dlamch gives:
!    epsilon = 1.110223024625157E-16, sqrt(epsilon) = 1.053671212772351E-08,
! while matlab's suggested technique gives eps = 2.2204...e-16:
!------------------
!   while (1+eps) > 1
!      eps = eps/2;
!   end
!   eps = eps*2
!-----------------
      endif

      n_colin = 0     ! # of times search directions were colinear
      n_Ax_evals = 0  ! # of A*x evaluations

! ?? GWH: being modified to parallel the MATLAB version of the code
! with alpha_limit=.false. for now

! initialization

      x_pm1 = x  ! initial guess

! The usual Krylov method is to search the Krylov space
! {r_0, A r_0, A^2 r_0, ...}.  For some types of cases where
! an initial guess x_0 is supplied, and where that x_0 might be off
! by just a (possibly complex) scale factor (as might be expected for
! wave equations or linearized codes, or for short time steps if
! extrapolating from previous data in a time-dependent PDE solver),
! it could be that the initial search direction should be x_0.
! Then the next search direction would naturally be r_0, then A r_0,
! etc.  An initial search direction in the x_0 direction would be
! straightforward to code, and requires no significant work over
! what is already needed, and so should probably always be done.
!
! But this hasn't been implemented yet.

!     x = b  ! the initial residual gives the initial search direction

! find the initial search direction s = r = Ax-b:
! (note that we don't need the initial best fit here,
! just a vector in the direction of the best fit.)
      call Ax_sub(n, x_pm1, r_pm1, nx)
      r_pm1 = b - r_pm1
      n_Ax_evals = n_Ax_evals + 1

! Use a first order Richardson to calculate the optimal step size:
      bnorm = sqrt(dble(zdot(b,b,n)))
      err = sqrt(dble(zdot(r_pm1,r_pm1,n)))/bnorm
!     print *, "err_pm1=", err

      if(write_file) write(15,*) &
           "# iteration #, relative error in residual"
      if(write_file) write(15,*) "0 ", err
!     print *, "err_p =", err

      if (err .lt. tol) then
         info(2) = 0
         rinfo(2) = err
         ! dr_gmres needed no iterations because x0 was such a
         ! good guess!
         return
      endif
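! Derivation sketch for the step size used just below: choosing the
! complex scalar alpha2 to minimize the norm of the next residual,
!     || r_pm1 - alpha2*(A.r_pm1) ||^2,
! gives the least-squares solution
!     alpha2 = (A.r_pm1, r_pm1) / (A.r_pm1, A.r_pm1),
! where (u,v) = sum_i conjg(u(i))*v(i) is the inner product computed
! by the zdot function at the end of this file.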
      call Ax_sub(n, r_pm1, Ar, nx)
      alpha2 = zdot(Ar, r_pm1, n) / zdot(Ar, Ar, n)
      x = x_pm1 + alpha2*r_pm1
      n_Ax_evals = n_Ax_evals + 1

! (At first I just used alpha2=1, since the second step would
! reoptimize the step size in the r_pm1 direction.  But optimizing
! alpha2 converges slightly faster (~10%) on most cases.  The reason is
! that, given x_0=0 and x_1 = alpha2*b, although x_2 is independent of
! alpha2, x_3 depends on x_2-x_1, which does depend on alpha2.)

! initialization to reproduce GMRES(1):
!     x = x_pm1
!     sin2_crit = 2

! ?? could use the recursion formula to determine r_p and err:
      call Ax_sub(n, x, r_p, nx)
      r_p = b - r_p
      n_Ax_evals = n_Ax_evals + 1
      err = sqrt(dble(zdot(r_p,r_p,n)))/bnorm
      if(write_file) write(15,*) "1 ", err
      if (err .lt. tol) then
         info(2) = 1
         rinfo(2) = err
         return
      endif

      maxiter = 5000
      do iter = 1, maxiter

! new format in terms of general search directions s1 and s2:
         s1 = x - x_pm1
         s2 = r_p
         As1 = -(r_p - r_pm1)
         call Ax_sub(n, s2, As2, nx)
         n_Ax_evals = n_Ax_evals + 1
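! Derivation sketch for the 2x2 system solved below: choosing complex
! coefficients (alpha1, alpha2) to minimize the norm of the next residual,
!     || r_p - alpha1*(A.s1) - alpha2*(A.s2) ||^2,
! gives the normal equations
!     (As1,As1)*alpha1 + (As1,As2)*alpha2 = (As1,r_p)
!     (As2,As1)*alpha1 + (As2,As2)*alpha2 = (As2,r_p)
! i.e., a11*alpha1 + a12*alpha2 = rhs1 and a21*alpha1 + a22*alpha2 = rhs2,
! with a21 = conjg(a12), solved by Cramer's rule with determinant det.
! (The next residual is then r_p - alpha1*As1 - alpha2*As2, which is the
! recursion formula alluded to elsewhere for saving one A.x evaluation.)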
! could reduce memory access if some of these zdot loops were merged:
         a11 = zdot(As1, As1, n)
         a12 = zdot(As1, As2, n)
         a21 = conjg(a12)
         a22 = zdot(As2, As2, n)
         det = a11*a22 - a12*a21
!        print *, "iter =", iter
!        print *, "det =", det

! sin2theta is only a diagnostic (the max() is a hack to avoid
! dividing by zero):
         sin2theta = abs(det)/max(abs(a11*a22), sqrt_safe_min)
!        print *, "|sin(theta)|^2 =", sin2theta

! If the two search directions are nearly parallel, then the
! determinant is very small, and alpha1 and alpha2 are not
! independently well determined, though the sum
! alpha1*s1+alpha2*s2 is well determined, as is the improved
! solution x_{p+1}.
! To handle this special case, set alpha1=0 and solve for
! alpha2 alone.  This is equivalent to reverting to the simpler
! 1st-order Richardson iteration = GMRES(1).
! (This special case will occur only when the residual is nearly
! an eigenvector of A and the solution is just about to converge.
! This problem was first discovered for Jin Chen's test case.)

         if(abs(det) .gt. sin2_crit*abs(a11*a22)) then
            rhs1 = zdot(As1, r_p, n)
            rhs2 = zdot(As2, r_p, n)
            alpha1 = ( a22*rhs1 - a12*rhs2 )/det
            alpha2 = (-a21*rhs1 + a11*rhs2 )/det
         else
            alpha1 = 0.0
            alpha2 = zdot(As2,r_p,n)/zdot(As2,As2,n)
            n_colin = n_colin + 1
         endif

         x_pm1 = x
! next estimate:
         x = x + alpha1*s1 + alpha2*s2
         r_pm1 = r_p

! find the residual error of the new estimate:
         call Ax_sub(n, x, r_p, nx)
         r_p = b - r_p
         n_Ax_evals = n_Ax_evals + 1
! Could replace the above with a recursion formula for linear simulations.

         err = sqrt(dble(zdot(r_p,r_p,n)))/bnorm
         if(write_file) write(15,*) iter, err
!        print *, "iter #, relative residual error=", iter, err
!        print *, "iter #", iter, "|sin(theta)|^2 =", sin2theta, &
!             "err = ", err

         if(err .le. tol) goto 20

! the algorithm above might not be fully optimized,
! perhaps some steps could be reordered and merged.
      enddo

 20   niter = iter
! after finishing a do loop, the index is 1 higher than the max:
      if(niter .gt. maxiter) niter = maxiter
      info(2) = niter
      rinfo(2) = err

      if(write_file) then
         write(15,*) "# "
         write(15,*) "# search directions were co-linear", &
              n_colin, "times"
         write(15,*) "# # of evaluations of A.x = ", n_Ax_evals

         write(17,*) "# j    x(j)    b(j)    residual = b - A x"
         do j = 1, n
            write(17,*) j, real(x(j)), real(b(j)), real(r_p(j))
         enddo
      endif

      return
end subroutine dr_gmres

!
! Additional comments on the algorithm:
!
! GMRES (and LGMRES) apparently can suffer from stalling for certain
! types of non-positive-definite problems.  The talk (and associated paper)
! http://www.caam.rice.edu/~embree/householder.pdf
! shows examples where GMRES(2) fails on a diagonalizable matrix
! with real and positive eigenvalues.  Even though that example matrix
! is diagonalizable with positive eigenvalues, it is not positive
! definite, so one can find vectors r for which r.A.r <= 0, and it is
! clear why the algorithm can fail in such a case.
! I'm more interested in cases (including advection-diffusion
! hyperbolic-like problems and parabolic/elliptic problems) where
! the matrix A might be non-symmetric but is still positive definite,
! in which case I think most of these algorithms (including dr_gmres)
! should always converge eventually.
! (The fact that positive-definite matrices can be non-symmetric is discussed
! at http://mathworld.wolfram.com/PositiveDefiniteMatrix.html.)
! On the other hand, perhaps it isn't as easy
! to analyze ODR for stagnation as standard Krylov methods, because
! the Krylov space is augmented by a previous error vector, and so
! it isn't yet clear to me how to do a fixed-point analysis for it.
!
! It may be that this stagnation problem is only an esoteric problem
! for unusual matrices.  I haven't run into it yet.  If it ever does
! occur, one might be able to detect when stagnation is happening and
! switch to a different algorithm, or change the choice of the
! Krylov subspace to be A^n.r_p instead of just r_p.

!-----------------------------------------------------------------------
complex*16 function zdot(x, y, n)
!
! calculate the complex inner product (x,y) = x* . y
!
      implicit none
      integer i, n
      complex*16 x(n), y(n)

      zdot = (0.0d0, 0.0d0)
      do i = 1, n
         zdot = zdot + conjg(x(i))*y(i)
      enddo
      return
end function zdot
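!-----------------------------------------------------------------------
! A minimal usage sketch of the dr_gmres interface described above.
! The names laplacian_1d_Ax and test_dr_gmres are made up for this
! example, and the test operator (the 1D Dirichlet Laplacian
! A = tridiag(-1, 2, -1), which is symmetric positive definite, so the
! iteration should converge) is only an illustrative choice.
! Link against LAPACK for dlamch.

subroutine laplacian_1d_Ax(n, x, Ax, nx)
! hypothetical example operator: apply A = tridiag(-1, 2, -1);
! nx is passed through for interface compatibility but unused here
      implicit none
      integer n, nx, i
      complex*16 x(n), Ax(n)
      do i = 1, n
         Ax(i) = 2.0d0*x(i)
         if (i .gt. 1) Ax(i) = Ax(i) - x(i-1)
         if (i .lt. n) Ax(i) = Ax(i) - x(i+1)
      enddo
      return
end subroutine laplacian_1d_Ax

program test_dr_gmres
! hypothetical driver: solve A x = b for a simple right-hand side
      implicit none
      external laplacian_1d_Ax
      integer, parameter :: n = 100
      complex*16 x(n), b(n)
      integer info(2)
      real*8 rinfo(2), tol

      b = (1.0d0, 0.0d0)  ! simple right-hand side
      x = (0.0d0, 0.0d0)  ! zero initial guess is allowed
      tol = 1.0d-8

! dr_gmres writes its diagnostics to units 15 and 17:
      open(15, file="dr_gmres_convergence.out")
      open(17, file="dr_gmres_solution.out")

      call dr_gmres(n, laplacian_1d_Ax, x, b, tol, info, rinfo, n)
      print *, "iterations =", info(2), "  relative residual =", rinfo(2)

      close(15)
      close(17)
end program test_dr_gmres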