# 1 "dr_gmres.F" !----------------------------------------------------------------------- subroutine dr_gmres(n, Ax_sub, x, b, tol, & info, rinfo) ! ! solve linear or nonlinear A(x) = b using Optimized Dynamic Relaxation ! ! a.k.a. 2cd order Richardson iteration, ! which is like GMRES(2), but with a modified restarting procedure. ! ! x on input contains the initial guess x0 (x0=0 allowed). ! x on output will contain the calculated solution ! ! x and b are vectors of length n. ! ! Ax_sub is a subroutine that evaluates A.x, and has the form: ! ! call Ax_sub(n, x, Ax) ! ! where x is input, and on output Ax holds the value of A.x ! n is the length of the x and Ax vectors. ! ! tol = relative error tolerance ! ! output: ! info(2) = number of iterations required to converge ! rinfo(2) = residual error when it converges ! ! Set the switch "nonlinear" depending on your problem: ! ! nonlinear = .true. to solve the nonlinear problem A(x)=b ! = .false. to solve the linear problem A.x = b !----------------------------------------------------------------- ! Turns out that this algorithm (if alpha_limit=.false.) is exactly ! equivalent to "Loose GMRES" LGMRES(1,1), as defined in: ! ! "A Technique for Accelerating the Convergence of Restarted GMRES" ! A. H. Baker, E. R. Jessup, T. Manteuffel, SIAM Journal on Matrix ! Analysis and Applications Volume 26, Number 4, pp. 962-984 (2005) ! http://epubs.siam.org/sam-bin/dbq/article/ ! ! Enables complex or real versions from the same source code: # 1 "complex_real.h" 1 ! complex_real.h: compile real or complex versions from the same source: # 45 "dr_gmres.F" 2 ! ! implicit none integer n external Ax_sub complex*16 b(n) integer info(2) real*8 rinfo(2), tol ! some of these arrays could be merged/reused? complex*16 x(n) ! the current estimate x_{p} complex*16 x_pm1(n) ! the previous estimate x_{p-1} ! complex*16 x_pp1(n) ! the next estimate x_{p+1} complex*16 x_star(n)! provisional location x_* complex*16 r_p(n) ! the residual error = Ax - b complex*16 r_pm1(n) ! the residual error for x_{p-1} complex*16 s1(n) ! search direction 1 complex*16 s2(n) ! search direction 2 complex*16 As1(n) ! A.s1 complex*16 As2(n) ! A.s2 complex*16 a11, a12, a21, a22, det, alpha1, alpha2 complex*16, parameter :: ONE = (1.0d0, 0.0d0) complex*16, parameter :: ZERO = (0.0d0, 0.0d0) complex*16 zdot, rhs1, rhs2 real*8 err2, sin2theta, bnorm2, As2_mag2 real*8 err2_pred, err2_warn, err2_change, dx, xmag2, err2_pm1 integer iter, niter, n_Ax_evals, maxiter, j ! algorithmic options: ! logical :: nonlinear = .true. ! =.t. to solve nonlinear A(x)=b logical :: nonlinear = .false.! =.f. to solve linear A(x)=b logical :: alpha_limit = .false. logical :: recursive_residual logical :: recursive_error ! n_colin = # of times that search directions were co-linear: integer :: n_colin = 0 ! number of times extrapolation larger than expected: integer, save :: n_extrap_errs logical, save :: first_time=.true. real*8, save :: eps, sqrt_eps, sin2_crit real*8, save :: safe_min, sqrt_safe_min real*8 dlamch ! lapack routine to give machine precision parameters logical :: write_file = .true. ! write some diagnostics to a file ! For linear problems, we can use a recursion relationship to calculate ! the residual vector without needing an extra evaluation of A.x, and ! to calculate the norm of the residual vector (the error) without needing ! an extra dot product. ! ! For nonlinear problems, need to explicitly evaluate the residual (unless ! the error is sufficiently small that the problem is locally linear). ! 
      if(nonlinear) then
         recursive_residual = .false.
         recursive_error = .false.
      else
         recursive_residual = .false.
!        recursive_residual = .true.
         recursive_error = .false.
!        recursive_error = .true.
      endif

! In tests, in most cases there seems to be little difference between
! recursive_residual = .true. or .false., as long as
! recursive_error = .false.  (On the Morgan problem with V=0, both
! converged in 77 iterations with:
!
!     full calc error          = 8.732767716053962E-10
!     recursive apparent error = 8.732762114896063E-10
!     recursive actual error   = 8.733032579929747E-10 )
!
! With recursive_error and recursive_residual both true, the code
! fails to converge on the Morgan test case.  (But only because it
! thinks the error is 1.7e-8, while the actual final error after
! 5000 iterations is 4.477e-13!)
!
! For now, it is best not to use the recursive formulas.  One could
! probably get the recursion to work by using the exact formulas every
! n'th iteration (with n ~ 10).

      if(first_time) then
! epsilon and tiny are F90 intrinsic routines:
         eps = epsilon(eps)  ! 1.0+eps is barely > 1.0
!        eps = dlamch("e")
         safe_min = tiny(safe_min)
!        safe_min = dlamch("s")
         sqrt_eps = sqrt(eps)
         sqrt_safe_min = sqrt(safe_min)
         sin2_crit = 2.0*sqrt_eps
!        sin2_crit = 0.1
         n_extrap_errs = 0
         first_time = .false.
!        print *, "epsilon =", eps, "sqrt(epsilon) =", sqrt_eps
!        print *, "safe_min =", safe_min
!
! F90's tiny routine gives tiny = 2.225073858507201E-308, the same
! as matlab's realmin and dlamch("s").
!
! F90's epsilon routine gives eps = 2.220446049250313E-16, the same as
! matlab's eps and the suggested technique:
!------------------
!     while (1+eps) > 1
!        eps = eps/2;
!     end
!     eps = eps*2
!-----------------
!
! However, lapack/blas's dlamch("e") gives:
!     epsilon = 1.110223024625157E-16
!
      endif

      n_Ax_evals = 0  ! # of A.x evaluations

! ?? GWH: being modified to parallel the MATLAB version of the code,
! with alpha_limit=.false. for now.

! initialization
      x_pm1 = x  ! initial guess

! The usual Krylov method is to search the Krylov space
! {r_0, A r_0, A^2 r_0, ...}.  For some types of cases where
! an initial guess x_0 is supplied, and where that x_0 might be off
! by just a (possibly complex) scale factor (as might be expected for
! wave equations or linearized codes, or for short time steps if
! extrapolating from previous data in a time-dependent PDE solver),
! it could be that the initial search direction should be x_0.
! Then the next search direction would naturally be r_0, then A r_0,
! etc.  An initial search direction in the x_0 direction would be
! straightforward to code, and requires no significant work over
! what is already needed, and so should probably always be done.
!
! But this hasn't been implemented yet.

!     x = b  ! the initial residual gives the initial search direction

! find the initial search direction s = r = b - A.x:
! (note that we don't need the initial best fit here,
! just a vector in the direction of the best fit.)

      call Ax_sub(n, x_pm1, r_pm1)
      r_pm1 = b - r_pm1
      n_Ax_evals = n_Ax_evals + 1

! Use a first-order Richardson iteration to calculate the optimal
! step size:

      bnorm2 = zdot(b,b,n)
      err2 = zdot(r_pm1,r_pm1,n)/bnorm2

      if(write_file) write(15,*)
     &  "# iteration #, relative error, recursive relative error,",
     &  " predicted err, sin(theta)"
      if(write_file) write(15,*) "0 ", sqrt(err2), sqrt(err2)

      if (err2 .lt. tol**2) then
         info(2) = 0
         rinfo(2) = sqrt(err2)
! dr_gmres needed no iterations because x0 was such a good guess!
         return
      endif

      if(nonlinear) then
         xmag2 = zdot(x_pm1, x_pm1, n)
         dx = 10*sqrt_eps*max(1.0d0, sqrt(xmag2/bnorm2)/err2)
!        dx = 1.0
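!
! (Added explanatory note.)  The statements below form a matrix-free
! directional derivative: Ax_sub returns A(x), and r_pm1 = b - A(x_pm1),
! so
!
!     As2 = ( A(x_pm1 + dx*r_pm1) - b + r_pm1 )/dx
!         = ( A(x_pm1 + dx*r_pm1) - A(x_pm1) )/dx
!         ~   A'(x_pm1).r_pm1
!
! i.e. the Jacobian of A applied to the search direction, obtained with
! one extra Ax_sub call and no explicitly stored Jacobian (cf. the
! Knoll & Keyes JFNK reference cited in the main loop below).
!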
         x_star = x_pm1 + dx*r_pm1
         call Ax_sub(n, x_star, As2)
         As2 = (As2 - b + r_pm1)/dx
         n_Ax_evals = n_Ax_evals + 1
      else
         call Ax_sub(n, r_pm1, As2)
         n_Ax_evals = n_Ax_evals + 1
      endif

      As2_mag2 = zdot(As2, As2, n)
      alpha2 = zdot(As2, r_pm1, n) / As2_mag2
      x = x_pm1 + alpha2*r_pm1

! (At first I just used alpha2=1, since the second step would
! reoptimize the step size in the r_pm1 direction.  But optimizing
! alpha2 converges slightly faster (~10%) on most cases.  The reason is
! that, given x_0=0 and x_1 = alpha2*b, although x_2 is independent of
! alpha2, x_3 depends on x_2-x_1, which does depend on alpha2.)

! initialization to reproduce GMRES(1):
!     x = x_pm1
!     sin2_crit = 2;

      err2_change = abs(alpha2)**2 * As2_mag2/bnorm2
      err2_pred = err2 - err2_change
      err2_warn = err2_pred + 0.5*max(err2_change, 2*tol*err2_pred)
      err2_pm1 = err2

      if(recursive_residual) then
         r_p = r_pm1 - alpha2*As2
      else
! explicitly recalculate the residual with an extra A.x evaluation:
         call Ax_sub(n, x, r_p)
         r_p = b - r_p
         n_Ax_evals = n_Ax_evals + 1
      endif

      if(recursive_error) then
         err2 = err2_pred
      else
         err2 = zdot(r_p,r_p,n)/bnorm2
      endif

! In principle, one can determine err recursively as well as the
! residual, and eliminate a dot product, as I do above.  But I don't
! know whether this recursive error calculation is more sensitive to
! the build-up of round-off error than the recursive residual
! calculation is.  Furthermore, dot products are relatively cheap
! compared to the A.x matrix-vector multiply (although not a whole lot
! cheaper for very sparse A), so one might consider always calculating
! err explicitly.

      if(err2 .gt. err2_warn) then
         n_extrap_errs = n_extrap_errs + 1
         if( n_extrap_errs .le. 10) then
            write(*,*) "Warning from dr_gmres, iter # 1"
            write(*,*) "Prev. err =", sqrt(err2_pm1)
            write(*,*) "Actual err =", sqrt(err2)
            write(*,*) "Pred. err =", sqrt(err2_pred)
            write(*,*) "Warn. err =", sqrt(err2_warn)
         endif
      endif

      if(write_file) write(15,*) "1 ", sqrt(err2),
     &  sqrt(max(err2_pred,0.0d0)), "0.0"

      if (err2 .lt. tol**2) then
         info(2) = 1
         rinfo(2) = sqrt(err2)
      endif

      maxiter = 5000
      do iter = 2, maxiter

! recast in terms of general search directions s1 and s2:
         s1 = x - x_pm1
         s2 = r_p
         As1 = - (r_p - r_pm1)

         if(nonlinear) then
! (Could be slightly more careful here and above in how the step size
! dx below is chosen when evaluating numerical derivatives.  See Knoll
! and Keyes, JCP 2004 (Eqs. 11-15).  One probably doesn't need to
! calculate xmag2 every iteration, which would speed it up a little.
! But the current approach should work well for our present case.)
            xmag2 = zdot(x, x, n)
            dx = 10*sqrt_eps*max(1.0d0, sqrt(xmag2/bnorm2)/err2)
!           dx = 1
            x_star = x + dx*s2
            call Ax_sub(n, x_star, As2)
            As2 = (As2 - b + r_p)/dx
            n_Ax_evals = n_Ax_evals + 1
         else
            call Ax_sub(n, s2, As2)
            n_Ax_evals = n_Ax_evals + 1
         endif

! could reduce memory access if some of these zdot loops were merged:
         a11 = zdot(As1, As1, n)
         a12 = zdot(As1, As2, n)
         a21 = conjg(a12)
         a22 = zdot(As2, As2, n)
         det = a11*a22 - a12*a21

!        print *, "iter =", iter
!        print *, "det =", det

! sin2theta is only a diagnostic (hack divide-by-zero fix):
         sin2theta = abs(det)/max(abs(a11*a22),sqrt_safe_min)
!        print *, "|sin(theta)|^2 =", sin2theta

! If the two search directions are nearly parallel, then the
! determinant is very small, and alpha1 and alpha2 are not
! independently well determined, though the sum
! alpha1*s1+alpha2*s2 is well determined, as is the improved
! solution x_{p+1}.
! To handle this special case, set alpha1=0 and solve for
! alpha2 alone.  This is equivalent to reverting to the simpler
! 1st-order Richardson iteration = GMRES(1).
! (This special case will occur only when the residual is nearly
! an eigenvector of A and the solution is just about to converge.
! This problem was first discovered for Jin Chen's test case.)
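!
! (Added explanatory note.)  The coefficients below come from the 2x2
! least-squares problem of minimizing
!
!     || r_p - alpha1*As1 - alpha2*As2 ||^2 ,
!
! whose normal equations are
!
!     ( a11  a12 ) (alpha1)   (rhs1)        a_ij  = (As_i, As_j)
!     ( a21  a22 ) (alpha2) = (rhs2),       rhs_i = (As_i, r_p)
!
! solved by Cramer's rule.  When det is tiny the two search directions
! are numerically parallel, and the code falls back to the 1-D
! optimization over alpha2 alone, as described above.
!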
         rhs1 = zdot(As1, r_p, n)
         rhs2 = zdot(As2, r_p, n)

         if(abs(det) .gt. sin2_crit*abs(a11*a22) .or.
     &      abs(a11) .lt. bnorm2*1.e-10) then
            alpha1 = ( a22*rhs1 - a12*rhs2 )/det
            alpha2 = (-a21*rhs1 + a11*rhs2 )/det
         else
            alpha1 = 0.0
            alpha2 = rhs2/zdot(As2,As2,n)
            n_colin = n_colin + 1
         endif

         x_pm1 = x
! next estimate:
         x = x + alpha1*s1 + alpha2*s2
         r_pm1 = r_p

         err2_change = real(conjg(alpha1)*rhs1
     &      + conjg(alpha2)*rhs2)/bnorm2
         err2_pred = err2 - err2_change
         err2_warn = err2 - 0.5*err2_change
         err2_pm1 = err2

! find the residual error of the new estimate:
         if(recursive_residual) then
            r_p = r_p - alpha1*As1 - alpha2*As2
         else
! explicitly recalculate the residual with an extra A.x evaluation:
            call Ax_sub(n, x, r_p)
            r_p = b - r_p
            n_Ax_evals = n_Ax_evals + 1
         endif

         if(recursive_error) then
            err2 = err2_pred
         else
            err2 = zdot(r_p,r_p,n)/bnorm2
         endif

! In principle, one can determine err recursively as well as the
! residual, and eliminate a dot product, as I do above.  But I don't
! know whether this recursive error calculation is more sensitive to
! the build-up of round-off error than the recursive residual
! calculation is.  Furthermore, dot products are relatively cheap
! compared to the A.x matrix-vector multiply (although not a whole lot
! cheaper for very sparse A), so one might consider always calculating
! err explicitly.

         if(err2 .gt. err2_warn) then
            n_extrap_errs = n_extrap_errs + 1
            if (n_extrap_errs .le. 10) then
               write(*,*) "Warning from dr_gmres, iter #", iter
               write(*,*) "Prev. err =", sqrt(err2_pm1)
               write(*,*) "Actual err =", sqrt(err2)
               write(*,*) "Pred. err =", sqrt(err2_pred)
               write(*,*) "Warn. err =", sqrt(err2_warn)
               write(*,*) "sin^2(theta) =", sin2theta
               write(*,*) "alpha1, alpha2 =", alpha1, alpha2
               write(*,*) "1-D opt alpha2 =", rhs2/zdot(As2,As2,n)
               write(*,*) "a11, a12 =", a11, a12
               write(*,*) "a21, a22 =", a21, a22
               write(*,*) "rhs1, rhs2 =", rhs1, rhs2
               write(*,*) "det, bnorm2 =", det, bnorm2
            endif
         endif

         if(write_file) write(15,*) iter, sqrt(abs(err2)),
     &      sqrt(abs(err2_pred)), err2_pred, sin2theta

!        print *, "iter #, relative residual error=", iter, sqrt(err2)
!        print *, "iter #", iter, "|sin(theta)|^2 =", sin2theta,
!    &      "err = ", err

         if(err2 .le. tol**2) goto 20

! the algorithm above might not be fully optimized,
! perhaps some steps could be reordered and merged.

      enddo

 20   niter = iter
! after finishing a do loop, the index is 1 higher than the max:
      if(niter .gt. maxiter) niter = maxiter

      info(2) = niter
      rinfo(2) = sqrt(abs(err2))

! DEBUG: calculate the actual final error as a check:
!     call Ax_sub(n, x, r_p)
!     r_p = b - r_p
!     err2 = zdot(r_p,r_p,n)/bnorm2
!     print *, " actual final err = ", sqrt(err2)

      if(write_file) then
         write (15, *) "# "
         write (15, *) "# search directions were co-linear",
     &      n_colin, "times"
         write (15, *) "# # of evaluations of A.x = ", n_Ax_evals
         write (17, *) "# j x(j) b(j) residual = b - A x"
         do j=1, n
            write (17,*) j, real(x(j)), real(b(j)), real(r_p(j))
         enddo
      endif

      if(n_extrap_errs .gt. 0) then
         write(*,*) "Warning from dr_gmres, # of extrap errors = ",
     &      n_extrap_errs
      endif

      return
      end subroutine dr_gmres

!
! Additional comments on the algorithm:
!
! GMRES (and LGMRES) apparently can suffer from stalling for certain
! types of non-positive-definite problems.  The talk (and associated
! paper)
!     http://www.caam.rice.edu/~embree/householder.pdf
! shows examples where GMRES(2) fails on a diagonalizable matrix
! with real and positive eigenvalues.  Despite the fact that this
! example matrix is diagonalizable with positive eigenvalues, the
! matrix is not positive definite, so one can find vectors r for which
! r.A.r <= 0.  It is clear why the algorithm can fail in such a case.
! I'm more interested in cases (including advection-diffusion
! hyperbolic-like problems and parabolic/elliptic problems) where
! the matrix A might be non-symmetric but is still positive definite,
! in which case I think most of these algorithms (including dr_gmres)
! should always converge eventually.
! (The fact that positive-definite matrices can be non-symmetric is
! discussed at http://mathworld.wolfram.com/PositiveDefiniteMatrix.html.)
! On the other hand, perhaps ODR isn't as easy to analyze for
! stagnation as standard Krylov methods are, because the Krylov space
! is augmented by a previous error vector, and so it isn't yet clear
! to me how to do a fixed-point analysis for it.
!
! It may be that this stagnation problem is only an esoteric problem
! for unusual matrices.  I haven't run into it yet.  If it ever does
! occur, one might be able to detect when stagnation is happening and
! switch to a different algorithm, or change the choice of Krylov
! subspace to A^n.r_p instead of just r_p.
!-----------------------------------------------------------------------

      complex*16 function zdot(x,y,n)
!
! calculate the complex inner product (x,y) = x* . y

      integer i, n
      complex*16 x(n), y(n)

      zdot = 0.0
      do i=1,n
         zdot = zdot + conjg(x(i))*y(i)
      enddo

      return
      end
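!-----------------------------------------------------------------------
! Example usage (an illustrative sketch added for documentation; the
! routine names below, dr_gmres_example_Ax and dr_gmres_example_driver,
! are hypothetical and not part of the original solver).  It shows the
! Ax_sub calling convention documented at the top of dr_gmres, using a
! 1-D discrete Laplacian, which is symmetric positive definite, so
! dr_gmres with nonlinear=.false. is expected to converge.  Note that
! with write_file=.true. the solver writes diagnostics to units 15
! and 17.

      subroutine dr_gmres_example_Ax(n, x, Ax)
! apply the tridiagonal operator (A.x)_j = 2*x_j - x_{j-1} - x_{j+1},
! with zero boundary values assumed outside 1..n
      implicit none
      integer n, j
      complex*16 x(n), Ax(n)
      do j = 1, n
         Ax(j) = 2.0d0*x(j)
         if (j .gt. 1) Ax(j) = Ax(j) - x(j-1)
         if (j .lt. n) Ax(j) = Ax(j) - x(j+1)
      enddo
      return
      end subroutine dr_gmres_example_Ax

      subroutine dr_gmres_example_driver
! build a simple right-hand side, call dr_gmres, and report the result
      implicit none
      integer, parameter :: nn = 100
      complex*16 x(nn), b(nn)
      integer info(2), j
      real*8 rinfo(2), tol
      external dr_gmres_example_Ax
      do j = 1, nn
         b(j) = (1.0d0, 0.0d0)  ! constant source term
      enddo
      x = (0.0d0, 0.0d0)        ! initial guess x0 = 0
      tol = 1.0d-8
      call dr_gmres(nn, dr_gmres_example_Ax, x, b, tol, info, rinfo)
      write(*,*) "dr_gmres example: iterations =", info(2)
      write(*,*) "dr_gmres example: relative residual =", rinfo(2)
      return
      end subroutine dr_gmres_example_driver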