Conjugate Gradient Method

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

A = np.array([[4,2], 
              [2,4]])
b = np.array([4,6])

xs = np.linalg.solve(A,b)     # exact solution x* for reference
L,Q = np.linalg.eig(A)        # eigenvalues L, normalized eigenvectors in the columns of Q
Λ = np.diag(L)                # diagonal matrix of eigenvalues
print(Λ)

def f(x):
    """Quadratic form f(x) = 0.5 x^T A x - b^T x, minimized at x* = A^{-1} b."""
    return 0.5*np.dot(x, np.dot(A,x)) - np.dot(b,x)

def fminusf(e):
    """f(x) - f(x*) written in terms of the error e = x - x*."""
    return 0.5*np.dot(e, np.dot(A,e))


nx = 200
ny = 100
x = np.linspace(-3,3,nx)
y = np.linspace(-3,3,ny)
X,Y = np.meshgrid(x,y)                   # 'xy' indexing: X, Y have shape (ny,nx); x varies along columns, y along rows
CX, CY = np.zeros((ny,nx)), np.zeros((ny,nx))
Z  = np.zeros((ny, nx))
Z2 = np.zeros((ny, nx))

for i in range(nx):
    for j in range(ny):
        xx = np.array([x[i], y[j]])
        Z[j,i]  = f(xx)                   # quadratic form at the grid point
        Z2[j,i] = fminusf(xx - xs)        # f(x) - f(x*), using the error e = x - x*
        cc = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), xx)   # transformed coordinates Λ^(1/2) Q^T x
        CX[j,i] = cc[0]
        CY[j,i] = cc[1]
        
plt.rc('font', size=14)
        
fig = plt.figure(figsize=(5,5))
levels = np.linspace(np.min(Z), np.max(Z), 50)
plt.contour(X,Y,Z, levels);
plt.plot([0,0], [-3,3],':', color='gray')
plt.plot([-3,3],[0,0], ':', color='gray')
plt.plot(xs[0], xs[1], 'o', color='black')
plt.xlabel(r'$x_0$')
plt.ylabel(r'$x_1$')
plt.text(xs[0]-0.3, xs[1]-0.45, r'$x^*$');
[[6. 0.]
 [0. 2.]]

Gradient Descent Method

The Conjugate Gradient (CG) method is sometimes motivated by presenting the Gradient Descent method first.

* The gradient of $f(x)$ is

$$\nabla f = Ax-b.$$

* This points up the curve and is the direction of steepest ascent.
* The direction of steepest descent is

$$-\nabla f = b-Ax.$$

* This is also the residual $r=b-Ax$.
* Starting with an initial guess vector $x_0$, step along the descent direction $d=r$:

$$x_{k+1} = x_k + \alpha_k d_k.$$

* The step length $\alpha_k$ comes from an exact line search, minimizing $f(x_k+\alpha d_k)$ over $\alpha$, which gives $\alpha_k = \dfrac{r_k^Td_k}{d_k^TAd_k}$ (the formula used in the code below).
* Note, we distinguish $d$ and $r$ here for convenience later, when $d\ne r$.

Algorithm

def GD(A, b, x0, tol=1E-2, maxit=10000):
    """Steepest (gradient) descent for Ax = b, minimizing f(x) = 0.5 x^T A x - b^T x."""
    xstep = np.empty(0)                              # flattened record of the iterates
    for i in range(maxit):
        xstep = np.append(xstep, x0)
        r = b - np.dot(A, x0)                        # residual = steepest descent direction
        d = r                                        # search direction (d = r for GD)
        α = np.dot(r, d)/np.dot(d, np.dot(A, d))     # exact line search step length
        x = x0 + α*d
        if np.linalg.norm(x - x0)/np.linalg.norm(x) <= tol:
            return x, xstep
        x0 = x.copy()

    print(f'warning, no convergence in {maxit} iterations')
    return x, xstep

#--------------------------------
    
A = np.array([[4,2], 
              [2,4]])
b = np.array([4,6])
x0 = np.array([2.5,-2])

x, xstep = GD(A,b,x0)
print(xstep)
xstep = np.reshape(xstep, (-1, 2))     # one (x0, x1) pair per iteration
print(xstep[:,0])

#-------------------------------

plt.rc('font', size=14)
fig = plt.figure(figsize=(5,5))
plt.contour(X,Y,Z, levels);
plt.plot([0,0], [-3,3],':', color='gray')
plt.plot([-3,3],[0,0], ':', color='gray')
plt.plot(xs[0], xs[1], 'o', color='black')
plt.xlabel(r'$x_0$')
plt.ylabel(r'$x_1$')
plt.plot(xstep[:,0], xstep[:,1], 'k-');
[ 2.5        -2.          1.86567164  0.85447761  0.79870671  0.61737429
  0.66246077  1.23048101  0.43328982  1.17955413  0.40402586  1.31124192
  0.35480276  1.30030345  0.34851722  1.32858837]
[2.5        1.86567164 0.79870671 0.66246077 0.43328982 0.40402586
 0.35480276 0.34851722]
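
The path zigzags toward $x^*$: with an exact line search each step is orthogonal to the previous one, so progress along the long axis of the ellipses is slow. A standard result for steepest descent on a quadratic (quoted here without derivation) is that the error contracts in the $A$-norm by at least the factor $\frac{\kappa-1}{\kappa+1}$ per step, where $\kappa=\lambda_{\max}/\lambda_{\min}$ is the condition number of $A$; here $\kappa=6/2=3$, so the factor is $1/2$.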

Center on the solution

Let $e = x - x^*$ denote the error. Since $Ax^* = b$ and $A$ is symmetric, $f(x) - f(x^*) = \frac{1}{2}e^TAe$, so the contours of $f$ are ellipses centered on the solution $x^*$; this is the quantity the fminusf function above evaluates for a given error vector $e$.

Basis shift

Writing the eigendecomposition of $A$ as

$$AQ = Q\Lambda,$$

$$A = Q\Lambda Q^{-1} = Q\Lambda Q^T.$$

For any vectors $v_1$ and $v_2$, the $A$-inner product becomes an ordinary dot product of the transformed vectors $\hat{v} = \Lambda^{1/2}Q^Tv$:

$$v_2^TAv_1 = v_2^TQ\Lambda Q^Tv_1 = (\Lambda^{1/2}Q^Tv_2)^T(\Lambda^{1/2}Q^Tv_1) = \hat{v}_2^T\hat{v}_1.$$
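
As a quick numerical check of both identities (a sketch using the A, Q, Λ, f, fminusf, and xs defined above; v1, v2, xtest, and T are illustrative names introduced here):

# transformation to the hat coordinates: v_hat = Λ^(1/2) Q^T v
T = np.dot(np.sqrt(Λ), np.transpose(Q))

# the A-inner product equals the ordinary dot product of the transformed vectors
v1 = np.array([1.0, -0.5])
v2 = np.array([0.3,  2.0])
print(np.dot(v2, np.dot(A, v1)), np.dot(np.dot(T, v2), np.dot(T, v1)))   # same value

# f(x) - f(x*) equals 0.5 e^T A e with e = x - x*
xtest = np.array([2.0, -1.0])
print(f(xtest) - f(xs), fminusf(xtest - xs))                             # same value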

Plot $f$ in the original and transformed spaces

fig = plt.figure(figsize=(10,5))

plt.subplot(1,2,1)
levels = np.linspace(np.min(Z), np.max(Z), 50)
plt.contour(X,Y,Z, levels);
plt.plot([0,0], [-3,3],':', color='gray')
plt.plot([-3,3],[0,0], ':', color='gray')
plt.plot(xs[0], xs[1], 'o', color='black')
plt.xlabel(r'$x_0$')
plt.ylabel(r'$x_1$')
plt.arrow(xs[0], xs[1], Q[0,0]/Λ[0,0]**0.5, Q[1,0]/Λ[0,0]**0.5, head_width=0.1, length_includes_head=True, color='blue')    # eigenvector Q[:,0] scaled by 1/sqrt(λ0)
plt.arrow(xs[0], xs[1], Q[0,1]/Λ[1,1]**0.5, Q[1,1]/Λ[1,1]**0.5, head_width=0.1, length_includes_head=True, color='orange'); # eigenvector Q[:,1] scaled by 1/sqrt(λ1)
plt.gca().set_aspect('equal', adjustable='box')

plt.subplot(1,2,2)
plt.contour(CX,CY,Z, levels);
plt.plot([0,0],[np.min(CX),np.max(CX)], color='gray', linestyle=':')
plt.plot([np.min(CX),np.max(CX)],[0,0], color='gray', linestyle=':')
cs = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), xs)
plt.plot(cs[0], cs[1], 'o', color='black');
plt.xlim([np.min(CX), np.max(CX)])
plt.ylim([np.min(CX), np.max(CX)])
plt.xlabel(r'$\hat{x}_0=(\Lambda^{1/2}Q^Tx)_0$')
plt.ylabel(r'$\hat{x}_1=(\Lambda^{1/2}Q^Tx)_1$');
plt.gca().set_aspect('equal', adjustable='box')
v = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), xs)
w = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), Q[:,0]/Λ[0,0]**0.5)
plt.arrow(v[0], v[1], w[0], w[1], head_width=0.1, length_includes_head=True, color='blue')
v = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), xs)
w = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), Q[:,1]/Λ[1,1]**0.5)
plt.arrow(v[0], v[1], w[0], w[1], head_width=0.1, length_includes_head=True, color='orange');

plt.tight_layout()

Plot the Gradient Descent solution in the original and transformed spaces

fig = plt.figure(figsize=(10,5))

plt.subplot(1,2,1)
levels = np.linspace(np.min(Z), np.max(Z), 50)
plt.contour(X,Y,Z, levels);
plt.plot([0,0], [-3,3],':', color='gray')
plt.plot([-3,3],[0,0], ':', color='gray')
plt.plot(xs[0], xs[1], 'o', color='black')
plt.plot(xstep[:,0], xstep[:,1], 'k-');
plt.xlabel(r'$x_0$')
plt.ylabel(r'$x_1$')
plt.gca().set_aspect('equal', adjustable='box')

plt.subplot(1,2,2)
plt.contour(CX,CY,Z, levels);
plt.plot([0,0],[np.min(CX),np.max(CX)], color='gray', linestyle=':')
plt.plot([np.min(CX),np.max(CX)],[0,0], color='gray', linestyle=':')
cs = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), xs)
plt.plot(cs[0], cs[1], 'o', color='black');
plt.xlim([np.min(CX), np.max(CX)])
plt.ylim([np.min(CX), np.max(CX)])
plt.xlabel(r'$\hat{x}_0=(\Lambda^{1/2}Q^Tx)_0$')
plt.ylabel(r'$\hat{x}_1=(\Lambda^{1/2}Q^Tx)_1$');
plt.gca().set_aspect('equal', adjustable='box')

cstep = xstep.copy()
for i in range(len(xstep[:,0])):
    cstep[i,:] = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), xstep[i,:])   # iterates in the transformed coordinates
plt.plot(cstep[:,0], cstep[:,1], 'k-');

plt.tight_layout()

Conjugate directions

After the first steepest-descent step, a single second step taken along a direction that is A-conjugate to the first (orthogonal in the transformed coordinates) lands exactly on $x^*$. The figure below replaces the remaining gradient-descent path with that single conjugate step; a numerical check of the A-conjugacy follows the plotting code.

fig = plt.figure(figsize=(10,5))

plt.subplot(1,2,1)
levels = np.linspace(np.min(Z), np.max(Z), 50)
plt.contour(X,Y,Z, levels);
plt.plot([0,0], [-3,3],':', color='gray')
plt.plot([-3,3],[0,0], ':', color='gray')
plt.plot(xs[0], xs[1], 'o', color='black')
plt.xlabel(r'$x_0$')
plt.ylabel(r'$x_1$')
plt.gca().set_aspect('equal', adjustable='box')
xstep = xstep[0:3,:]     # keep the starting point and the first gradient-descent step
xstep[2,:] = xs          # then jump straight to the solution along the conjugate direction

plt.plot(xstep[:,0], xstep[:,1], 'k-');

plt.subplot(1,2,2)
plt.contour(CX,CY,Z, levels);
plt.plot([0,0],[np.min(CX),np.max(CX)], color='gray', linestyle=':')
plt.plot([np.min(CX),np.max(CX)],[0,0], color='gray', linestyle=':')
cs = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), xs)
plt.plot(cs[0], cs[1], 'o', color='black');
plt.xlim([np.min(CX), np.max(CX)])
plt.ylim([np.min(CX), np.max(CX)])
plt.xlabel(r'$\hat{x}_0=(\Lambda^{1/2}Q^Tx)_0$')
plt.ylabel(r'$\hat{x}_1=(\Lambda^{1/2}Q^Tx)_1$');
plt.gca().set_aspect('equal', adjustable='box')

cstep = xstep.copy()
for i in range(len(xstep[:,0])):
    cstep[i,:] = np.dot(np.dot(np.sqrt(Λ),np.transpose(Q)), xstep[i,:])
plt.plot(cstep[:,0], cstep[:,1], 'k-');
#print(np.dot(cstep[2,:]-cstep[1,:], cstep[1,:]-cstep[0,:]))

plt.tight_layout()
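
As a quick check (using A, xs, and the truncated xstep from the cell above; d1 and d2 are illustrative names introduced here), the two legs of this path are A-conjugate even though they are not orthogonal in the original coordinates:

d1 = xstep[1,:] - xstep[0,:]        # first (steepest descent) step
d2 = xstep[2,:] - xstep[1,:]        # second step, ending exactly at x*
print(np.dot(d2, np.dot(A, d1)))    # ~0: the steps are A-conjugate
print(np.dot(d2, d1))               # nonzero: not orthogonal in the original coordinates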

Mathematical details

Point 1

Algorithmic adjustments

Show that $r_i^Tr_j=0$ for $i\ne j$

That is, the residuals are mutually orthogonal. This is shown as follows.

This holds for arbitrary $k$, so we can set $k=j$. Inserting the result into Eq. (3) then gives:

$$\delta_j = \frac{d_j^T(r_j+\sum_{i=1}^{j-1}\alpha_iAd_i)}{d_j^TAd_j} = \frac{d_j^Tr_j}{d_j^TAd_j}\tag{4},$$

by A-orthogonality of the $d$’s.

Expressions for $\alpha$ and $\beta$
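
The derivations are not repeated here; the standard expressions, which the code sketch in the next section uses, are

$$\alpha_k = \frac{r_k^Tr_k}{d_k^TAd_k}, \qquad \beta_{k+1} = \frac{r_{k+1}^Tr_{k+1}}{r_k^Tr_k}, \qquad d_{k+1} = r_{k+1} + \beta_{k+1}d_k,$$

with $d_0 = r_0$ and the residual updated as $r_{k+1} = r_k - \alpha_kAd_k$.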

Conjugate Gradient Algorithm
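
A minimal implementation sketch, written here in the same style as the GD function above and using the expressions for $\alpha$ and $\beta$ from the previous section (the function name CG and its arguments simply mirror GD):

def CG(A, b, x0, tol=1E-10, maxit=1000):
    """Conjugate gradient sketch for Ax = b with A symmetric positive definite."""
    xstep = np.append(np.empty(0), x0)               # flattened record of the iterates
    x = x0.copy()
    r = b - np.dot(A, x)                             # initial residual
    d = r.copy()                                     # first direction: steepest descent
    for i in range(maxit):
        α = np.dot(r, r)/np.dot(d, np.dot(A, d))     # exact line search step length
        x = x + α*d
        xstep = np.append(xstep, x)
        rnew = r - α*np.dot(A, d)                    # updated residual
        if np.linalg.norm(rnew) <= tol:
            break
        β = np.dot(rnew, rnew)/np.dot(r, r)          # conjugation coefficient
        d = rnew + β*d                               # new A-conjugate search direction
        r = rnew
    return x, np.reshape(xstep, (-1, len(b)))

#--------------------------------

xcg, cgstep = CG(A, b, np.array([2.5, -2.0]))
print(cgstep)          # reaches x* in two steps for this 2x2 system
print(xs)              # direct solution for comparison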