import numpy as np
from matplotlib import pyplot as plt
FONTSIZE=18
plt.rcParams['figure.figsize']=(15,8)
plt.rcParams['font.size']=FONTSIZE
Some things we'll need to do
Here I want some data for which our model will get stuck in a local minimum. We'll make some quartic data and intentionally give the model a bad starting point. For this contrived example, gradient descent will not find the true solution, but MCMC will.
Our model is
$$y=\beta_0 + \beta_1x + \beta_2x^2 + \beta_3x^3 + \beta_4x^4$$
np.random.seed(4)
sigma=1.5
beta_true=[60,-1,-6,1,0.5]
x=np.arange(-5,4,step=0.1)
y_true=beta_true[0]+beta_true[1]*x+beta_true[2]*x**2+beta_true[3]*x**3+beta_true[4]*x**4
y_obs=[]
err=[]
for i in range(len(x)):
    uncertainty=np.random.normal(loc=0,scale=sigma*np.sqrt(abs(y_true[i])),size=1)
    y_obs.append(y_true[i]+uncertainty[0])
    err.append(sigma*np.sqrt(abs(y_true[i])))
plt.errorbar(x,y_obs,err,fmt='o',label='Observed Data')
plt.plot(x,y_true,label='True Underlying Data')
plt.legend(loc='best')
We're going to model this data with a quartic, but we'll intentionally pick a bad starting point so that gradient descent gets stuck in a local minimum.
beta=[25,-14,1.5,0.1,0.4]
y_guess=beta[0]+beta[1]*x+beta[2]*x**2+beta[3]*x**3+beta[4]*x**4
plt.axis([x.min(),x.max(),0,200])
plt.errorbar(x,y_obs,err,fmt='o',label='Observed Data')
plt.plot(x,y_guess,label='Starting Point Model Fit')
plt.legend(loc='best')
Here's a link describing the gradient descent algorithm I'm borrowing from: https://towardsdatascience.com/gradient-descent-in-python-a0d07285742f
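To write the algorithm in our notation: with design matrix $X$, coefficient vector $\beta$, learning rate $\alpha$, and $m$ data points, the cost being minimized and the per-coefficient update rule are

$$J(\beta)=\frac{1}{2m}\sum_{i=1}^{m}\left(X_i\beta - y_i\right)^2$$

$$\beta_j \leftarrow \beta_j - \frac{\alpha}{m}\sum_{i=1}^{m}\left(X_i\beta - y_i\right)X_{ij}$$

where $X_i$ is the $i$-th row of the design matrix. This is what the loops in gradient_descent below compute.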
alpha=0.01 #learning rate
num_iter=100 #number of iterations
num_params=5 #number of fit coefficients
numpoints=len(y_obs) #number of data points
def calculate_cost(beta,X,Y):
    '''
    beta - column vector of fit coefficients (num_params,1)
    X - design matrix (numpoints,num_params)
    Y - column vector of observed data (numpoints,1)
    '''
    m=Y.shape[0]
    model=np.dot(X,beta) #dot product of the design matrix and our coefficient vector
    cost=(0.5/m)*np.sum(np.square(model-Y)) #J = (1/2m) * sum((X*beta - y)^2)
    return(cost)
def gradient_descent(beta,X,Y,learning_rate,iterations):
    m=Y.shape[0]
    cost_history=np.zeros(iterations)
    beta_history=np.zeros((iterations+1,num_params))
    beta_history[0,:]=beta[:,0]
    for i in range(iterations):
        model=np.dot(X,beta)
        #update rule: beta_j=beta_j-(alpha/m)*sum_i{(model_i-data_i)*X_ij}
        sums=np.zeros(num_params)
        #calculate the gradient sums
        for j in range(num_params):
            for k in range(numpoints):
                sums[j]+=(model[k,0]-Y[k,0])*X[k,j]
        #update coefficient parameters
        for j in range(num_params):
            beta[j,0]=beta[j,0]-(1/m)*learning_rate*sums[j]
        beta_history[i+1,:]=beta[:,0]
        cost_history[i]=calculate_cost(beta,X,Y)
    return(beta,cost_history,beta_history)
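As an aside, the two nested loops that build sums are just the matrix product $X^T(X\beta - y)$, so the whole update can be vectorized. A minimal equivalent sketch (an alternative form, not what the rest of this notebook uses):

def gradient_descent_vectorized(beta,X,Y,learning_rate,iterations):
    #same update as above, but the gradient is one matrix product per iteration
    m=Y.shape[0]
    for i in range(iterations):
        gradient=np.dot(X.T,np.dot(X,beta)-Y) #shape (num_params,1), equal to the per-parameter sums
        beta=beta-(learning_rate/m)*gradient
    return beta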
beta_start=np.asarray([25,-14,1.5,0.1,0.4]).reshape((5,1))
X=np.zeros((numpoints,num_params))
x_data=np.arange(-5,4,step=0.1)
X[:,0]=1
X[:,1]=x_data
X[:,2]=x_data**2
X[:,3]=x_data**3
X[:,4]=x_data**4
Y=np.asarray(y_obs).reshape((numpoints,1))
print("X shape is "+str(X.shape))
print("Beta shape is "+str(beta_start.shape))
print("Y shape is "+str(Y.shape))
beta_model,cost_history,beta_history=gradient_descent(beta_start,X,Y,learning_rate=alpha,iterations=num_iter)
print(cost_history.shape)
print(beta_history.shape)
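Before looking at the fits, a quick way to see whether gradient descent is still improving is to plot the cost history it returned (just a convergence check on the run above):

plt.plot(np.arange(num_iter),cost_history)
plt.xlabel('Iteration')
plt.ylabel('Cost')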
plt.axis([x.min(),x.max(),-200,400])
plt.errorbar(x,y_obs,err,fmt='o',label='Observed Data')
for i in range(3):
    beta=beta_history[i,:]
    y_guess=beta[0]+beta[1]*x+beta[2]*x**2+beta[3]*x**3+beta[4]*x**4
    plt.plot(x,y_guess,label='Model Fit '+str(i))
plt.legend(loc='best')
An explanation of the Metropolis-Hastings MCMC algorithm can be found at https://en.wikipedia.org/wiki/Metropolis%E2%80%93Hastings_algorithm
For our case, we use the log-likelihood $\mathcal{L}(x)=-\ln\left(\chi^2(x)\right)$ in place of $\log f(x)$, so lower $\chi^2$ means higher likelihood.
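In the standard Metropolis rule (our Gaussian proposals are symmetric, so the Hastings correction drops out), a candidate point $x'$ is accepted with probability

$$A(x',x)=\min\left(1,\frac{f(x')}{f(x)}\right)$$

Working with the log-likelihood, this is equivalent to drawing $u\sim\mathrm{Uniform}(0,1)$ and accepting whenever $\ln u \le \mathcal{L}(x')-\mathcal{L}(x)$; that is the form used in the MCMC function below.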
def calculate_chi2(beta,X,Y,Yerr):
    '''
    beta - column vector of fit coefficients (num_params,1)
    X - design matrix (numpoints,num_params)
    Y - column vector of data values (numpoints,1)
    Yerr - column vector of error values (numpoints,1)
    '''
    model=np.dot(X,beta)[:,0]
    data=Y[:,0]
    errors=Yerr[:,0]
    chi2=0
    for i in range(len(data)):
        chi2+=(data[i]-model[i])**2/(errors[i]**2)
    return chi2
#let's make sure low chi2 values correspond to high log-likelihoods
chi2=np.arange(0.1,100,step=0.1)
likelihood=-np.log(chi2)
plt.plot(chi2,likelihood)
plt.xlabel('Chi2')
plt.ylabel('log Likelihood')
def MCMC(beta,X,Y,Yerr,iterations,sigma,likelihood_history_old=None,beta_history_old=None):
    '''
    beta - column vector of fit coefficients (num_params,1)
    X - design matrix (numpoints,num_params)
    Y - column vector of the dependent variable (numpoints,1)
    Yerr - column vector of errors on the dependent variable (numpoints,1)
    iterations - number of MCMC iterations (int)
    sigma - list of gaussian proposal widths, len(sigma)=num_params
    likelihood_history_old - old likelihood history from a previous MCMC run
    beta_history_old - old beta history from a previous MCMC run
    '''
    #if it's not your first run, the function appends the new history onto the old one
    if likelihood_history_old is not None:
        assert(beta_history_old is not None),"Must provide beta_history if you provide likelihood_history"
    if beta_history_old is not None:
        assert(likelihood_history_old is not None),"Must provide likelihood_history if you provide beta_history"
    if likelihood_history_old is not None:
        skip_index=len(likelihood_history_old)
        likelihood_history=np.zeros(len(likelihood_history_old)+iterations)
        likelihood_history[:skip_index]=likelihood_history_old
        beta_history=np.zeros((len(likelihood_history_old)+iterations,num_params))
        beta_history[:skip_index,:]=beta_history_old
    else:
        skip_index=0
        beta_history=np.zeros((iterations,num_params))
        likelihood_history=np.zeros(iterations)
    #instead of choosing a random point in parameter space, we start from the supplied point
    for i in range(iterations):
        likelihood_old=-np.log(calculate_chi2(beta,X,Y,Yerr))
        #generate a candidate point by drawing from a gaussian for each parameter
        new_beta=np.zeros((num_params,1))
        for j in range(num_params):
            new_beta[j,0]=np.random.normal(loc=beta[j,0],scale=sigma[j])
        likelihood_new=-np.log(calculate_chi2(new_beta,X,Y,Yerr))
        #Metropolis acceptance: accept if ln(u) <= L_new - L_old, u ~ Uniform(0,1)
        log_alpha=likelihood_new-likelihood_old
        if np.log(np.random.random_sample()) <= log_alpha:
            #accept the candidate
            beta=new_beta
        #otherwise reject and keep the current point
        beta_history[skip_index+i,:]=beta[:,0]
        likelihood_history[skip_index+i]=-np.log(calculate_chi2(beta,X,Y,Yerr))
    return beta,beta_history,likelihood_history
numpoints=len(y_obs)
num_params=5
beta_start=np.asarray([25,-14,1.5,0.1,0.4]).reshape((5,1))
#beta_start=beta_fit
X=np.zeros((numpoints,num_params))
x_data=np.arange(-5,4,step=0.1)
X[:,0]=1
X[:,1]=x_data
X[:,2]=x_data**2
X[:,3]=x_data**3
X[:,4]=x_data**4
Y=np.asarray(y_obs).reshape((numpoints,1))
Yerr=np.asarray(err).reshape((numpoints,1))
#first iteration of training
beta_fit,beta_history,likelihood_history=MCMC(beta_start,X,Y,Yerr,iterations=100000,sigma=[5.0,2.0,2.0,2.0,2.0])
#all subsequent iterations of training
beta_fit,beta_history,likelihood_history=MCMC(beta_fit,X,Y,Yerr,iterations=1000000,sigma=[5.0,2.0,2.0,2.0,2.0],
                                              likelihood_history_old=likelihood_history,beta_history_old=beta_history)
epochs=np.arange(len(likelihood_history))
plt.plot(epochs,likelihood_history)
plt.xlabel("Epochs")
plt.ylabel('log Likelihood')
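One quick sanity check on the chain (a minimal sketch, not part of the original walkthrough) is the acceptance fraction, i.e. how often a proposed step was actually taken. If it is very low, the proposal widths in sigma are probably too large; if it is near one, they are probably too small.

#count the steps where the stored beta changed from the previous step
accepted=np.any(np.diff(beta_history,axis=0)!=0,axis=1)
print("Acceptance fraction: "+str(np.mean(accepted)))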
plt.plot(epochs,beta_history[:,0])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_0$')
plt.plot(epochs,beta_history[:,1])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_1$')
plt.plot(epochs,beta_history[:,2])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_2$')
plt.plot(epochs,beta_history[:,3])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_3$')
plt.plot(epochs,beta_history[:,4])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_4$')
plt.axis([x.min(),x.max(),0,200])
plt.errorbar(x,y_obs,err,fmt='o',label='Observed Data')
model=np.dot(X,beta_fit)[:,0]
plt.plot(x,model,label='Model Fit')
plt.legend(loc='best')
beta_true=np.asarray([60,-1,-6,1,0.5]).reshape(num_params,1)
likelihood_true=-np.log(calculate_chi2(beta_true,X,Y,Yerr))
print("True log Likelihood is "+str(likelihood_true)[:5])
print("MCMC log Likelihood is "+str(likelihood_history[-1])[:5])
print("True Beta Values are "+str(beta_true[:,0]))
print("MCMC Beta Values are "+str(beta_fit[:,0]))