In [4]:
import numpy as np
from matplotlib import pyplot as plt

FONTSIZE=18
plt.rcParams['figure.figsize']=(15,8)
plt.rcParams['font.size']=FONTSIZE


# In this notebook I'll try to build a Metropolis-Hastings sampler and explain why MCMC in general is useful¶

Some things we'll neeed to do

• Produce some data we want to model
• Produce a model for that data
• Understand $\chi^2$
• Fit the data with gradient descent
• Understand Bayes' rule
• Use MCMC to calculate point estimates and uncertainties for model parameters

# Produce data we want to model¶

Here I want some data where our model will hit a local minimum. We'll make some quartic data and intentionally give the model a bad starting point. For this contrived example, gradient descent would not find the true solution, but MCMC will be able to.

Our model is

$$y=\beta_0 + \beta_1x + \beta_2x^2 + \beta_3x^3 + \beta_4x^4$$
In [257]:
np.random.seed(4)
sigma=1.5
beta_true=[60,-1,-6,1,0.5]
x=np.arange(-5,4,step=0.1)
y_true=beta_true[0]+beta_true[1]*x+beta_true[2]*x**2+beta_true[3]*x**3+beta_true[4]*x**4
y_obs=[]
err=[]
for i in range(len(x)):
uncertainty=np.random.normal(loc=0,scale=sigma*np.sqrt(abs(y_true[i])),size=1)
y_obs.append(y_true[i]+uncertainty[0])
err.append(sigma*np.sqrt(abs(y_true[i])))

plt.errorbar(x,y_obs,err,fmt='o',label='Observed Data')
plt.plot(x,y_true,label='True Underlying Data')
plt.legend(loc='best')

Out[257]:
<matplotlib.legend.Legend at 0x7fe8609a4e50>

# Produce a model for the data¶

We're going to try to model this data with a quartic, but we're intentionally going to pick a bad starting point so our gradient descent will get stuck in a local minimum

In [260]:
beta=[25,-14,1.5,0.1,0.4]
y_guess=beta[0]+beta[1]*x+beta[2]*x**2+beta[3]*x**3+beta[4]*x**4

plt.axis([x.min(),x.max(),0,200])
plt.errorbar(x,y_obs,err,fmt='o',label='Observed Data')
plt.plot(x,y_guess,label='Starting Point Model Fit')
plt.legend(loc='best')

Out[260]:
<matplotlib.legend.Legend at 0x7fe86088f250>

# Fit the data with gradient descent¶

In [328]:
alpha=0.01 #learning rate
num_iter=100 #number of iterations
num_params=5 #number of fit coefficients
numpoints=len(y_obs) #number of data points

def calculate_cost(beta,X,Y):
'''
X - design matrix (numpoints,num_params)
beta - column vector of fit coefficients (num_params,1)
Y - column vector of observed data (numpoints,1)
'''
m=Y.shape[0]

model=np.dot(X,beta)  #dot product of the design matrix and our coefficient vector

cost=(0.5*m)*np.sum(np.square(model-Y))

return(cost)

m=Y.shape[0]
cost_history=np.zeros(iterations)
beta_history=np.zeros((iterations+1,num_params))
beta_history[0,:]=beta[:,0]

for i in range(iterations):

model=np.dot(X,beta)

#beta_0=beta_0-(1/m)*alpha*sum{(model_i-data_i)*X_0,i}
sums=np.zeros(num_params)
#calculate the sums
for j in range(num_params):
for k in range(numpoints):
sums[j]+=(model[k,0]-Y[k,0])*X[k,j]

#update coefficient parameters
for j in range(num_params):
beta[j,0]=beta[j,0]-(1/m)*learning_rate*sums[j]

beta_history[i+1,:]=beta[:,0]
cost_history[i]=calculate_cost(beta,X,Y)

return(beta,cost_history,beta_history)

beta_start=np.asarray([25,-14,1.5,0.1,0.4]).reshape((5,1))
X=np.zeros((numpoints,num_params))
x_data=np.arange(-5,4,step=0.1)
X[:,0]=1
X[:,1]=x_data
X[:,2]=x_data**2
X[:,3]=x_data**3
X[:,4]=x_data**4
Y=np.asarray(y_obs).reshape((numpoints,1))
print("X shape is "+str(X.shape))
print("Beta shape is "+str(beta_start.shape))
print("Y shape is "+str(Y.shape))

X shape is (90, 5)
Beta shape is (5, 1)
Y shape is (90, 1)

/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:16: RuntimeWarning: overflow encountered in double_scalars
app.launch_new_instance()
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:16: RuntimeWarning: overflow encountered in square
app.launch_new_instance()

In [329]:
print(cost_history.shape)
print(beta_history.shape)

(100,)
(101, 5)

In [332]:
plt.axis([x.min(),x.max(),-200,400])
plt.errorbar(x,y_obs,err,fmt='o',label='Observed Data')
for i in range(3):

beta=beta_history[i,:]
y_guess=beta[0]+beta[1]*x+beta[2]*x**2+beta[3]*x**3+beta[4]*x**4

plt.plot(x,y_guess,label='Model Fit '+str(i))
plt.legend(loc='best')


Out[332]:
<matplotlib.legend.Legend at 0x7fe86012ea50>

# Gradient Descent is not able to handle a problem like this with multiple parameters and a local minimum¶

An explanation of the Metropolis-Hastings MCMC algorithm from https://en.wikipedia.org/wiki/Metropolis%E2%80%93Hastings_algorithm

• choose an arbitrary point in parameter space as your starting point, also choose an arbitrary probability density that suggests a next sample value $x$ given the previous value $y$
• let's let $g(x|y)$ be a Gaussian centered at $y$, points close to $y$ are more likely to be sampled next
• for each iteration $t$:
• generate a candidate $x'$ for the next sample by drawing from the distribution $g(x'|x_t)$
• calculate the acceptance ratio $\alpha=\frac{f(x')}{f(x_t)}$
• accept or reject:
• generate a uniform random number on $u$ on $[0,1]$
• if $u \leq \alpha$, then accept the candidate by setting $x_{t+1}=x'$
• if $u > \alpha$, then reject the candidate by setting $x_{t+1}=x_t$

for our cases, f(x) is given by the likelihood $L(x)$ which is $-\log_{10}(\chi^2(x))$

# Understand $\chi^2$¶

$$\chi^2=\sum \frac{(O_i-E_i)^2}{\sigma_i}$$
In [356]:
def calculate_chi2(beta,X,Y,Yerr):
'''
beta is our column vector of fit coefficients [num_params,1]
X is our design matrix [numpoints,num_params]
Y is our column vector of data values [numpoints,1]
errors is a column vector of error values [numpoints,1]
'''

model=np.dot(X,beta)[:,0]
data=Y[:,0]
errors=Yerr[:,0]

chi2=0
for i in range(len(data)):

chi2+=(data[i]-model[i])**2/(errors[i]**2)

return chi2

In [425]:
#let's make sure low chi2 values correspond to high likelihoods
chi2=np.arange(0.1,100,step=0.1)
likelihood=-np.log(chi2)
plt.plot(chi2,likelihood)
plt.xlabel('Chi2')
plt.ylabel('Likelihood')

Out[425]:
Text(0, 0.5, 'Likelihood')
In [446]:
def MCMC(beta,X,Y,Yerr,iterations,sigma,likelihood_history_old=None,beta_history_old=None):

'''
beta - column vector of fit coefficients (num_params,1)
X - design matrix (numpoints,num_params)
Y - column vector of independent variable (numpoints,1)
Yerr - column vector of errors for independent variable (numpoints,1)
iterations - number of MCMC iterations (int)
sigma - list of gaussian widths for new parameter selection len(sigma)=num_params
likelihood_history_old - old likelihood history from previous MCMC training
beta_history_old - old beta history from previous MCMC training
'''

#if it's not your first training run, the function will append the new history onto the old ones
if likelihood_history_old is not None:
assert(beta_history_old is not None),"Must provide beta_history if you provide likelihood_history"
if beta_history_old is not None:
assert(likelihood_history_old is not None),"Must provide likelihood_history if you provide beta_history"

if likelihood_history_old is not None:
skip_index=len(likelihood_history_old)
likelihood_history=np.zeros(len(likelihood_history_old)+iterations)
likelihood_history[:skip_index]=likelihood_history_old
beta_history=np.zeros((len(likelihood_history_old)+iterations,num_params))
beta_history[:skip_index,:]=beta_history_old

else:
skip_index=0
beta_history=np.zeros((iterations,num_params))
likelihood_history=np.zeros(iterations)

#instead of choosing a random point in parameter space, we're going to use our starting point
for i in range(iterations):

likelihood_old=-np.log(calculate_chi2(beta,X,Y,Yerr))

#generate a new point by drawing from a gaussian for each parameter
new_beta=np.zeros((num_params,1))
for j in range(num_params):
new_beta[j,0]=np.random.normal(loc=beta[j,0],scale=sigma[j],size=1)

#calculate the acceptance ratio
likelihood_new=-np.log(calculate_chi2(new_beta,X,Y,Yerr))
alpha=likelihood_new/likelihood_old
#accept or reject
if alpha <= np.random.random_sample(size=1):
#accept
beta=new_beta
else:
#reject
pass
beta_history[skip_index+i,:]=beta[:,0]
likelihood_history[skip_index+i]=-np.log(calculate_chi2(beta,X,Y,Yerr))

return beta,beta_history,likelihood_history


In [457]:
numpoints=len(y_obs)
num_params=5

beta_start=np.asarray([25,-14,1.5,0.1,0.4]).reshape((5,1))
#beta_start=beta_fit
X=np.zeros((numpoints,num_params))
x_data=np.arange(-5,4,step=0.1)
X[:,0]=1
X[:,1]=x_data
X[:,2]=x_data**2
X[:,3]=x_data**3
X[:,4]=x_data**4
Y=np.asarray(y_obs).reshape((numpoints,1))
Yerr=np.asarray(err).reshape((numpoints,1))

#first iteration of training
beta_fit,beta_history,likelihood_history=MCMC(beta_start,X,Y,Yerr,iterations=100000,sigma=[5.0,2.0,2.0,2.0,2.0])

In [467]:
#all subsequent iterations of training
beta_fit,beta_history,likelihood_history=MCMC(beta_fit,X,Y,Yerr,iterations=1000000,sigma=[5.0,2.0,2.0,2.0,2.0],
likelihood_history_old=likelihood_history,beta_history_old=beta_history)

In [468]:
epochs=np.arange(len(likelihood_history))
plt.plot(epochs,likelihood_history)
plt.xlabel("Epochs")
plt.ylabel('log Likelihood')

Out[468]:
Text(0, 0.5, 'log Likelihood')
In [469]:
plt.plot(epochs,beta_history[:,0])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_0$')

Out[469]:
Text(0, 0.5, '$\\beta_0$')
In [470]:
plt.plot(epochs,beta_history[:,1])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_1$')

Out[470]:
Text(0, 0.5, '$\\beta_1$')
In [471]:
plt.plot(epochs,beta_history[:,2])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_2$')

Out[471]:
Text(0, 0.5, '$\\beta_2$')
In [472]:
plt.plot(epochs,beta_history[:,3])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_3$')

Out[472]:
Text(0, 0.5, '$\\beta_3$')
In [473]:
plt.plot(epochs,beta_history[:,4])
plt.xlabel('Epochs')
plt.ylabel(r'$\beta_4$')

Out[473]:
Text(0, 0.5, '$\\beta_4$')
In [474]:
plt.axis([x.min(),x.max(),0,200])
plt.errorbar(x,y_obs,err,fmt='o',label='Observed Data')
model=np.dot(X,beta_fit)[:,0]
plt.plot(x,model,label='Model Fit')
plt.legend(loc='best')

Out[474]:
<matplotlib.legend.Legend at 0x7fe853247950>

# Calculate Likelihood of True Model¶

In [475]:
beta_true=np.asarray([60,-1,-6,1,0.5]).reshape(num_params,1)
likelihood_true=-np.log(calculate_chi2(beta_true,X,Y,Yerr))
print("True log Likelihood is "+str(likelihood_true)[:5])
print("MCMC log Likelihood is "+str(likelihood_history[-1])[:5])

True log Likelihood is -4.52
MCMC log Likelihood is -4.97

In [476]:
print("True Beta Values are "+str(beta_true[:,0]))
print("MCMC Beta Values are "+str(beta_fit[:,0]))

True Beta Values are [60.  -1.  -6.   1.   0.5]
MCMC Beta Values are [67.89172523 -7.79351899 -9.29335058  1.71090797  0.73415528]

In [ ]: