Optimization#
In this chapter, we will present variants of the Gradient Descent optimization strategy and show how they can be used to optimize neural network parameters.
Let us start with the basic Gradient Descent algorithm and its limitations.
Algorithm 1 (Gradient Descent)
Input: A dataset $\mathcal{D} = \{(x_i, y_i)\}_{i=1}^n$

1. Initialize model parameters $\theta$
2. for $e = 1, \dots, E$ epochs:
   1. for $(x_i, y_i) \in \mathcal{D}$:
      1. Compute prediction $\hat{y}_i = m_\theta(x_i)$
      2. Compute gradient $\nabla_\theta \mathcal{L}_i = \nabla_\theta \mathcal{L}(\hat{y}_i, y_i)$
   2. Compute overall gradient $\nabla_\theta \mathcal{L} = \frac{1}{n} \sum_{i=1}^{n} \nabla_\theta \mathcal{L}_i$
   3. Update parameters $\theta$ based on $\nabla_\theta \mathcal{L}$
The typical update rule for the parameters $\theta$ at step $t$ is

$$\theta^{(t+1)} \leftarrow \theta^{(t)} - \rho \, \nabla_\theta \mathcal{L}$$

where $\rho$ is an important hyper-parameter of the method, called the learning rate.
As one can see in the previous algorithm, when performing gradient descent, model parameters are updated once per epoch, which means a full pass over the whole dataset is required before the update can occur. When dealing with large datasets, this is a strong limitation, which motivates the use of stochastic variants.
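As a concrete illustration of this update rule, here is a minimal NumPy sketch of full-batch gradient descent on a toy least-squares problem (the data, loss and variable names are mine, chosen only to keep the example self-contained):

import numpy as np

# Toy least-squares problem: find theta such that X @ theta is close to y
np.random.seed(0)
X = np.random.randn(100, 2)
y = X @ np.array([2., -1.])

def grad_loss(theta, X, y):
    # Gradient of the mean squared error (1 / n) * ||X @ theta - y||^2
    return 2. * X.T @ (X @ theta - y) / len(y)

theta = np.zeros(2)
rho = 0.1  # learning rate
for epoch in range(100):
    # A single parameter update per pass over the full dataset
    theta = theta - rho * grad_loss(theta, X, y)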
Stochastic Gradient Descent (SGD)#
The idea behind the Stochastic Gradient Descent algorithm is to get cheap estimates for the quantity

$$\nabla_\theta \mathcal{L}(\mathcal{D} ; m_\theta) = \frac{1}{n} \sum_{(x_i, y_i) \in \mathcal{D}} \nabla_\theta \mathcal{L}(x_i, y_i ; m_\theta)$$

where $\mathcal{D}$ denotes the whole training set. To do so, one draws small random subsets of the data, called minibatches, and the minibatch-level quantity

$$\frac{1}{b} \sum_{(x_i, y_i) \in \mathcal{B}} \nabla_\theta \mathcal{L}(x_i, y_i ; m_\theta)$$

is used as an estimator for the full gradient above, where $\mathcal{B}$ is a minibatch of size $b$.
Algorithm 2 (Stochastic Gradient Descent)

Input: A dataset $\mathcal{D} = \{(x_i, y_i)\}_{i=1}^n$

1. Initialize model parameters $\theta$
2. for $e = 1, \dots, E$ epochs:
   1. for $t = 1, \dots, n_\text{minibatches}$:
      1. Draw minibatch $\mathcal{B}$ as a random sample of size $b$ from $\mathcal{D}$
      2. for $(x_i, y_i) \in \mathcal{B}$:
         1. Compute prediction $\hat{y}_i = m_\theta(x_i)$
         2. Compute gradient $\nabla_\theta \mathcal{L}_i = \nabla_\theta \mathcal{L}(\hat{y}_i, y_i)$
      3. Compute minibatch-level gradient $\nabla_\theta \mathcal{L}_\mathcal{B} = \frac{1}{b} \sum_{(x_i, y_i) \in \mathcal{B}} \nabla_\theta \mathcal{L}_i$
      4. Update parameters $\theta$ based on $\nabla_\theta \mathcal{L}_\mathcal{B}$
As a consequence, when using SGD, parameter updates are more frequent, but they are “noisy” since they rely on a minibatch estimate of the gradient instead of the true gradient, as illustrated below:
import numpy as np
%config InlineBackend.figure_format = 'svg'
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation
import scipy.optimize as optim
from notebook_utils import prepare_notebook_graphics
prepare_notebook_graphics()
def grad(X, y, alpha, lambd):
    # Gradient of the L2-regularized logistic loss with respect to alpha
    p = np.exp(-y * X.dot(alpha))
    d = - X.T.dot(p * y / (1 + p)) + lambd * alpha
    return d

def norm(x):
    return np.sqrt(np.sum(x ** 2))

def cost(X, y, alpha, lambd):
    # L2-regularized logistic loss
    p = np.exp(-y * X.dot(alpha))
    return np.sum(np.log(1 + p)) + .5 * lambd * norm(alpha) ** 2
# TODO: use a 1/n factor so that SGD does not behave erratically
def optim_gd(X, y, alpha_init, n_epochs, lambd, rho):
    # Full-batch gradient descent: a single parameter update per epoch
    alphas = [alpha_init]
    for _ in range(n_epochs):
        d = - grad(X, y, alphas[-1], lambd)
        alphas.append(alphas[-1] + rho * d)
    return np.concatenate(alphas, axis=0).reshape((-1, alpha_init.shape[0]))
def optim_sgd(X, y, alpha_init, n_epochs, lambd, rho, minibatch_size):
    # Stochastic gradient descent: one parameter update per minibatch
    alphas = [alpha_init]
    for i in range(n_epochs):
        for j in range(X.shape[0] // minibatch_size):
            scaled_lambda = lambd / (X.shape[0] // minibatch_size)
            indices_minibatch = np.random.randint(X.shape[0], size=minibatch_size)
            X_minibatch = X[indices_minibatch]
            y_minibatch = y[indices_minibatch]
            d = - grad(X_minibatch, y_minibatch, alphas[-1], scaled_lambda)
            alphas.append(alphas[-1] + rho * d)
    return np.concatenate(alphas, axis=0).reshape((-1, alpha_init.shape[0]))
def stretch_to_range(lim, sz_range):
    middle = (lim[0] + lim[1]) / 2
    return [middle - sz_range / 2, middle + sz_range / 2]

def get_lims(*alphas_list):
    xlims = [
        min([alphas[:, 0].min() for alphas in alphas_list]) - 1,
        max([alphas[:, 0].max() for alphas in alphas_list]) + 1
    ]
    ylims = [
        min([alphas[:, 1].min() for alphas in alphas_list]) - 1,
        max([alphas[:, 1].max() for alphas in alphas_list]) + 1
    ]
    if xlims[1] - xlims[0] > ylims[1] - ylims[0]:
        ylims = stretch_to_range(ylims, xlims[1] - xlims[0])
    else:
        xlims = stretch_to_range(xlims, ylims[1] - ylims[0])
    return xlims, ylims
def gen_anim(X, y, alphas_gd, alphas_sgd, alpha_star, lambd, xlims, ylims, n_steps_per_epoch, gen_video=True):
    global lines_alphas

    n = 40
    nn = n * n
    xv, yv = np.meshgrid(np.linspace(xlims[0], xlims[1], n),
                         np.linspace(ylims[0], ylims[1], n))
    xvisu = np.concatenate((xv.ravel()[:, None], yv.ravel()[:, None]), axis=1)

    pv = np.zeros(nn)
    for i in range(nn):
        pv[i] = cost(X, y, xvisu[i], lambd)
    P = pv.reshape((n, n))

    fig = plt.figure(figsize=(13, 6))
    axes = [plt.subplot(1, 2, i + 1) for i in range(2)]
    lines_alphas = []
    texts = []
    for ax, alphas, title in zip(axes,
                                 [alphas_gd, alphas_sgd],
                                 ["Gradient Descent", "Stochastic Gradient Descent"]):
        ax.contour(xv, yv, P, alpha=0.5)
        ax.plot(alphas[0, 0], alphas[0, 1], 'ko', fillstyle='none')
        line_alphas, = ax.plot(alphas[:1, 0], alphas[:1, 1], marker="x")
        lines_alphas.append(line_alphas)
        ax.plot(alpha_star[0:1], alpha_star[1:2], '+r')
        ax.set_xlabel("$w_0$")
        ax.set_ylabel("$w_1$")
        ax.set_xlim(xlims)
        ax.set_ylim(ylims)
        ax.set_title(title)
        text_epoch = ax.text(0.7 * xlims[1], 0.8 * ylims[1], s="Epoch 0")
        texts.append(text_epoch)

    def animate(i):
        global lines_alphas
        for line_alphas, text_epoch, alphas in zip(lines_alphas, texts, [alphas_gd, alphas_sgd]):
            line_alphas.set_xdata(alphas[:i, 0])
            line_alphas.set_ydata(alphas[:i, 1])
            text_epoch.set_text(f"Epoch {i // n_steps_per_epoch}")
        return lines_alphas + texts

    if gen_video:
        ani = animation.FuncAnimation(fig, animate, interval=500, blit=False, save_count=len(alphas_gd))
        return HTML(ani.to_jshtml())
    else:
        animate(len(alphas_gd))
        return fig
# Data
np.random.seed(0)
X = np.random.rand(20, 2) * 3 - 1.5
y = (X[:, 0] > 0.).astype(int)
y[y == 0] = -1
# Optim
lambd = .1
rho = 2e-1
alpha_init = np.array([1., -3.])
n_epochs = 10
minibatch_size = 4
res_optim = optim.minimize(fun=lambda alpha: cost(X, y, alpha, lambd),
x0=alpha_init,
jac=lambda alpha: grad(X, y, alpha, lambd))
alpha_star = res_optim["x"]
alphas_gd = optim_gd(X, y, alpha_init, n_epochs, lambd, rho)
alphas_sgd = optim_sgd(X, y, alpha_init, n_epochs, lambd, rho, minibatch_size)
# Visualization
xlims, ylims = get_lims(alphas_gd, alphas_sgd, np.array([alpha_star]))
is_html_output = True
viz = gen_anim(X, y,
np.repeat(alphas_gd, 20 // minibatch_size, axis=0), alphas_sgd,
alpha_star, lambd, xlims, ylims,
n_steps_per_epoch=20 // minibatch_size, gen_video=is_html_output)
plt.close()
viz
Apart from implying more frequent parameter updates, SGD has an extra benefit in terms of optimization, which is key for neural networks. Indeed, as one can see below, contrary to what we had in the Perceptron case, the MSE loss (and the same applies for the logistic loss) is no longer convex in the model parameters as soon as the model has at least one hidden layer:
def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def model_forward_loss(weights, biases, X, y):
    # Regularized MSE loss of a network with sigmoid activations
    outputs = X
    for w, b in zip(weights, biases):
        outputs = sigmoid(outputs @ w + b)
    loss = np.mean((outputs - y) ** 2)
    loss += .0001 * np.sum([(w ** 2).sum() for w in weights])
    return loss

np.random.seed(0)

w0 = np.linspace(-5, 5, 100)

X = np.random.randn(150, 6)
# Column vector so that it broadcasts correctly against the (150, 1) network output
y = np.array([0] * 75 + [1] * 75).reshape((-1, 1))
weights = [
    np.random.randn(6, 20),
    np.random.randn(20, 1)
]
biases = [
    np.random.randn(1, 20),
    np.random.randn(1, 1)
]

losses = []
for wi in w0:
    weights[0][3, 9] = wi
    losses.append(model_forward_loss(weights, biases, X, y))

plt.plot(w0, losses)
plt.grid('on')
plt.xlabel('$w$')
plt.ylabel(r'$\mathcal{L}$');
Gradient Descent is known to suffer from local optima, and such loss landscapes are a serious problem for it. Stochastic Gradient Descent, on the other hand, is likely to benefit from its noisy gradient estimates to escape local minima.
A note on Adam#
Adam [Kingma and Ba, 2015] is a variant of the Stochastic Gradient Descent method. It differs in the definition of the steps to be performed at each parameter update.
First, it uses what is called momentum, which basically consists in relying on past gradient updates to smooth out the trajectory in parameter space during optimization. An interactive illustration of momentum can be found in [Goh, 2017].
The resulting plug-in replacement for the gradient is:

$$m^{(t)} = \frac{\beta_1 m^{(t-1)} + (1 - \beta_1) \nabla_\theta \mathcal{L}^{(t)}}{1 - \beta_1^t}$$

When $\beta_1 = 0$, this quantity is exactly the current gradient $\nabla_\theta \mathcal{L}^{(t)}$ and one recovers vanilla SGD; larger values of $\beta_1$ smooth the trajectory by giving more weight to past gradients.
Another important difference between SGD and the Adam variant consists in using an adaptive learning rate.
In other words, instead of using the same learning rate $\rho$ for all model parameters, the learning rate used for a given parameter $\theta_i$ at step $t$ is:

$$\hat{\rho}^{(t)}(\theta_i) = \frac{\rho}{\sqrt{s^{(t)}(\theta_i)} + \epsilon}$$

where $\epsilon$ is a small constant that prevents divisions by zero and

$$s^{(t)}(\theta_i) = \frac{\beta_2 s^{(t-1)}(\theta_i) + (1 - \beta_2) \left( \nabla_{\theta_i} \mathcal{L}^{(t)} \right)^2}{1 - \beta_2^t}$$

Here also, $s^{(t)}(\theta_i)$ accumulates past (squared) gradient information, with $\beta_2$ controlling how fast this information is forgotten.
Overall, the Adam update rule is:

$$\theta_i^{(t+1)} \leftarrow \theta_i^{(t)} - \hat{\rho}^{(t)}(\theta_i) \, m_i^{(t)}$$

where $m_i^{(t)}$ denotes the $i$-th coordinate of $m^{(t)}$.
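To make these update rules concrete, here is a minimal NumPy sketch of Adam on a toy quadratic loss (the data, variable names and hyper-parameter values are mine and only serve to illustrate the formulas above, written here with explicit bias-correction steps):

import numpy as np

# Toy quadratic loss L(theta) = 0.5 * ||theta - target||^2, whose gradient is theta - target
target = np.array([2., -1.])
theta = np.zeros(2)
m = np.zeros_like(theta)   # momentum term m^(t)
s = np.zeros_like(theta)   # squared-gradient term s^(t)
rho, beta1, beta2, eps = 0.1, 0.9, 0.999, 1e-8

for t in range(1, 201):
    grad_t = theta - target                      # gradient of the toy loss
    m = beta1 * m + (1. - beta1) * grad_t        # accumulate past gradients (momentum)
    s = beta2 * s + (1. - beta2) * grad_t ** 2   # accumulate past squared gradients
    m_hat = m / (1. - beta1 ** t)                # bias corrections, as in the formulas above
    s_hat = s / (1. - beta2 ** t)
    theta = theta - rho * m_hat / (np.sqrt(s_hat) + eps)   # adaptive, per-parameter step

In practice, one rarely implements Adam by hand: keras exposes it directly through the "adam" optimizer, as shown later in this chapter.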
The curse of depth#
Let us consider a feed-forward neural network made of several stacked layers, and let us recall that, at a given layer $\ell$, the layer output is computed as

$$a^{(\ell)} = \varphi\left( w^{(\ell)} a^{(\ell - 1)} + b^{(\ell)} \right)$$

where $\varphi$ is the activation function of the layer, $w^{(\ell)}$ and $b^{(\ell)}$ are its weights and biases, and $a^{(\ell - 1)}$ is the output of the previous layer (with $a^{(0)} = x$ the input).
In order to perform (stochastic) gradient descent, gradients of the loss with respect to model parameters need to be computed.
By using the chain rule, these gradients can be expressed, for the weights $w^{(\ell)}$ of layer $\ell$ in an $L$-layer network, as:

$$\frac{\partial \mathcal{L}}{\partial w^{(\ell)}} = \frac{\partial \mathcal{L}}{\partial a^{(L)}} \cdot \frac{\partial a^{(L)}}{\partial a^{(L-1)}} \cdots \frac{\partial a^{(\ell+1)}}{\partial a^{(\ell)}} \cdot \frac{\partial a^{(\ell)}}{\partial w^{(\ell)}}$$
There are important insights to grasp here.
First, one should notice that weights that are further away from the output of the model inherit gradient expressions made of more terms. As a consequence, when some of these terms get smaller and smaller, the gradients of those weights are at higher risk of collapsing to 0. This is called the vanishing gradient effect, and it is a very common phenomenon in deep neural networks (i.e. networks made of many layers).
Second, some terms are repeated in these formulas, and, in general, terms of the form $\frac{\partial a^{(\ell+1)}}{\partial a^{(\ell)}}$ play a central role: they appear in the gradient expressions of all layers located before layer $\ell + 1$, and their magnitude is driven by the derivative of the activation function. With the sigmoid activation, for instance, this derivative is at most $1/4$, so the activation-derivative part of a product of ten such terms cannot exceed $4^{-10} \approx 10^{-6}$.
Let us inspect what the derivatives of standard activation functions look like:
import tensorflow as tf
def tanh(x):
    return 2. / (1. + tf.exp(-2 * x)) - 1.

def sigmoid(x):
    return 1. / (1. + tf.exp(-x))

x = tf.Variable(tf.linspace(-4, 4, 100))

with tf.GradientTape() as tape_grad:
    tan_x = tanh(x)
with tf.GradientTape() as tape_sig:
    sig_x = sigmoid(x)
with tf.GradientTape() as tape_relu:
    relu_x = tf.nn.relu(x)

grad_tanh_x = tape_grad.gradient(tan_x, x)
grad_sig_x = tape_sig.gradient(sig_x, x)
grad_relu_x = tape_relu.gradient(relu_x, x)
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.plot(x.numpy(), grad_tanh_x)
plt.grid('on')
plt.ylim([-.1, 1.1])
plt.title("tanh'(x)")
plt.subplot(1, 3, 2)
plt.plot(x.numpy(), grad_sig_x)
plt.grid('on')
plt.ylim([-.1, 1.1])
plt.title("sigmoid'(x)")
plt.subplot(1, 3, 3)
plt.plot(x.numpy(), grad_relu_x)
plt.grid('on')
plt.ylim([-.1, 1.1])
plt.title("ReLU'(x)")
plt.tight_layout();
One can see that the derivative of ReLU stays equal to 1 on the whole range of positive input values, whereas the derivatives of its competitors quickly decay towards 0. This makes ReLU a very attractive candidate activation function for deep neural networks, since we have just seen that the magnitude of the $\frac{\partial a^{(\ell+1)}}{\partial a^{(\ell)}}$ terms, and hence the risk of vanishing gradients, is directly driven by these derivatives.
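As a rough numerical illustration of this point (a toy setup of mine, not part of the original notebook), one can look at how a product of sigmoid derivatives evaluated at arbitrary pre-activation values shrinks with depth, whereas the ReLU derivative equals 1 for any positive pre-activation and thus does not contribute to this shrinkage:

import numpy as np

# One arbitrary pre-activation value per layer (standard normal draws, an arbitrary choice)
np.random.seed(0)
pre_activations = np.random.randn(20)

def sigmoid(z):
    return 1. / (1. + np.exp(-z))

# The sigmoid derivative is sigmoid(z) * (1 - sigmoid(z)), which is at most 1/4
sigmoid_derivs = sigmoid(pre_activations) * (1. - sigmoid(pre_activations))

for depth in [5, 10, 20]:
    print(f"depth={depth:2d}, product of sigmoid' terms: {np.prod(sigmoid_derivs[:depth]):.1e}")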
Wrapping things up in keras#

In keras, loss and optimizer information are passed at compile time:
import keras_core as keras
from keras.layers import Dense, InputLayer
from keras.models import Sequential
model = Sequential([
InputLayer(input_shape=(10, )),
Dense(units=20, activation="relu"),
Dense(units=3, activation="softmax")
])
model.summary()
Using TensorFlow backend
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 20) 220
dense_1 (Dense) (None, 3) 63
=================================================================
Total params: 283 (1.11 KB)
Trainable params: 283 (1.11 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
model.compile(loss="categorical_crossentropy", optimizer="adam")
In terms of losses:

- "mse" is the mean squared error loss,
- "binary_crossentropy" is the logistic loss for binary classification,
- "categorical_crossentropy" is the logistic loss for multi-class classification.

The optimizers defined in this section are available as "sgd" and "adam".
In order to get control over optimizer hyper-parameters, one can alternatively use the following syntax:
from keras.optimizers import Adam, SGD
# Not a very good idea to tune beta_1
# and beta_2 parameters in Adam
adam_opt = Adam(learning_rate=0.001,
beta_1=0.9, beta_2=0.9)
# In order to use SGD with a custom learning rate:
# sgd_opt = SGD(learning_rate=0.001)
model.compile(loss="categorical_crossentropy", optimizer=adam_opt)
Data preprocessing#
In practice, for the model fitting phase to behave well, it is important to scale the input features. In the following example, we compare two trainings of the same model with the same initialization; the only difference between the two runs is whether the input data is standardized (zero mean and unit variance per feature) or left as-is.
import pandas as pd
from keras.utils import to_categorical
iris = pd.read_csv("../data/iris.csv", index_col=0)
iris = iris.sample(frac=1)
y = to_categorical(iris["target"])
X = iris.drop(columns=["target"])
from keras.layers import Dense, InputLayer
from keras.models import Sequential
from keras.utils import set_random_seed
set_random_seed(0)
model = Sequential([
InputLayer(input_shape=(4, )),
Dense(units=256, activation="relu"),
Dense(units=256, activation="relu"),
Dense(units=256, activation="relu"),
Dense(units=3, activation="softmax")
])
n_epochs = 100
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
h = model.fit(X, y, epochs=n_epochs, batch_size=30, verbose=0)
Let us now standardize our data and compare performance:
X -= X.mean(axis=0)
X /= X.std(axis=0)
set_random_seed(0)
model = Sequential([
InputLayer(input_shape=(4, )),
Dense(units=256, activation="relu"),
Dense(units=256, activation="relu"),
Dense(units=256, activation="relu"),
Dense(units=3, activation="softmax")
])
n_epochs = 100
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
h_standardized = model.fit(X, y, epochs=n_epochs, batch_size=30, verbose=0)
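Note that the manual centering and scaling above could equivalently be performed with scikit-learn's StandardScaler (not used in this notebook); a minimal sketch:

from sklearn.preprocessing import StandardScaler

# Equivalent preprocessing: zero mean and unit variance for each input feature
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

In a real application, the scaler would be fitted on the training set only and then reused to transform validation and test data, so as to avoid information leakage.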
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(np.arange(1, n_epochs + 1), h.history["loss"], label="Without data standardization")
plt.plot(np.arange(1, n_epochs + 1), h_standardized.history["loss"], label="With data standardization")
plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.legend();
plt.subplot(1, 2, 2)
plt.plot(np.arange(1, n_epochs + 1), h.history["accuracy"], label="Without data standardization")
plt.plot(np.arange(1, n_epochs + 1), h_standardized.history["accuracy"], label="With data standardization")
plt.ylabel("Accuracy")
plt.xlabel("Epochs");
References#
- [Goh17] Gabriel Goh. Why momentum really works. Distill, 2017. URL: http://distill.pub/2017/momentum.
- [KB15] Diederik P. Kingma and Jimmy Ba. Adam: a method for stochastic optimization. In Yoshua Bengio and Yann LeCun, editors, ICLR, 2015.