Ex Data, Scientia

Programming the backpropagation algorithm from scratch

```
set.seed(0)

inds = sample(seq(1,nrow(iris),1), nrow(iris), replace = F) # draw a random permutation of the row indices
iris = iris[inds,] # shuffle the rows of the iris data set

my_iris = iris[,1:4] # keep the four numeric predictor columns

for(i in 1:4){
my_iris[,i] = (my_iris[,i] - mean(my_iris[,i])) / sd(my_iris[,i]) # standardize each feature to zero mean and unit variance
}
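
# Note: the same standardization could also be obtained with base R's scale(), e.g.
# my_iris = as.data.frame(scale(iris[,1:4])); the explicit loop above is kept for clarity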

y_mat = matrix(nrow = nrow(iris), ncol = length(unique(iris$Species))) # one column per species
y_mat[,] = 0
y_mat[iris$Species == unique(iris$Species)[1],1] = 1
y_mat[iris$Species == unique(iris$Species)[2],2] = 1
y_mat[iris$Species == unique(iris$Species)[3],3] = 1 # one-hot encode the species labels
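
# Note: an equivalent one-hot matrix could also be built with model.matrix, e.g.
# y_mat = unname(model.matrix(~ Species - 1, data = iris)); there the column order follows
# the factor levels rather than the order in which the species first appear after shuffling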

### sampling from the Glorot-uniform distribution (Glorot & Bengio, 2010) ###

lim_l1 = sqrt(6/(4+3)) # the limit for the hidden-layer parameters is the square root of 6 divided by the sum of the number of input dimensions (4) and the number of hidden nodes (3)
lim_l2 = sqrt(6/(3+3)) # the limit for the output-layer parameters is the square root of 6 divided by the sum of the number of hidden nodes (3) and the number of output nodes (3)
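# (illustrative sketch, not used below): the general Glorot-uniform limit is
# sqrt(6 / (fan_in + fan_out)); as a small helper this would read
glorot_limit = function(fan_in, fan_out) sqrt(6 / (fan_in + fan_out))
# glorot_limit(4, 3) reproduces lim_l1 and glorot_limit(3, 3) reproduces lim_l2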

x1a1 = runif(1, min = -lim_l1, max = lim_l1) # parameter is sampled from a uniform distribution defined by the limits calculated above
x1b1 = runif(1, min = -lim_l1, max = lim_l1)
x1c1 = runif(1, min = -lim_l1, max = lim_l1)
x1d1 = runif(1, min = -lim_l1, max = lim_l1)
x1e1 = runif(1, min = -lim_l1, max = lim_l1)
x1a2 = runif(1, min = -lim_l1, max = lim_l1)
x1b2 = runif(1, min = -lim_l1, max = lim_l1)
x1c2 = runif(1, min = -lim_l1, max = lim_l1)
x1d2 = runif(1, min = -lim_l1, max = lim_l1)
x1e2 = runif(1, min = -lim_l1, max = lim_l1)
x1a3 = runif(1, min = -lim_l1, max = lim_l1)
x1b3 = runif(1, min = -lim_l1, max = lim_l1)
x1c3 = runif(1, min = -lim_l1, max = lim_l1)
x1d3 = runif(1, min = -lim_l1, max = lim_l1)
x1e3 = runif(1, min = -lim_l1, max = lim_l1)

x2a1 = runif(1, min = -lim_l2, max = lim_l2)
x2b1 = runif(1, min = -lim_l2, max = lim_l2)
x2c1 = runif(1, min = -lim_l2, max = lim_l2)
x2d1 = runif(1, min = -lim_l2, max = lim_l2)
x2a2 = runif(1, min = -lim_l2, max = lim_l2)
x2b2 = runif(1, min = -lim_l2, max = lim_l2)
x2c2 = runif(1, min = -lim_l2, max = lim_l2)
x2d2 = runif(1, min = -lim_l2, max = lim_l2)
x2a3 = runif(1, min = -lim_l2, max = lim_l2)
x2b3 = runif(1, min = -lim_l2, max = lim_l2)
x2c3 = runif(1, min = -lim_l2, max = lim_l2)
x2d3 = runif(1, min = -lim_l2, max = lim_l2)

### end sampling from the Glorot-uniform distribution ###
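
# (illustrative sketch, commented out so that the script's random-number stream stays unchanged):
# the same initialization could be written more compactly with one weight matrix per layer,
# one row per input dimension plus a bias row, and one column per node of the following layer:
# W1 = matrix(runif(5*3, min = -lim_l1, max = lim_l1), nrow = 5, ncol = 3)
# W2 = matrix(runif(4*3, min = -lim_l2, max = lim_l2), nrow = 4, ncol = 3)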

### initialization of the Adam optimizer ###

lr = 1e-3 # a lower learning rate leads to better performance here

beta_1 = sample(seq(0,1,length.out = 1000), 1, replace = F) # beta_1 and beta_2 are drawn at random here;
beta_2 = sample(seq(0,1,length.out = 1000), 1, replace = F) # Kingma & Ba (2017) recommend 0.9 and 0.999 as default values
eps = 1e-5

m = 0 # running estimate of the first moment (mean) of the gradients
v = 0 # running estimate of the second moment of the gradients; beta_1, beta_2, eps, m and v are all parameters of the Adam optimizer. For more details, see Kingma & Ba (2017)

### end initialization of the Adam optimizer ###

ce_list = vector('list') # initializing list recording the loss over optimization steps

for(k in 1:50){ # 50 passes (epochs) over the shuffled data set
for(j in 1:nrow(my_iris)){ # one optimization step per observation, i.e. stochastic gradient descent with batch size one
in1a = my_iris[j,1]
in1b = my_iris[j,2]
in1c = my_iris[j,3]
in1d = my_iris[j,4] # the four standardized input features of observation j

y_1 = y_mat[j,1]
y_2 = y_mat[j,2]
y_3 = y_mat[j,3] # the one-hot-encoded target of observation j

l1_1 = 1/(1+exp(-(x1a1 * in1a + x1b1 * in1b + x1c1 * in1c + x1d1 * in1d + x1e1 * 1)))
l1_2 = 1/(1+exp(-(x1a2 * in1a + x1b2 * in1b + x1c2 * in1c + x1d2 * in1d + x1e2 * 1)))
l1_3 = 1/(1+exp(-(x1a3 * in1a + x1b3 * in1b + x1c3 * in1c + x1d3 * in1d + x1e3 * 1))) # hidden-layer activations: sigmoid of the weighted inputs plus a bias term

l2_1 = x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1
l2_2 = x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1
l2_3 = x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1 # output-layer pre-activations; the softmax below turns them into class probabilities

denom = exp(l2_1) + exp(l2_2) + exp(l2_3)
p_1 = exp(l2_1) / denom
p_2 = exp(l2_2) / denom
p_3 = exp(l2_3) / denom # softmax: the three class probabilities, which sum to one

pred = which.max(c(p_1, p_2, p_3)) # the predicted class is the one with the highest probability

ce = -(y_1 * log(p_1) + y_2 * log(p_2) + y_3 * log(p_3)) # cross-entropy loss for observation j
ce_list[[length(ce_list)+1]] = ce # recording the loss

print(ce) # print the current loss (one value per optimization step)

d_ce_l2 = deriv(~ -(y_1 * log(exp(l2_1) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
y_2 * log(exp(l2_2) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
y_3 * log(exp(l2_3) / (exp(l2_1) + exp(l2_2) + exp(l2_3)))), c('l2_1','l2_2','l2_3')) # derivatives of the loss w.r.t. the output-layer pre-activations

d_l2_1_par = deriv(~ x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1, c('x2a1', 'x2b1', 'x2c1', 'x2d1')) # derivatives of output node 1 w.r.t. its weights and bias
d_l2_1_l1 = deriv(~ x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1, c('l1_1', 'l1_2', 'l1_3')) # derivatives of output node 1 w.r.t. the hidden-layer activations; analogous expressions follow for output nodes 2 and 3

d_l2_2_par = deriv(~ x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1, c('x2a2', 'x2b2', 'x2c2', 'x2d2'))
d_l2_2_l1 = deriv(~ x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1, c('l1_1', 'l1_2', 'l1_3'))

d_l2_3_par = deriv(~ x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1, c('x2a3', 'x2b3', 'x2c3', 'x2d3'))
d_l2_3_l1 = deriv(~ x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1, c('l1_1', 'l1_2', 'l1_3'))

d_l1_1_par = deriv(~ 1/(1+exp(-(x1a1 * in1a + x1b1 * in1b + x1c1 * in1c + x1d1 * in1d + x1e1 * 1))), c('x1a1', 'x1b1', 'x1c1', 'x1d1', 'x1e1')) # derivatives of hidden node 1 w.r.t. its weights and bias
d_l1_2_par = deriv(~ 1/(1+exp(-(x1a2 * in1a + x1b2 * in1b + x1c2 * in1c + x1d2 * in1d + x1e2 * 1))), c('x1a2', 'x1b2', 'x1c2', 'x1d2', 'x1e2'))
d_l1_3_par = deriv(~ 1/(1+exp(-(x1a3 * in1a + x1b3 * in1b + x1c3 * in1c + x1d3 * in1d + x1e3 * 1))), c('x1a3', 'x1b3', 'x1c3', 'x1d3', 'x1e3')) # same for hidden nodes 2 and 3
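
# each deriv() call above returns an unevaluated expression; evaluating it yields the function
# value with the partial derivatives attached as a 'gradient' attribute, e.g. (illustrative)
# attr(eval(d_ce_l2), 'gradient') gives the gradients of the loss w.r.t. l2_1, l2_2 and l2_3.
# Chained together via the chain rule, these partial derivatives form the gradient vector
# all_grads that the Adam update below operates on.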

### using the Adam optimizer ###

m = beta_1 * m + (1 - beta_1) * all_grads
v = beta_2 * v + (1 - beta_2) * all_grads**2 # m and v, the exponential moving averages of the gradients and of the squared gradients, are updated with the current loss gradients

m_hat = m / (1 - beta_1**length(ce_list))
v_hat = v / (1 - beta_2**length(ce_list)) # m_hat and v_hat are the bias-corrected versions of m and v; the correction depends
# on the number of optimization steps taken so far (which corresponds to the length of the list recording the loss)
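# (for reference, following Kingma & Ba (2017): each parameter is subsequently updated as
# parameter = parameter - lr * m_hat / (sqrt(v_hat) + eps))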