Programming the backpropagation algorithm from scratch
set.seed(0)
inds = sample(seq(1, nrow(iris), 1), nrow(iris), replace = F)
iris = iris[inds,] # shuffling the rows of the iris data set
my_iris = iris[,1:4]
for(i in 1:4){
  my_iris[,i] = (my_iris[,i] - mean(my_iris[,i])) / sd(my_iris[,i]) # standardizing each of the four predictors
}
y_mat = matrix(nrow = nrow(iris), ncol = length(unique(iris$Species)))
y_mat[,] = 0
y_mat[iris$Species == unique(iris$Species)[1],1] = 1
y_mat[iris$Species == unique(iris$Species)[2],2] = 1
y_mat[iris$Species == unique(iris$Species)[3],3] = 1 # one-hot encoding of the three species

### sampling from the Glorot-uniform distribution (Glorot & Bengio, 2010) ###
lim_l1 = sqrt(6/(4+3)) # the limit for the hidden-layer parameters is the square root of 6 divided by the sum of the number of input dimensions (4) and the number of hidden nodes (3)
lim_l2 = sqrt(6/(3+3)) # the limit for the output-layer parameters is the square root of 6 divided by the sum of the number of hidden nodes (3) and the number of output nodes (3)
x1a1 = runif(1, min = -lim_l1, max = lim_l1) # each parameter is sampled from a uniform distribution bounded by the limits calculated above
x1b1 = runif(1, min = -lim_l1, max = lim_l1)
x1c1 = runif(1, min = -lim_l1, max = lim_l1)
x1d1 = runif(1, min = -lim_l1, max = lim_l1)
x1e1 = runif(1, min = -lim_l1, max = lim_l1)
x1a2 = runif(1, min = -lim_l1, max = lim_l1)
x1b2 = runif(1, min = -lim_l1, max = lim_l1)
x1c2 = runif(1, min = -lim_l1, max = lim_l1)
x1d2 = runif(1, min = -lim_l1, max = lim_l1)
x1e2 = runif(1, min = -lim_l1, max = lim_l1)
x1a3 = runif(1, min = -lim_l1, max = lim_l1)
x1b3 = runif(1, min = -lim_l1, max = lim_l1)
x1c3 = runif(1, min = -lim_l1, max = lim_l1)
x1d3 = runif(1, min = -lim_l1, max = lim_l1)
x1e3 = runif(1, min = -lim_l1, max = lim_l1)
x2a1 = runif(1, min = -lim_l2, max = lim_l2)
x2b1 = runif(1, min = -lim_l2, max = lim_l2)
x2c1 = runif(1, min = -lim_l2, max = lim_l2)
x2d1 = runif(1, min = -lim_l2, max = lim_l2)
x2a2 = runif(1, min = -lim_l2, max = lim_l2)
x2b2 = runif(1, min = -lim_l2, max = lim_l2)
x2c2 = runif(1, min = -lim_l2, max = lim_l2)
x2d2 = runif(1, min = -lim_l2, max = lim_l2)
x2a3 = runif(1, min = -lim_l2, max = lim_l2)
x2b3 = runif(1, min = -lim_l2, max = lim_l2)
x2c3 = runif(1, min = -lim_l2, max = lim_l2)
x2d3 = runif(1, min = -lim_l2, max = lim_l2)
### end sampling from the Glorot-uniform distribution ###

### initialization of the Adam optimizer ###
lr = 1e-3 # a lower learning rate leads to better performance here
beta_1 = sample(seq(0,1,length.out = 1000), 1, replace = F)
beta_2 = sample(seq(0,1,length.out = 1000), 1, replace = F)
eps = 1e-5
m = 0
v = 0
# beta_1, beta_2, eps, m and v are all parameters of the Adam optimizer.
# For more details, see Kingma & Ba (2017)
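As a point of reference, these hyper-parameters are often not tuned at all: Kingma & Ba (2017) recommend fixed default values. The commented-out lines below are an optional substitution (not part of the original listing) for the randomly sampled beta_1 and beta_2 and the eps value above.

# optional: the default values recommended by Kingma & Ba (2017);
# uncomment to use them instead of the sampled beta_1/beta_2 and eps = 1e-5 above
# beta_1 = 0.9
# beta_2 = 0.999
# eps = 1e-8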
### end initialization of the Adam optimizer ###

ce_list = vector('list') # initializing the list recording the loss over optimization steps
for(k in 1:50){ # 50 passes over the data set
  for(j in 1:nrow(my_iris)){ # one optimization step per observation
    in1a = my_iris[j,1]
    in1b = my_iris[j,2]
    in1c = my_iris[j,3]
    in1d = my_iris[j,4]
    y_1 = y_mat[j,1]
    y_2 = y_mat[j,2]
    y_3 = y_mat[j,3]
    # forward pass: three sigmoid hidden nodes (x1e* are the bias parameters) ...
    l1_1 = 1/(1+exp(-(x1a1 * in1a + x1b1 * in1b + x1c1 * in1c + x1d1 * in1d + x1e1 * 1)))
    l1_2 = 1/(1+exp(-(x1a2 * in1a + x1b2 * in1b + x1c2 * in1c + x1d2 * in1d + x1e2 * 1)))
    l1_3 = 1/(1+exp(-(x1a3 * in1a + x1b3 * in1b + x1c3 * in1c + x1d3 * in1d + x1e3 * 1)))
    # ... and three linear output nodes (x2d* are the bias parameters)
    l2_1 = x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1
    l2_2 = x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1
    l2_3 = x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1
    # softmax over the output nodes and index of the predicted class
    softmax = c(exp(l2_1), exp(l2_2), exp(l2_3)) / (exp(l2_1) + exp(l2_2) + exp(l2_3))
    pred = which(softmax == max(softmax))
    # cross-entropy loss between the one-hot target and the softmax probabilities
    ce = -(y_1 * log(exp(l2_1) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
           y_2 * log(exp(l2_2) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
           y_3 * log(exp(l2_3) / (exp(l2_1) + exp(l2_2) + exp(l2_3))))
    ce_list[[length(ce_list)+1]] = ce # recording the loss
    print(ce)
    # symbolic derivatives of the loss with respect to the output nodes, and of each node
    # with respect to its parameters and inputs; evaluating these expressions below
    # yields the numeric gradients for the current observation
    d_ce_l2 = deriv(~ -(y_1 * log(exp(l2_1) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
                        y_2 * log(exp(l2_2) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
                        y_3 * log(exp(l2_3) / (exp(l2_1) + exp(l2_2) + exp(l2_3)))),
                    c('l2_1','l2_2','l2_3'))
    d_l2_1_par = deriv(~ x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1, c('x2a1', 'x2b1', 'x2c1', 'x2d1'))
    d_l2_1_l1 = deriv(~ x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1, c('l1_1', 'l1_2', 'l1_3'))
    d_l2_2_par = deriv(~ x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1, c('x2a2', 'x2b2', 'x2c2', 'x2d2'))
    d_l2_2_l1 = deriv(~ x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1, c('l1_1', 'l1_2', 'l1_3'))
    d_l2_3_par = deriv(~ x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1, c('x2a3', 'x2b3', 'x2c3', 'x2d3'))
    d_l2_3_l1 = deriv(~ x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1, c('l1_1', 'l1_2', 'l1_3'))
    d_l1_1_par = deriv(~ 1/(1+exp(-(x1a1 * in1a + x1b1 * in1b + x1c1 * in1c + x1d1 * in1d + x1e1 * 1))), c('x1a1', 'x1b1', 'x1c1', 'x1d1', 'x1e1'))
    d_l1_2_par = deriv(~ 1/(1+exp(-(x1a2 * in1a + x1b2 * in1b + x1c2 * in1c + x1d2 * in1d + x1e2 * 1))), c('x1a2', 'x1b2', 'x1c2', 'x1d2', 'x1e2'))
    d_l1_3_par = deriv(~ 1/(1+exp(-(x1a3 * in1a + x1b3 * in1b + x1c3 * in1c + x1d3 * in1d + x1e3 * 1))), c('x1a3', 'x1b3', 'x1c3', 'x1d3', 'x1e3'))
    # chain rule: gradients of the loss with respect to the output-layer parameters
    grads_x2_1 = attributes(eval(d_l2_1_par))['gradient'][[1]][1,][c('x2a1', 'x2b1', 'x2c1', 'x2d1')] *
      attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_1']
    grads_x2_2 = attributes(eval(d_l2_2_par))['gradient'][[1]][1,][c('x2a2', 'x2b2', 'x2c2', 'x2d2')] *
      attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_2']
    grads_x2_3 = attributes(eval(d_l2_3_par))['gradient'][[1]][1,][c('x2a3', 'x2b3', 'x2c3', 'x2d3')] *
      attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_3']
    # chain rule: gradients of the loss with respect to the hidden-layer parameters,
    # summing the contributions that flow back through all three output nodes
    grads_x1_1 = attributes(eval(d_l1_1_par))['gradient'][[1]][1,][c('x1a1', 'x1b1', 'x1c1', 'x1d1', 'x1e1')] *
      (attributes(eval(d_l2_1_l1))['gradient'][[1]][1,]['l1_1'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_1'] +
       attributes(eval(d_l2_2_l1))['gradient'][[1]][1,]['l1_1'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_2'] +
       attributes(eval(d_l2_3_l1))['gradient'][[1]][1,]['l1_1'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_3'])
    grads_x1_2 = attributes(eval(d_l1_2_par))['gradient'][[1]][1,][c('x1a2', 'x1b2', 'x1c2', 'x1d2', 'x1e2')] *
      (attributes(eval(d_l2_1_l1))['gradient'][[1]][1,]['l1_2'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_1'] +
       attributes(eval(d_l2_2_l1))['gradient'][[1]][1,]['l1_2'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_2'] +
       attributes(eval(d_l2_3_l1))['gradient'][[1]][1,]['l1_2'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_3'])
    grads_x1_3 = attributes(eval(d_l1_3_par))['gradient'][[1]][1,][c('x1a3', 'x1b3', 'x1c3', 'x1d3', 'x1e3')] *
      (attributes(eval(d_l2_1_l1))['gradient'][[1]][1,]['l1_3'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_1'] +
       attributes(eval(d_l2_2_l1))['gradient'][[1]][1,]['l1_3'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_2'] +
       attributes(eval(d_l2_3_l1))['gradient'][[1]][1,]['l1_3'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_3'])
    all_grads = c(grads_x1_1, grads_x1_2, grads_x1_3, grads_x2_1, grads_x2_2, grads_x2_3)
    ### using the Adam optimizer ###
    m = beta_1 * m + (1 - beta_1) * all_grads
    v = beta_2 * v + (1 - beta_2) * all_grads**2
    # the moment estimates m and v are updated based on the current loss gradients
    m_hat = m / (1 - beta_1**length(ce_list))
    v_hat = v / (1 - beta_2**length(ce_list))
    # the bias-corrected estimates m_hat and v_hat are computed from m and v and from the number
    # of optimization steps taken so far (which corresponds to the length of the list recording the loss)
    unlist(lapply(seq(1, length(all_grads)), function(i){
      eval(parse(text = paste0(names(all_grads)[i], " = ", names(all_grads)[i], " - lr * m_hat[", i, "] / (sqrt(v_hat[", i, "] + eps))")))
      eval(parse(text = paste0("assign('", names(all_grads)[i], "', ", names(all_grads)[i], ", .GlobalEnv)")))
    })) # updating each parameter of the neural network with its Adam step. For details see Kingma & Ba (2017)
    ### end using the Adam optimizer ###
  }
}
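The listing above prints the loss at every step; the short sketch below is not part of the original code but shows one way to inspect the result, assuming the listing has just been run in the same session so that ce_list, my_iris, y_mat and the fitted parameters are still in the global environment. It plots the recorded cross-entropy loss and re-runs the forward pass with the final parameters to obtain a training-set accuracy.

plot(unlist(ce_list), type = 'l', xlab = 'optimization step', ylab = 'cross-entropy loss') # the loss should decrease over the optimization steps
preds = sapply(1:nrow(my_iris), function(j){
  in1a = my_iris[j,1]; in1b = my_iris[j,2]; in1c = my_iris[j,3]; in1d = my_iris[j,4]
  l1_1 = 1/(1+exp(-(x1a1 * in1a + x1b1 * in1b + x1c1 * in1c + x1d1 * in1d + x1e1 * 1)))
  l1_2 = 1/(1+exp(-(x1a2 * in1a + x1b2 * in1b + x1c2 * in1c + x1d2 * in1d + x1e2 * 1)))
  l1_3 = 1/(1+exp(-(x1a3 * in1a + x1b3 * in1b + x1c3 * in1c + x1d3 * in1d + x1e3 * 1)))
  l2_1 = x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1
  l2_2 = x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1
  l2_3 = x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1
  which.max(c(l2_1, l2_2, l2_3)) # the softmax is monotone, so the largest output node gives the predicted class
})
mean(preds == apply(y_mat, 1, which.max)) # proportion of correctly classified training observations

Because all observations were used for fitting, this is a training-set accuracy only; a proper evaluation would hold out part of the data before training.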
Glorot, X. & Bengio, J. (2010): Understanding the difficulty of training deep feedforward neural networks. Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (PMLR) 9: 249-256
Kingma, D. P. & Ba, J. (2017): Adam: A method for stochastic optimization. arXiv:1412.6980