Programming the backpropagation algorithm from scratch
set.seed(0)
inds = sample(seq(1, nrow(iris), 1), nrow(iris), replace = F)
iris = iris[inds,] # shuffling the rows of the iris data set
my_iris = iris[,1:4]
for(i in 1:4){
  my_iris[,i] = (my_iris[,i] - mean(my_iris[,i])) / sd(my_iris[,i]) # standardizing each of the four predictors
}
y_mat = matrix(nrow = nrow(iris), ncol = length(unique(iris$Species)))
y_mat[,] = 0
y_mat[iris$Species == unique(iris$Species)[1],1] = 1
y_mat[iris$Species == unique(iris$Species)[2],2] = 1
y_mat[iris$Species == unique(iris$Species)[3],3] = 1 # one-hot encoding of the three species

### sampling from the Glorot-uniform distribution (Glorot & Bengio, 2010) ###
lim_l1 = sqrt(6/(4+3)) # the limit for the hidden-layer parameters is the square root of 6 divided by the sum of the number of input dimensions (4) and the number of hidden nodes (3)
lim_l2 = sqrt(6/(3+3)) # the limit for the output-layer parameters is the square root of 6 divided by the sum of the number of hidden nodes (3) and the number of output nodes (3)
x1a1 = runif(1, min = -lim_l1, max = lim_l1) # each parameter is sampled from a uniform distribution bounded by the limits calculated above
x1b1 = runif(1, min = -lim_l1, max = lim_l1)
x1c1 = runif(1, min = -lim_l1, max = lim_l1)
x1d1 = runif(1, min = -lim_l1, max = lim_l1)
x1e1 = runif(1, min = -lim_l1, max = lim_l1)
x1a2 = runif(1, min = -lim_l1, max = lim_l1)
x1b2 = runif(1, min = -lim_l1, max = lim_l1)
x1c2 = runif(1, min = -lim_l1, max = lim_l1)
x1d2 = runif(1, min = -lim_l1, max = lim_l1)
x1e2 = runif(1, min = -lim_l1, max = lim_l1)
x1a3 = runif(1, min = -lim_l1, max = lim_l1)
x1b3 = runif(1, min = -lim_l1, max = lim_l1)
x1c3 = runif(1, min = -lim_l1, max = lim_l1)
x1d3 = runif(1, min = -lim_l1, max = lim_l1)
x1e3 = runif(1, min = -lim_l1, max = lim_l1)
x2a1 = runif(1, min = -lim_l2, max = lim_l2)
x2b1 = runif(1, min = -lim_l2, max = lim_l2)
x2c1 = runif(1, min = -lim_l2, max = lim_l2)
x2d1 = runif(1, min = -lim_l2, max = lim_l2)
x2a2 = runif(1, min = -lim_l2, max = lim_l2)
x2b2 = runif(1, min = -lim_l2, max = lim_l2)
x2c2 = runif(1, min = -lim_l2, max = lim_l2)
x2d2 = runif(1, min = -lim_l2, max = lim_l2)
x2a3 = runif(1, min = -lim_l2, max = lim_l2)
x2b3 = runif(1, min = -lim_l2, max = lim_l2)
x2c3 = runif(1, min = -lim_l2, max = lim_l2)
x2d3 = runif(1, min = -lim_l2, max = lim_l2)
### end sampling from the Glorot-uniform distribution ###

### initialization of the Adam optimizer ###
lr = 1e-3 # a lower learning rate leads to better performance here
beta_1 = sample(seq(0,1,length.out = 1000), 1, replace = F)
beta_2 = sample(seq(0,1,length.out = 1000), 1, replace = F)
eps = 1e-5
m = 0
v = 0
# beta_1, beta_2, eps, m and v are all parameters of the Adam optimizer.
# For more details, see Kingma & Ba (2017)
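As a point of reference, these hyper-parameters are often not tuned at all: Kingma & Ba (2017) recommend fixed default values. The commented-out lines below are an optional substitution (not part of the original listing) for the randomly sampled beta_1 and beta_2 and the eps value above.

# optional: the default values recommended by Kingma & Ba (2017);
# uncomment to use them instead of the sampled beta_1/beta_2 and eps = 1e-5 above
# beta_1 = 0.9
# beta_2 = 0.999
# eps = 1e-8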
### end initialization of the Adam optimizer ###

ce_list = vector('list') # initializing the list recording the loss over optimization steps
for(k in 1:50){ # 50 passes over the data set
  for(j in 1:nrow(my_iris)){ # one optimization step per observation
    in1a = my_iris[j,1]
    in1b = my_iris[j,2]
    in1c = my_iris[j,3]
    in1d = my_iris[j,4]
    y_1 = y_mat[j,1]
    y_2 = y_mat[j,2]
    y_3 = y_mat[j,3]
    # forward pass: three sigmoid hidden nodes (x1e* are the bias parameters) ...
    l1_1 = 1/(1+exp(-(x1a1 * in1a + x1b1 * in1b + x1c1 * in1c + x1d1 * in1d + x1e1 * 1)))
    l1_2 = 1/(1+exp(-(x1a2 * in1a + x1b2 * in1b + x1c2 * in1c + x1d2 * in1d + x1e2 * 1)))
    l1_3 = 1/(1+exp(-(x1a3 * in1a + x1b3 * in1b + x1c3 * in1c + x1d3 * in1d + x1e3 * 1)))
    # ... and three linear output nodes (x2d* are the bias parameters)
    l2_1 = x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1
    l2_2 = x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1
    l2_3 = x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1
    # softmax over the output nodes and index of the predicted class
    softmax = c(exp(l2_1), exp(l2_2), exp(l2_3)) / (exp(l2_1) + exp(l2_2) + exp(l2_3))
    pred = which(softmax == max(softmax))
    # cross-entropy loss between the one-hot target and the softmax probabilities
    ce = -(y_1 * log(exp(l2_1) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
           y_2 * log(exp(l2_2) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
           y_3 * log(exp(l2_3) / (exp(l2_1) + exp(l2_2) + exp(l2_3))))
    ce_list[[length(ce_list)+1]] = ce # recording the loss
    print(ce)
    # symbolic derivatives of the loss with respect to the output nodes, and of each node
    # with respect to its parameters and inputs; evaluating these expressions below
    # yields the numeric gradients for the current observation
    d_ce_l2 = deriv(~ -(y_1 * log(exp(l2_1) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
                        y_2 * log(exp(l2_2) / (exp(l2_1) + exp(l2_2) + exp(l2_3))) +
                        y_3 * log(exp(l2_3) / (exp(l2_1) + exp(l2_2) + exp(l2_3)))),
                    c('l2_1','l2_2','l2_3'))
    d_l2_1_par = deriv(~ x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1, c('x2a1', 'x2b1', 'x2c1', 'x2d1'))
    d_l2_1_l1 = deriv(~ x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1, c('l1_1', 'l1_2', 'l1_3'))
    d_l2_2_par = deriv(~ x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1, c('x2a2', 'x2b2', 'x2c2', 'x2d2'))
    d_l2_2_l1 = deriv(~ x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1, c('l1_1', 'l1_2', 'l1_3'))
    d_l2_3_par = deriv(~ x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1, c('x2a3', 'x2b3', 'x2c3', 'x2d3'))
    d_l2_3_l1 = deriv(~ x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1, c('l1_1', 'l1_2', 'l1_3'))
    d_l1_1_par = deriv(~ 1/(1+exp(-(x1a1 * in1a + x1b1 * in1b + x1c1 * in1c + x1d1 * in1d + x1e1 * 1))), c('x1a1', 'x1b1', 'x1c1', 'x1d1', 'x1e1'))
    d_l1_2_par = deriv(~ 1/(1+exp(-(x1a2 * in1a + x1b2 * in1b + x1c2 * in1c + x1d2 * in1d + x1e2 * 1))), c('x1a2', 'x1b2', 'x1c2', 'x1d2', 'x1e2'))
    d_l1_3_par = deriv(~ 1/(1+exp(-(x1a3 * in1a + x1b3 * in1b + x1c3 * in1c + x1d3 * in1d + x1e3 * 1))), c('x1a3', 'x1b3', 'x1c3', 'x1d3', 'x1e3'))
    # chain rule: gradients of the loss with respect to the output-layer parameters
    grads_x2_1 = attributes(eval(d_l2_1_par))['gradient'][[1]][1,][c('x2a1', 'x2b1', 'x2c1', 'x2d1')] *
      attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_1']
    grads_x2_2 = attributes(eval(d_l2_2_par))['gradient'][[1]][1,][c('x2a2', 'x2b2', 'x2c2', 'x2d2')] *
      attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_2']
    grads_x2_3 = attributes(eval(d_l2_3_par))['gradient'][[1]][1,][c('x2a3', 'x2b3', 'x2c3', 'x2d3')] *
      attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_3']
    # chain rule: gradients of the loss with respect to the hidden-layer parameters,
    # summing the contributions that flow back through all three output nodes
    grads_x1_1 = attributes(eval(d_l1_1_par))['gradient'][[1]][1,][c('x1a1', 'x1b1', 'x1c1', 'x1d1', 'x1e1')] *
      (attributes(eval(d_l2_1_l1))['gradient'][[1]][1,]['l1_1'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_1'] +
       attributes(eval(d_l2_2_l1))['gradient'][[1]][1,]['l1_1'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_2'] +
       attributes(eval(d_l2_3_l1))['gradient'][[1]][1,]['l1_1'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_3'])
    grads_x1_2 = attributes(eval(d_l1_2_par))['gradient'][[1]][1,][c('x1a2', 'x1b2', 'x1c2', 'x1d2', 'x1e2')] *
      (attributes(eval(d_l2_1_l1))['gradient'][[1]][1,]['l1_2'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_1'] +
       attributes(eval(d_l2_2_l1))['gradient'][[1]][1,]['l1_2'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_2'] +
       attributes(eval(d_l2_3_l1))['gradient'][[1]][1,]['l1_2'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_3'])
    grads_x1_3 = attributes(eval(d_l1_3_par))['gradient'][[1]][1,][c('x1a3', 'x1b3', 'x1c3', 'x1d3', 'x1e3')] *
      (attributes(eval(d_l2_1_l1))['gradient'][[1]][1,]['l1_3'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_1'] +
       attributes(eval(d_l2_2_l1))['gradient'][[1]][1,]['l1_3'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_2'] +
       attributes(eval(d_l2_3_l1))['gradient'][[1]][1,]['l1_3'] * attributes(eval(d_ce_l2))['gradient'][[1]][1,]['l2_3'])
    all_grads = c(grads_x1_1, grads_x1_2, grads_x1_3, grads_x2_1, grads_x2_2, grads_x2_3)
    ### using the Adam optimizer ###
    m = beta_1 * m + (1 - beta_1) * all_grads
    v = beta_2 * v + (1 - beta_2) * all_grads**2
    # the moment estimates m and v are updated based on the current loss gradients
    m_hat = m / (1 - beta_1**length(ce_list))
    v_hat = v / (1 - beta_2**length(ce_list))
    # the bias-corrected estimates m_hat and v_hat are computed from m and v and from the number
    # of optimization steps taken so far (which corresponds to the length of the list recording the loss)
    unlist(lapply(seq(1, length(all_grads)), function(i){
      eval(parse(text = paste0(names(all_grads)[i], " = ", names(all_grads)[i], " - lr * m_hat[", i, "] / (sqrt(v_hat[", i, "] + eps))")))
      eval(parse(text = paste0("assign('", names(all_grads)[i], "', ", names(all_grads)[i], ", .GlobalEnv)")))
    })) # updating each parameter of the neural network with its Adam step. For details see Kingma & Ba (2017)
    ### end using the Adam optimizer ###
  }
}
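The listing above prints the loss at every step; the short sketch below is not part of the original code but shows one way to inspect the result, assuming the listing has just been run in the same session so that ce_list, my_iris, y_mat and the fitted parameters are still in the global environment. It plots the recorded cross-entropy loss and re-runs the forward pass with the final parameters to obtain a training-set accuracy.

plot(unlist(ce_list), type = 'l', xlab = 'optimization step', ylab = 'cross-entropy loss') # the loss should decrease over the optimization steps
preds = sapply(1:nrow(my_iris), function(j){
  in1a = my_iris[j,1]; in1b = my_iris[j,2]; in1c = my_iris[j,3]; in1d = my_iris[j,4]
  l1_1 = 1/(1+exp(-(x1a1 * in1a + x1b1 * in1b + x1c1 * in1c + x1d1 * in1d + x1e1 * 1)))
  l1_2 = 1/(1+exp(-(x1a2 * in1a + x1b2 * in1b + x1c2 * in1c + x1d2 * in1d + x1e2 * 1)))
  l1_3 = 1/(1+exp(-(x1a3 * in1a + x1b3 * in1b + x1c3 * in1c + x1d3 * in1d + x1e3 * 1)))
  l2_1 = x2a1 * l1_1 + x2b1 * l1_2 + x2c1 * l1_3 + x2d1 * 1
  l2_2 = x2a2 * l1_1 + x2b2 * l1_2 + x2c2 * l1_3 + x2d2 * 1
  l2_3 = x2a3 * l1_1 + x2b3 * l1_2 + x2c3 * l1_3 + x2d3 * 1
  which.max(c(l2_1, l2_2, l2_3)) # the softmax is monotone, so the largest output node gives the predicted class
})
mean(preds == apply(y_mat, 1, which.max)) # proportion of correctly classified training observations

Because all observations were used for fitting, this is a training-set accuracy only; a proper evaluation would hold out part of the data before training.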
Glorot, X. & Bengio, J. (2010): Understanding the difficulty of training deep feedforward neural networks. Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (PMLR) 9: 249-256
Kingma, D. P. & Ba, J. (2017): Adam: A method for stochastic optimization. arXiv:1412.6980