// // // // // // // // // // // // // // // // // // // // // // // // // //
// The Backprop Algorithm
//
// comments in this code are C++ style "to end of line" comments
//
// Author: Robert Heckendorn, University of Idaho, 2014
//
//

// Mathematica Syntax/Semantics
//
// vector in braces: {1, 2, 3}
// multiply does not require an explicit operator, only adjacency: 3 5 = 3*5 = 15
// scalar op vector applies the scalar op to all elements of the vector:
//     2 * {1, 2} -> {2, 4}
//     2 - {1, 2} -> {1, 0}
// times can be indicated by just putting elements next to one another as in regular math
// vector times vector is element by element times: {1, 2} {5, 7} -> {1, 2}*{5, 7} -> {5, 14}
// matrices are a list of rows of the matrix where each row is a list: {{1, 2}, {3, 4}}
// matrix multiply and dot product are both done with the dot (.) operator: {1, 2}.{5, 7} -> 19
// functions are defined with the := operator
// indexing vectors is done with [[index]]: x[[1]] is the first element of x since
//     arrays begin with index 1 (not 0 as in C)
// Transpose[{x}] for x a vector yields specifically a column vector form of x
// Prepend[x, 666] adds 666 to the front of x
// Drop[x, 1] removes the first element of x

// define the sigmoid function
s[x_] := 1/(1 + E^(-4 * x));

// set the global learning rate: alpha (eta in the book)
alpha = 0.1

// Training data:
// a list of pairs of vectors: {x, t}
// the x vector holds the input values
// the t vector holds the target values at the output nodes
// for example this is the 2 bit identity function:
// training = {
//     {{0, 0}, {0, 0}},
//     {{0, 1}, {0, 1}},
//     {{1, 0}, {1, 0}},
//     {{1, 1}, {1, 1}}}

// NOTES ON THE SIZES OF MATRICES BELOW
//
// inSize is the number of features used to classify an item.
//
// outSize is the number of outputs.  If there are 3 classes on output it
// is often 3 values that are either 0 or 1.
//
// hiddenSize is the number of hidden nodes.
//
// inputs are fed to the first sum/sigmoid hidden nodes via weighted
// edges v which are then fed to the output layer via weighted edges w.
//
// variable  is rows          by columns
// --------  -- ----          -- -------
// classes   is 1             by outSize
// delta1    is 1             by hiddenSize+1
// delta2    is 1             by outSize
// features  is 1             by inSize
// layer1    is 1             by hiddenSize+1   (the +1 comes from prepending bias)
// layer2    is 1             by outSize
// v         is inSize+1      by hiddenSize
// w         is hiddenSize+1  by outSize
// x         is 1             by inSize+1       (the +1 comes from prepending bias)
// y         is 1             by outSize
//

// Define the initWeights function
// Input: training data, number of nodes in hidden layer ignoring bias node
// Output: set global weights (matrices v and w)
initWeights[training0_, hiddenSize0_] :=
    Module[{training = training0, hiddenSize = hiddenSize0, firstCase, x, t, inSize, outSize, weightRange},
        firstCase = training[[1]];   // look at the first training case to see the size of x and t
        x = firstCase[[1]];          // x is the first of the pair
        t = firstCase[[2]];          // t is the second of the pair
        inSize = Length[x];          // number of inputs is length of x (ignoring bias)
        outSize = Length[t];         // number of outputs is length of t
        weightRange = 1/Sqrt[2];     // range of initial weights is +/-weightRange

        // v is a (inSize+1) by hiddenSize matrix of random reals.
        // +1 accounts for extra bias node
        v = RandomReal[{-weightRange, weightRange}, {inSize+1, hiddenSize}];

        // w is a (hiddenSize+1) by outSize matrix of random reals.
        // +1 accounts for extra bias node
        w = RandomReal[{-weightRange, weightRange}, {hiddenSize+1, outSize}];
    ]

// evaluate the NN given the v and w weights and an input vector x
forward[x0_] :=
    Module[{x = x0, layer1, layer2},
        x = Prepend[x, -1];            // add a bias input value
        layer1 = x.v;                  // compute the sum of inputs times weights
        layer1 = Map[s, layer1];       // apply sigmoid to each element in vector
        layer1 = Prepend[layer1, -1];  // add a bias value for hidden layer
        layer2 = layer1.w;             // compute the sum of hidden nodes times weights
        layer2 = Map[s, layer2];       // apply sigmoid
        layer2                         // return the answer (vector y in the book)
    ]

// adjust the weights in a NN using backprop
// requires an example x and t.
adjust[x0_, t0_] :=
    Module[{x = x0, t = t0, layer1, layer2, delta1, delta2, y},
        // forward
        x = Prepend[x, -1];            // add a bias input value increasing size of x
        layer1 = x.v;                  // compute the sum of inputs times weights
        layer1 = Map[s, layer1];       // apply sigmoid
        layer1 = Prepend[layer1, -1];  // add a bias value for hidden layer increasing size of layer1
        layer2 = layer1.w;             // compute the sum of hidden nodes times weights
        layer2 = Map[s, layer2];       // apply sigmoid
        y = layer2;                    // the answer is y as in the book

        // push the error backward.  Uses layer1 and y data from forward run
        // Note: below (layer1 (1 - layer1)) is a regular multiply of vectors, not a matrix multiply
        // compute these two deltas before you adjust w and v!
        delta2 = (t - y) y (1 - y);                              // delta error for weights to layer2
        delta1 = (layer1 (1 - layer1)) (delta2 . Transpose[w]);  // delta error for weights to layer1

        // IMPORTANT: matrix multiply of column vector layer1 times row vector of delta2 gives
        // a matrix of size: Length[layer1] X Length[delta2]
        // adjust w by using learning rate alpha
        w = w + alpha * Transpose[{layer1}] . {delta2};

        // IMPORTANT: matrix multiply of column vector x times row vector of delta1 gives
        // a matrix of size: Length[x] X Length[delta1]
        // adjust v by using learning rate alpha
        delta1 = Drop[delta1, 1];      // ignore adjustment to hidden layer bias node by removing
                                       // the first element.  Makes delta1 smaller.
        v = v + alpha * Transpose[{x}] . {delta1};

        // return the value of y for plotting purposes.
        // You could also return EuclideanDistance[y, t] which would plot the error
        y
    ]

// here is some training data
training = {
    {{0, 0}, {0, 0}},
    {{0, 1}, {0, 1}},
    {{1, 0}, {1, 0}},
    {{1, 1}, {1, 1}}}

// init the weights and assume a hidden layer of 3 nodes.
initWeights[training, 3];

// pass over the training data 5000 times in a random order
// applying the adjust function to create optimized matrices v and w.
//
Do[Map[Apply[adjust, #]&, RandomSample[training]], {5000}]

// call the function based on the learned weights and given a vector x
// that you want to evaluate.
forward[x]
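
// A quick way to check that training is actually working, as a sketch only.
// Neither of the two snippets below is part of the original code; the name
// "errorCurve" is made up here, and EuclideanDistance is the built-in
// Mathematica function that the comment in adjust refers to.

// Collect the mean error over each of 5000 random-order passes (run this
// instead of the Do loop above, since both loops adjust the global weights),
// then plot it to watch the error fall.
errorCurve = Table[
    Mean[Map[EuclideanDistance[adjust[#[[1]], #[[2]]], #[[2]]]&, RandomSample[training]]],
    {5000}];
// ListPlot[errorCurve]

// After training, show the network's output next to the target for every
// training case; each entry is {input, network output, target}.
Map[{#[[1]], forward[#[[1]]], #[[2]]}&, training]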