Optimal Brain Surgeon Algorithm

ECE 539 Final Project

Mark Slosarek

9014829809

Introduction:

For this project I implemented the Optimal Brain Surgeon algorithm. This is a pruning algorithm that removes connections between layers, making the network more efficient. A pruned network has several advantages. First, it needs less storage space. Second, it takes less time to compute. Finally, reducing a neural network's complexity improves the network's ability to generalize to future examples.

Work Performed:

The Optimal Brain Surgeon Algorithm is based on the MLP feed-forward model similar to that described in Lecture 9, MLP (I): Feed-forward Model, of Professor Hu's notes.

Figure 1: A Three Layer Feed-Forward Multi-layer Perceptron

In implementing the Optimal Brain Surgeon Algorithm, I followed the six steps described in Neural Networks: A Comprehensive Foundation by Simon Haykin.

  1. Train the given multilayer perceptron to minimum mean-square error
  2. Use the multilayer perceptron model with two hidden layers and one output neuron (a three-layer feed-forward multilayer perceptron) to compute the vector

    g(n) = ∂F(w, x(n)) / ∂w

    where F(w, x) is the input-output mapping realized by the multilayer perceptron with an overall weight vector w, and x is the input vector.
  3. Use the recursion

    H^-1(n) = H^-1(n-1) - [H^-1(n-1) g(n) g(n)^T H^-1(n-1)] / [1 + g(n)^T H^-1(n-1) g(n)],   H^-1(0) = δ^-1 I

    to calculate the inverse Hessian H^-1, where δ is a small positive constant.
  4. Find the i that corresponds to the smallest saliency:

    S_i = w_i^2 / (2 [H^-1]_{i,i})

    where [H^-1]_{i,i} is the (i,i)-th element of H^-1. If the saliency S_i is much smaller than the mean-square error, then delete synaptic weight w_i and proceed to step 5. Otherwise, go to step 6.
  5. Update all the synaptic weights in the network by applying the adjustment:

    Δw = -(w_i / [H^-1]_{i,i}) H^-1 1_i

    where 1_i is the unit vector whose only nonzero element is the i-th, which equals 1. Go to step 2. (A MATLAB sketch of steps 3 through 5 is given after this list.)
  6. Stop the computation when no more weights can be deleted from the network without a large increase in the mean-square error.
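
To make steps 3 through 5 concrete, the following is a minimal MATLAB sketch of one pruning pass. The variable names (w, g, Hinv, tol) are illustrative only and do not appear in the appendix code; the sketch assumes the gradient vector of step 2 has already been computed for every training example.

% Minimal sketch of OBS steps 3-5 (illustrative variable names, not the appendix code).
% Assumes:
%   w   - column vector holding every synaptic weight in the network
%   g   - matrix whose n-th column is the gradient dF/dw evaluated at example n
%   N   - number of training examples
%   tol - threshold for "saliency much smaller than the mean-square error"
delta = 0.01;                          % small positive constant
Hinv  = (1/delta)*eye(length(w));      % H^-1(0) = delta^-1 * I
for n = 1:N                            % step 3: recursive inverse Hessian update
    gn   = g(:,n);
    Hinv = Hinv - (Hinv*gn)*(gn'*Hinv)/(1 + gn'*Hinv*gn);
end
saliency = (w.^2)./(2*diag(Hinv));     % step 4: saliency of every weight
[Smin,q] = min(saliency);              % weight with the smallest saliency
if Smin < tol
    dw = -(w(q)/Hinv(q,q))*Hinv(:,q);  % step 5: optimal adjustment of all weights
    w  = w + dw;                       % drives w(q) to zero, i.e. prunes it
end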

Implementing this solution required many nested loops, which proved computationally expensive, so I used a weight-removal scheme that removes several weights in each pass. Although this is more efficient, it sometimes removes a few too many weights. All of the code I wrote is included in the appendix of this document; the only exceptions are some auxiliary routines belonging to Professor Hu's code. To run tests on the wine data, just run testsets.m (see the example call below).
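
For example, option 1 of testsets.m (train on wine1 and wine2, test on wine3) is equivalent to the direct call:

load wine1; load wine2;
wbest = opt_brain(13, 10, 3, [wine1; wine2], 0, 'wine3', 1, 0, 1000);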

Results:

I applied my code to the wine sets that we used in the homework assignments, and the results varied from run to run. Most results were very similar to those produced by the original MLP program; the difference is that I was able to reduce the number of weights by about 60% in most cases. With training sets as small as these, the savings in computation time are minimal, but in larger applications this reduction in weights could make the difference between correct results and results skewed by an overly complex network.

I am not a great programmer, so some of the code could be optimized, and a better procedure for removing the weights could be devised. I was not able to write code that operates exactly as the algorithm is described in the book; a better programmer would have been able to write more efficient code that performs better than mine.

References:

  • Hassibi, B. and Stork, D. G., "Second order derivatives for network pruning: Optimal Brain Surgeon," in S. J. Hanson, J. D. Cowan, and C. L. Giles (eds.), Advances in Neural Information Processing Systems 5, San Mateo, CA: Morgan Kaufmann, 1993, pp. 164-171.
  • Haykin, Simon, Neural Networks: A Comprehensive Foundation, Prentice Hall, 1999, pp. 222-226.

Appendix:

testsets.m

clear all;

disp ('1. Train with wine1 and wine2, test with wine3')

disp ('2. Train with wine1 and wine3, test with wine2')

disp ('3. Train with wine2 and wine3, test with wine1')

disp ('------')

option=input('Enter which set you would like to test: ');

load wine1;

load wine2;

load wine3;

wine12 = [wine1;wine2];

wine13 = [wine1;wine3];

wine23 = [wine2;wine3];

if option == 1

wbest=opt_brain(13,10,3,wine12,0,'wine3',1,0,1000);

elseif option == 2

wbest=opt_brain(13,10,3,wine13,0,'wine2',1,0,1000);

elseif option == 3

wbest=opt_brain(13,10,3,wine23,0,'wine1',1,0,1000);

end

optimal_brain_bp.m

% (C) copyright 2001 by Yu Hen Hu

% modified to run off of the variables configured by initialize_variables.m

% BP iterations begins

while not_converged==1,

% start a new epoch

% Randomly select K training samples from the training set.

[train,ptr,train0]=rsample(train0,K,Kr,ptr); % train is K by M+N

z{1}=(train(:,1:M))'; % input sample matrix M by K

d=train(:,M+1:MN)'; % corresponding target value N by K

% Feed-forward phase, compute sum of square errors

for l=2:L, % the l-th layer

u{l}=w{l}*[ones(1,K);z{l-1}]; % u{l} is n(l) by K

z{l}=actfun(u{l},atype(l));

end

error=d-z{L}; % error is N by K

E(t)=sum(sum(error.*error));

% Error back-propagation phase, compute delta error

delta{L}=actfunp(u{L},1).*error; % N (=n(L)) by K

if L>2,

for l=L-1:-1:2,

delta{l}=(w{l+1}(:,2:layers(l)+1))'*delta{l+1}.*actfunp(u{l},atype(l));

end

end

% update the weight matrix using gradient, momentum and random perturbation

for l=2:L,

dw{l}=alpha*delta{l}*[ones(1,K);z{l-1}]'+...

mo*dw{l}+randn(size(w{l}))*0.005;

w{l}=w{l}+dw{l};

end

% Test convergence to see if the convergence condition is satisfied,

cvgtest;

t = t + 1; % increment epoch count

end % while loop

% training results

disp('Final training results:')

if classreg==0,

[Cmat,crate]=bptest(wbest,tune,atype),

elseif classreg==1,

SS=bptestap(wbest,tune,atype),

end

% testing results

disp('Apply trained MLP network to the testing data. The results are: ');

if classreg==0,

[Cmat1,crate1]=bptest(wbest,test0,atype),

elseif classreg==1,

SS1=bptestap(wbest,test0,atype),

end

opt_brain.m

function wbest=opt_brain(input_nodes,hidden_nodes,output_nodes,train,train_is_file,test,test_is_file,classreg,nepoch)

% wbest=opt_brain(input_nodes,hidden_nodes,output_nodes,train,train_is_file,test,test_is_file,classreg,nepoch)

% input_nodes is the number of input nodes

% hidden_nodes is the number of hidden nodes

% output_nodes is the number of output nodes

% train is the training set

% train_is_file, 0 to use input, 1 to load values from file

% test is the testing set

% test_is_file, 0 to use input, 1 to load values from file

% classreg 0 for pattern classification, 1 for approximation

% nepoch is the maximum number of epochs to do

% wbest contains the weight matrices after the OBS algorithm has finished

% check to see if the training and testing variables are files, if they

% are files, load them in to their respective variables, else just load

% the contents

if train_is_file==1

eval(['load ' train]);

train0=eval(train);

else

train0=train;

end

if test_is_file==1

eval(['load ' test]);

test0=eval(test);

else

test0=test;

end

% initialize values

% create a layers variable made up of the input_nodes, hidden_nodes, and

% output_nodes

layers = [input_nodes;hidden_nodes;output_nodes];

initialize_variables;

% run the bp MLP algorithm to train the system

optimal_brain_bp;

disp('Training has been completed, press any key')

disp('to perform the Optimal Brain Surgeon Algorithm')

pause;

% Perform Optimal Brain Surgeon Algorithm

length_cost=layers(2:end)'*layers(1:end-1)+sum(layers(2:end));

cost=zeros(length_cost,1);

k=1;

if classreg==0

% classification

opt_brain_class

elseif classreg==1

% approximation

opt_brain_approx

end

opt_brain_approx.m

% Find the cost value

for l=2:L

for I=1:layers(l)

for J=1:(layers(l-1)+1)

wnew(k)=wbest{l}(I,J);

temp=wbest;

temp{l}(I,J)=0;

cost(k)=bptestap(temp,tune,atype);

k=k+1;

end

end

end

% Approximate dF/dw

cai=SS-cost; % SS is the tuning-set error computed in optimal_brain_bp.m

% Find where differences are small

index=find(cai<.000001/length(cai));

if length(index)>ceil(.95*length(wnew))

index=index(nonzeros(round( length(index)*rand(ceil(.95*length(wnew)),1) )));

end

% Approximate H^-1

c2=cai*cai';

Hin=(1/.01)*eye(length(cai));

for T=1:10

Hin=Hin-(1/(1+cai'*Hin*cai))*Hin*c2*Hin;

end

% Update of the weights

wnew = wnew-(Hin(:,index)*(wnew(index)'./diag(Hin(index,index))))';

% Transform wnew as required by bptest

k=1;

for l=2:L

temp=[];

% Convert wnew into L-1 matrices of proper size

for I=1:layers(l)

temp=[temp;wnew(k:(k+layers(l-1)))];

k=1+(k+layers(l-1));

end

wbest{l}=temp;

end

S=bptestap(wbest,tune,atype)

disp('Testing Data with new weights')

S1=bptestap(wbest,test0,atype)

opt_brain_class.m

% Find the cost value

for l=2:L

for I=1:layers(l)

for J=1:(layers(l-1)+1)

wnew(k)=wbest{l}(I,J);

temp=wbest;

temp{l}(I,J)=0;

[Cmat,cost(k)]=bptest(temp,tune,atype);

k=k+1;

end

end

end

% Approximate dF/dw

cai=cost-crate;

% Find where differences are small

index=find(cai<.1);

if length(index)>ceil(.95*length(wnew))

index=index(nonzeros(round( length(index)*rand(ceil(.95*length(wnew)),1) )));

end

% Approximate H^-1

c2=cai*cai';

Hin=(1/.01)*eye(length(cai));

for T=1:10

Hin=Hin-(1/(1+cai'*Hin*cai))*Hin*c2*Hin;

end

% Update of the weights

wnew = wnew-(Hin(:,index)*(wnew(index)'./diag(Hin(index,index))))';

% Transform wnew as required by bptest

k=1;

for l=2:L

temp=[];

% Convert wnew into L-1 matrices of proper size

for I=1:layers(l)

temp=[temp;wnew(k:(k+layers(l-1)))];

k=1+(k+layers(l-1));

end

wbest{l}=temp;

end

[Cmat,crate]=bptest(wbest,tune,atype)

disp('Testing Data with new weights')

[Cmat1,crate1]=bptest(wbest,test0,atype)
