SAS Program Code Comparing Discriminant Analysis Estimates

The data is the iris data from Fisher 1936 that is from the SAS/STAT User’s Guide Version 8 Example 25.1: DISCRIM Procedure chapter. This is the SAS program code that compares the classification performance between the discriminant model with the neural network model. The neural network estimates are based on a GLIM general linear model, MLP, RBF and the NBF designs. A small macro step was written for each neural network classification design. Since the discriminant model consisted of more than two input variables, therefore, canonical discriminant analysis was performed to visually observe the separation between the target groups. Added SAS code was included to generate the various decision boundaries of the separate classification modeling designs.

options nocenter nodate nonumber;

proc format;

value specname

1 = 'SETOSA '

2 = 'VERSICOLOR'

3 = 'VIRGINICA ';

run;

title h=1.5 f=zapf j=c

'Discriminant Analysis of Fisher (1936) Iris Data';

title2 h=1.2 f=zapf j=c

'Classifying Species by Sepal, Petal Length and Width';

data iris;

input sepallen sepalwid petallen petalwid species @@;

format species specname.;

label sepallen='Sepal Length in mm.'

sepalwid='Sepal Width in mm.'

petallen='Petal Length in mm.'

petalwid='Petal Width in mm.';

cards;

50 33 14 02 1 64 28 56 22 3 65 28 46 15 2 67 31 56 24 3

...

63 33 60 25 3 53 37 15 02 1

;

proc sort data=iris out=sortout;

by species;

run;

%macro boxplots(vars=, titles=);

title4 h=1.2 f=zapf j=c

“Analysis of Variance between Species and &titles”;

* Testing group means with an equal sample size within each target group;

proc anova data=iris;

class species;

model &vars=species;

means species / tukey;

run;

title2 h=1.4 f=zapf j=c

'Classifying Species by Sepal, Petal Length and Width';

title4 h=1.4 f=zapf j=c "Box Plot between Species and &titles";

axis1 label=(c=black f=swiss h=1.4 j=c 'Species')

value=(h=1.4)

minor=none;

axis2 label=(c=black f=swiss h=1.4 j=r a=90 "&titles")

value=(h=1.4)

order = (0 to 30 by 5)

minor=none;

* Box Plot of petal width by species;

proc boxplot data=sortout;

plot &vars*species / haxis=axis1

vaxis=axis2;

footnote f=swissl h=1.2 j=l

"Source: SAS User's Guide: Statistics The DISCRIM Procedure";

run;

%mend;

%boxplots(vars=sepallen, titles=Sepal Length);

%boxplots(vars=sepalwid, titles=Sepal Width);

%boxplots(vars=petallen, titles=Petal Length);

%boxplots(vars=petalwid, titles=Petal Width);

footnote;

title2 h=1.2 f=zapf j=c

'Classifying Species by Sepal, Petal Length and Width';

title4 h=1.2 f=zapf j=c

'Using Normal Density Estimates with Unequal Variance';

* Discriminant analysis procedure specifying a test of homogeneity of

within covariance matrices test to determine if there is equal

variability across the different target groups and assuming equal prior

probabilities within each target level;

proc discrim data=iris simple wcov pcov pcorr listerr pool=test

out=discrimout;

class species;

var sepallen sepalwid petallen petalwid;

run;

* Confusion matrix to evaluate the accuracy of the quadratic discriminant

function i.e. frequency table of the actual by the predicted classes;

proc tabulate data=discrimout format=12.0;

class species _INTO_;

table species all='Total', _INTO_ all='Total' / misstext='0' rts=12;

label species = 'Species'

_INTO_ = 'Into: Species';

keylabel n = ' ';

run;

/* Data Mining SAS procedure */

proc dmdb data=iris out=dmddata dmdbcat=dmcdata;

class species;

var sepallen sepalwid petallen petalwid;

run;

* SAS Neural network procedure using the Logistic Discriminant Analysis;

proc neural data=dmddata dmdbcat=dmcdata; * Read data mining data set;

input sepallen sepalwid petallen petalwid / level=int;

* input interval variable;

target species / level=nom error=mbernoulli; * nominal-valued target

variable using multiple Bernoulli error function;

archi glim; * GLIM logistic discrimination design;

train tech = quanew; * quasi-Newton optimization technique;

score data=iris out=pred nodmdb; * Create a score output data set;

run;

title4 h=1.2 f=zapf j=c 'Neural Network Architecture';

title5 h=1.2 f=zapf j=c 'GLIM: General Linear Regression';

* Confusion matrix to evaluate the accuracy of the general linear

discriminant function i.e. frequency table of the actual by the

predicted target levels;

proc tabulate data=pred format=12.0;

class species i_species;

table species all='Total', i_species all='Total' / misstext='0' rts=12;

label species = 'Species'

i_species = 'Into: Species';

keylabel n = ' ';

run;

* Macro to perform classification modeling for each network architecture;

%macro neuralnets(arch=,titles=);

proc neural data=dmddata dmdbcat=dmcdata; * Read data mining data set;

input sepallen sepalwid petallen petalwid / level=int;

* input interval variable;

target species / level=nom error=mbernoulli; * nominal-valued target

variable using multiple Bernoulli error function;

archi &arch hidden=5;

train tech = quanew; * quasi-Newton optimization technique;

score data=iris out=pred nodmdb; * Create a score output data set;

run;

title5 h=1.2 f=zapf j=c "&titles";

* Confusion matrix to evaluate the accuracy of the classification model;

proc tabulate data=pred format=12.0;

class species i_species;

table species all='Total', i_species all='Total' / misstext='0' rts=12;

label species = 'Species'

i_species = 'Into: Species';

keylabel n = ' ';

run;

%mend;

%neuralnets(arch=mlp, titles=MLP: Multiple Layer Preceptron);

%neuralnets(arch=orbfun,titles=ORBFUN:

Ordinary Radial Basis Function with Unequal Widths);

%neuralnets(arch=nrbfun,titles=NRBFUN:

Normalized Radial Basis Function with Unequal Heights and Widths);

%neuralnets(arch=nrbfeq,titles=NRBFEQ:

Normalized Radial Basis Function with Equal Widths and Heights);

%neuralnets(arch=nrbfeh,titles=NRBFEH:

Normalized Radial Basis Function with Equal Heights and Unequal Widths);

title2 h=1.2 f=zapf j=c

'Classifying Iris Wild Flower Species by Sepal Length and Width,

Petal Length and Width';

title4 h=1.2 f=zapf j=c 'Canonical Discriminant Analysis';

* Canonical discriminant analysis of iris data set that calculates the

smallest number of linear combinations of the input variables in order

to maximize the separation of the target group means and to visualize

the relationship between the target groups and the canonical components.

Also, since there are three different target groups therefore, there are

two separate canonical discriminant functions;

proc candisc data=iris out=disc all;

class species;

var sepallen sepalwid petallen petalwid;

run;

axis1 label=(c=black f=swiss h=1 j=c

'First Canonical Discriminant Score')

value=(h=1)

width=3

minor=none;

axis2 label=(c=black f=swiss h=1 j=r a=90

'Second Canonical Discriminant Score')

value=(h=1)

width=3

minor=none;

legend1 label=none

value=(font=zapf)

across=5;

symbol1 i=none value=dot color=black height=1 ;

symbol2 i=none value=circle color=black height=1 ;

symbol3 i=none value=star color=black height=1 ;

proc gplot data=disc;

plot can1*can2=species / haxis=axis1

vaxis=axis2

legend=legend1;

footnote h=1.2 f=swissl j=l

"Source: SAS Users's Guide: Statistics The DISCRIM Procedure";

run;

* The following SAS code produces the decision boundaries for the various

neural network architectures between petal width and petal length across

the three separate iris wild flower types;

* Data step to produce a fine grid of both petal length and petal width

that is needed in the contour plot;

data grid;

do petalwid = 0 to 30;

do petallen = 0 to 70;

output;

end;

end;

run;

* Annotate data set to assign both the symbol and color to each wild

flower species;

data anno;

set iris;

x = petalwid;

y = petallen;

function = 'symbol';

xsys='2'; ysys='2';

size = 1;

if species = 1 then do;

text = 'DOT';

color = 'black';

end;

else if species = 2 then do;

text = 'o';

color = 'black';

style = 'zapf';

end;

else if species = 3 then do;

text = 'x';

color = 'black';

style = 'zapf';

end;

when = 'B';

run;

* Create analysis file;

data final;

set iris

grid;

run;

/* Data Mining SAS procedure */

proc dmdb data=final dmdbcat=dmcdata out=dmddata;

class species;

var petallen petalwid;

run;

%macro neuralnets(design=,title=,title2=,discrim=);

/* SAS Neural network procedure */

proc neural data=dmddata dmdbcat=dmcdata; * Read data mining data set;

input petallen

petalwid / level=int; * input interval variable;

target species / level=nom; * Nominal-valued target variable;

%if &design NE glim %then %do;

archi &design hidden=5; * design with a five hidden layer units;

%end;

%else %do;

archi &design; * GLIM design with no hidden layer units;

%end;

train tech = quanew; * quasi-Newton optimization technique;

score data=iris out=pred nodmdb; * Create a score output data sets;

score data=grid out=gridout nodmdb;

run;

%if %UPCASE(&discrim) eq YES %then %do;

* Discriminate analysis procedure specifying a test of homogeneity of

within covariance matrices test to determine if there is equal

variability within the covariance matrices and assuming equal prior

probabilities within each target level;

proc discrim data=final pool=test out=gridout;

class species;

var petallen petalwid;

run;

* Decision boundaries based on the posterior probabilities between the

three wild flower groups for quadratic discriminant model;

data gridout;

set gridout;

d1 = SETOSA - MAX(VERSICOLOR, VIRGINICA);

d7 = VERSICOLOR - MAX(SETOSA, VIRGINICA);

run;

%end;

%else %do;

* Decision boundaries based on the posterior probabilities between the

three wild flower groups for the neural network classification designs;

data gridout;

set gridout;

d1 = p_speciesSETOSA - MAX(p_speciesVERSICOLOR, p_speciesVIRGINICA);

d7 = p_speciesVERSICOLOR - MAX(p_speciesSETOSA, p_speciesVIRGINICA);

run;

%end;

goptions reset=all device=win;

title h=1.5 f=zapf j=c

'Discriminant Analysis of Fisher (1936) Iris Data';

title2 h=1.2 f=zapf j=c "Neural Network Architecture: &title";

title3 h=1.2 f=zapf j=c "&title2";

axis1 label=(c=black f=swiss h=1.1 j=c 'Petal Width')

value=(h=1 f=swiss)

width=3

minor=none width=3

order=(0 to 30 by 10);

axis2 label=(c=black f=swiss h=1.1 j=r a=90 'Petal Length')

value=(h=1 f=swiss)

width=3

minor=none width=3

offset=(1)

order=(0 to 70 by 10);

proc gcontour data=gridout gout=dbplot;

plot petallen * petalwid = d1 / levels=0 anno=anno haxis=axis1 vaxis=axis2

nolegend caxis=bl name = '&design.1';

plot petallen * petalwid = d7 / levels=0 anno=anno haxis=axis1 vaxis=axis2

nolegend caxis=bl name = '&design.2';

footnote1 h=1.1 f=swissl j=l "Setosa: Dot";

footnote2 h=1.1 f=swissl j=l "Versicolor: Circle";

footnote3 h=1.1 f=swissl j=l "Virginica: Cross";

footnote5 h=1.1 f=swissl j=l

"Source: SAS User's Guide: Statistics, The DISCRIM Procedure";

run;

quit;

goptions reset=all;

proc greplay nofs tc=sashelp.templt igout=dbplot;

template irisplot;

treplay 1:'&design.1' 1:'&design.2';

run;

%mend;

%neuralnets(design=%str( ),

title=Quadradic Discriminate Function,

title2=%str( ), discrim=YES);

%neuralnets(design=glim,

title=GLIM: General Linear Regression,

title2=%str( ), discrim=%str( ));

%neuralnets(design=mlp,

title=MLP: Multiple Layer Preceptron,

title2=Based on Five Hidden Layer Units, discrim=%str( ));

%neuralnets(design=orbfun,

title=ORBFUN: Ordinary Radial Basis Function with Unequal

Widths,

title2=Based on Five Hidden Layer Units, discrim=%str( ));

%neuralnets(design=nrbfun,

title=NRBFUN: Normalized Radial Basis Function with Unequal

Heights and Widths,

title2=Based on Five Hidden Layer Units, discrim=%str( ));

%neuralnets(design=nrbfeq,

title=NRBFEQ: Normalized Radial Basis Function with Equal

Widths and Heights,

title2=Based on Five Hidden Layer Units, discrim=%str( ));

%neuralnets(design=nrbfeh,

title=NRBFEH: Normalized Radial Basis Function with Equal

Heights and Unequal Widths,

title2=Based on Five Hidden Layer Units, discrim=%str( ));

quit;