******************************************************************

This code executes the second of two discretization processes.

Here we divide the variable into groups of approximately equal frequency.
Each value range is then assigned the average of the dependent variable, and
t-tests are run to determine which groupings differ significantly. After
significance has been determined and groups with insignificant differences
have been collapsed, ordinal and odds-ratio values are assigned as above.

******************************************************************;

*******************PREDICTOR 1 - AGE *********************************;

* Name of the modeling file used by the steps that follow;
%let d1 = jlp.modelingfile1;

* List the contents of the modeling file;
proc contents data=&d1;
run;

* Split age into 10 groups of approximately equal frequency.
  PROC RANK writes the group number (0-9) to a new variable named rank
  in the output data set ranked and prints nothing itself;
proc rank data=&d1 groups=10 ties=high out=ranked;
    var age;
    ranks rank;
run;

* Frequency table of the new rank variable;
proc freq data=ranked;
    tables rank;
run;

* Summarise the independent variable (age) and the dependent variable
  (GOODBAD) within each rank group to check that the ranking worked as
  expected. PROC SUMMARY itself prints nothing and the results go to summaryrank.
  The MISSING option keeps a missing rank as its own class level.
  Each statistic lists two output names, in VAR order:
  the first is for age (_indp) and the second for GOODBAD (_dep).
  The names were previously fused together, which left the step invalid and
  left avg_dep, std_dep and the other names used by later steps undefined;
proc summary data=ranked missing mean std min max;
    class rank;
    var age GOODBAD;
    output out=summaryrank
           mean = avg_indp avg_dep
           std  = std_indp std_dep
           min  = min_indp min_dep
           max  = max_indp max_dep;
run;

* Print the summary for review;
proc print data=summaryrank;
run;

* Note that the data set referenced here is whatever the Proc Summary wrote out;
**DO NOT CHANGE...EVER!;

* p-value threshold read by the t-test collapse step;
%let pval_col = 0.15;

proc sort data=summaryrank;
    by rank;
run;

* Keep only the per-rank rows (_TYPE_ = 1), tag them with the predictor
  name, and rename the observation count to numobs;
data summaryrank;
    set summaryrank;
    if _TYPE_ = 1;
    var = "age";
    rename _freq_ = numobs;
run;

* Collapse adjacent rank groups whose dependent-variable means are not
  significantly different.
  Input is summaryrank with one row per rank level, processed in BY var order.
  Output is temp with one row per surviving group, carrying rank (the last rank
  in the group), numobs, avg_indp, min_indp, max_indp, avg_dep, std_dep and
  the pvalue of the most recent t-test.
  Each incoming row is compared with the running accumulated group using an
  unequal-variance t-test. When the p-value is at or below the pval_col
  macro variable the accumulated group is written out and the current row
  starts a new group. Otherwise the row is folded into the accumulated group.
  Fixes in this version: keywords that had been fused to the next token
  (set, by, if) are separated so the step compiles, and the end-of-variable
  flush now runs after the if/else so a variable with a single non-missing
  level is still written out;
data temp;
    set summaryrank;
    by var;

    * Running totals for the accumulated group (suffix 1) and a holding
      area for the current row (suffix 2), kept across rows;
    retain _finit _rnk1 _rnk2 _freq1 _freq2 _aindp1 _aindp2
           _mindp1 _mindp2 _mxindp1 _mxindp2 _adep1 _adep2
           _sdep1 _sdep2 _sdenom1 _sdenom2;

    * _finit flags whether the first non-missing level has been seen yet.
      The t-tests start from that level onward because missing levels are
      written out directly as their own level;
    if first.var then _finit = 0;

    * Any missing rank (ordinary or special) compares at or below .Z;
    if rank <= .Z then output;
    else do;
        if _finit = 0 then do;
            * First non-missing level: seed the accumulated group from this row.
              Means are stored as sums (count times mean) and the spread as a
              sum of squares so that later rows can simply be added in;
            _finit   = 1;
            _rnk1    = rank;
            _freq1   = numobs;
            _aindp1  = numobs*avg_indp;
            _mindp1  = min_indp;
            _mxindp1 = max_indp;
            _adep1   = numobs*avg_dep;
            if std_dep = . then _sdep1 = 0;
            else _sdep1 = (numobs-1)*std_dep*std_dep;
            _sdenom1 = numobs-1;
        end;
        else do;
            * Two-sample t-test of the accumulated group against the current row.
              _w1 and _w2 are each side's variance divided by its count, and
              _df combines them in the Welch-Satterthwaite form;
            _w1 = (_sdep1/_sdenom1)/_freq1;
            _w2 = (std_dep**2)/numobs;
            _df = ((_w1+_w2)**2)/((_w1**2)/(_freq1-1)+(_w2**2)/(numobs-1));
            _t  = abs(_adep1/_freq1-avg_dep)/sqrt(_w1+_w2);
            pvalue = (1-probt(_t,round(_df,1)))*2;

            * If the t-test is significant then output the accumulated group.
              Otherwise continue accumulating;
            if pvalue <= &pval_col then do;
                * Store current row;
                _rnk2    = rank;
                _freq2   = numobs;
                _aindp2  = numobs*avg_indp;
                _mindp2  = min_indp;
                _mxindp2 = max_indp;
                _adep2   = numobs*avg_dep;
                if std_dep = . then _sdep2 = 0;
                else _sdep2 = (numobs-1)*std_dep*std_dep;
                _sdenom2 = numobs-1;

                * Overwrite the row variables with the accumulated group and output;
                rank     = _rnk1;
                numobs   = _freq1;
                avg_indp = _aindp1/_freq1;
                min_indp = _mindp1;
                max_indp = _mxindp1;
                avg_dep  = _adep1/_freq1;
                std_dep  = sqrt(_sdep1/_sdenom1);
                output;

                * The stored current row becomes the new accumulated group;
                _rnk1    = _rnk2;
                _freq1   = _freq2;
                _aindp1  = _aindp2;
                _mindp1  = _mindp2;
                _mxindp1 = _mxindp2;
                _adep1   = _adep2;
                _sdep1   = _sdep2;
                _sdenom1 = _sdenom2;
            end;
            else do;
                * Not significant: fold the current row into the accumulated group.
                  The group takes the rank and max of the newest row and
                  keeps the min it started with;
                _rnk1    = rank;
                _freq1   = _freq1+numobs;
                _aindp1  = _aindp1+numobs*avg_indp;
                _mxindp1 = max_indp;
                _adep1   = _adep1+numobs*avg_dep;
                if std_dep ne . then _sdep1 = _sdep1+(numobs-1)*std_dep*std_dep;
                _sdenom1 = _sdenom1+numobs-1;
            end;
        end;

        * If this is the last row for the current variable then output whatever
          is still accumulated. This runs for the first-initialisation row
          as well, so a lone non-missing level is not lost;
        if last.var then do;
            rank     = _rnk1;
            numobs   = _freq1;
            avg_indp = _aindp1/_freq1;
            min_indp = _mindp1;
            max_indp = _mxindp1;
            avg_dep  = _adep1/_freq1;
            std_dep  = sqrt(_sdep1/_sdenom1);
            output;
        end;
    end;

    * Work variables are not kept and pvalue is deliberately retained in the output;
    drop _finit _rnk1 _rnk2 _freq1 _freq2 _aindp1 _aindp2
         _mindp1 _mindp2 _mxindp1 _mxindp2 _adep1 _adep2
         _sdep1 _sdep2 _sdenom1 _sdenom2 _w1 _w2 _df _t /*pvalue*/;
run;

***********End of secret code. You can resume normal procedures.;

* Print the collapsed groups for review;
proc print data=temp;
run;

proc sort data=temp;
    by rank;
run;

* Plot the average of the dependent variable against rank, with square
  markers joined by a black line.
  The PLOT keyword was previously fused to the plot request;
symbol1 v=square color=black i=join;

proc gplot data=temp;
    plot avg_dep*rank;
run;
quit;

* Recode the original ranks onto the collapsed group labels.
  Rank 1 is relabelled 2 and rank 8 is relabelled 9, and every other value
  (including missing) is left as it is.
  NOTE(review): this mapping presumably mirrors the groups printed from
  temp, which labels a collapsed group with its last rank. Confirm against
  that output whenever the data changes;
data ranked1;
    set ranked;
    select (rank);
        when (1) rank = 2;
        when (8) rank = 9;
        otherwise;
    end;
run;

* Put both inputs in rank order for the merge;
proc sort data=temp;
    by rank;
run;

proc sort data=ranked1;
    by rank;
run;

* Check the recoded rank distribution;
proc freq data=ranked1;
    tables rank;
run;

* Attach the group-level summary columns from temp to every observation
  in ranked1 that has the same rank;
data disc2age;
    merge ranked1 temp;
    by rank;
run;

*****At this point the ranks can be assigned as 1, 2, 3...;

* ordeqage is the ordinal group number for the collapsed rank.
  Ranks not listed below leave ordeqage missing.
  odseqage is the odds of the group average of the dependent variable
  and lodseqage is the log of those odds;
data disc2age1;
    set disc2age;
    select (rank);
        when (0) ordeqage = 1;
        when (2) ordeqage = 2;
        when (3) ordeqage = 3;
        when (4) ordeqage = 4;
        when (5) ordeqage = 5;
        when (6) ordeqage = 6;
        when (7) ordeqage = 7;
        when (9) ordeqage = 8;
        otherwise;
    end;
    odseqage  = (avg_dep)/(1-avg_dep);
    lodseqage = log ((avg_dep)/(1-avg_dep));
run;

* Note: at this point the file carries a lot of leftover summary columns
  that need to be removed.
  The names in the drop list were previously fused together and are now
  listed one by one;
data disc2age1 (drop = _TYPE_ avg_dep avg_indp max_dep max_indp
                       min_dep min_indp numobs
                       pvalue rank std_dep std_indp var);
    set disc2age1;
run;

* Frequency tables of the three derived variables.
  The variable names were previously fused into a single token;
proc freq data=disc2age1;
    tables ordeqage odseqage lodseqage;
run;

* Summary statistics of age within each ordinal group.
  The CLASS keyword was previously fused to the variable name;
proc means data=disc2age1;
    class ordeqage;
    var age;
run;