AutoReg

Sample code

For a better comprehension of the procedure, here you can find an example and its explanation.

Data is taken from the website UC Irvine Machine Learning Repository (a very important resource for people that need real data for a test). We used the dataset German Credit Data about good and bad creditors.
The purpose of these pages is to present the procedure, so we'll not explain in details the characteristics of data or its formats: for more information, please read the dedicated page.
Also, this example does not consider any cost matrix.

Part one - Loading and fast data analysis
First of all, we create the input sas table from data file.


filename stlog 'C:\ ... PATH ... \german.data';


data gcd;
 infile stlog;
 input a1 $ a2 a3 $ a4 $ a5 a6 $ a7 $ a8 a9 $ a10 $
	a11 a12 $ a13 a14 $ a15 $ a16 a17 $ a18 a19 $ a20 $ a21;
run;
/*
NOTE: 1000 records were read from the infile STLOG.
      The minimum record length was 77.
      The maximum record length was 81.
NOTE: The data set WORK.GCD has 1000 observations and 21 variables.
NOTE: DATA statement used (Total process time):
      real time           0.14 seconds
      cpu time            0.04 seconds
*/

To decide the category of every variable, we proceed with a quick analysis of the columns.


/*********** Numeric Variables ***********/
proc sql;
 create table r_a2 as
  select min(a2) as min, max(a2) as max, mean(a2) as media,
	var(a2) as var, count(distinct a2) as num_dist, count(a2) as num
  from gcd;
quit;
/*
Duration in month:
 33 distinct values on 1000 rows, min=4, max=72, mean=20.9
 X variable.
*/

proc sql;
 create table r_a5 as
  select min(a5) as min, max(a5) as max, mean(a5) as media,
	var(a5) as var, count(distinct a5) as num_dist, count(a5) as num
  from gcd;
quit;
/*
Credit amount:
 921 distinct values on 1000 rows, min=250, max=18424, mean=3271.26
 Q variable.
*/

proc sql;
 create table r_a8 as
  select min(a8) as min, max(a8) as max, mean(a8) as media,
	var(a8) as var, count(distinct a8) as num_dist, count(a8) as num
  from gcd;
quit;

title 'a8';
proc freq data=gcd notitle;
 tables a8 / missing;
run;
title;
/*
Installment rate in percentage of disposable income :
 4 distinct values on 1000 rows, min=1, max=4, mean=2.973
 O variable.
*/

proc sql;
 create table r_a11 as
  select min(a11) as min, max(a11) as max, mean(a11) as media,
	var(a11) as var, count(distinct a11) as num_dist, count(a11) as num
  from gcd;
quit;

title 'a11';
proc freq data=gcd notitle;
 tables a11 / missing;
run;
title;
/*
Present residence since :
 4 distinct values on 1000 rows, min=1, max=4, mean=2.845
 O variable.
*/

proc sql;
 create table r_a13 as
  select min(a13) as min, max(a13) as max, mean(a13) as media,
	var(a13) as var, count(distinct a13) as num_dist, count(a13) as num
  from gcd;
quit;
/*
Age in years:
 53 distinct values on 1000 rows, min=19, max=75, mean=35.5
 X variable.
*/

proc sql;
 create table r_a16 as
  select min(a16) as min, max(a16) as max, mean(a16) as media,
	var(a16) as var, count(distinct a16) as num_dist, count(a16) as num
  from gcd;
quit;

title 'a16';
proc freq data=gcd notitle;
 tables a16 / missing;
run;
title;
/*
Number of existing credits at this bank:
 4 distinct values on 1000 rows, min=1, max=4, mean=1.4
 O variable.
*/

proc sql;
 create table r_a18 as
  select min(a18) as min, max(a18) as max, mean(a18) as media,
	var(a18) as var, count(distinct a18) as num_dist, count(a18) as num
  from gcd;
quit;

title 'a18';
proc freq data=gcd notitle;
 tables a18 / missing;
run;
title;
/*
Number of people being liable to provide maintenance for:
 2 distinct values on 1000 rows, min=1, max=2, mean=1.155
 C variable.
*/

proc sql;
 create table r_a21 as
  select min(a21) as min, max(a21) as max, mean(a21) as media,
	var(a21) as var, count(distinct a21) as num_dist, count(a21) as num
  from gcd;
quit;
/*
Target variable (1=Good, 2=Bad):
 2 distinct values on 1000 rows, min=1, max=2, mean=1.3
 R variable.
*/



/*********** Character Variables ***********/
proc sql;
 create table r2_a1 as
  select a1, count(*) as num
  from gcd
  group by a1;
quit;
/*
Status of existing checking account:
 Num: OK
 NOTE: Table WORK.R2_A1 created, with 4 rows and 2 columns.
 O variable (to convert in numeric format and insert A14 as particular value).
*/

proc sql;
 create table r2_a3 as
  select a3, count(*) as num
  from gcd
  group by a3;
quit;
/*
Credit history:
 Num: OK
 NOTE: Table WORK.R2_A3 created, with 5 rows and 2 columns.
 K variable.
*/

proc sql;
 create table r2_a4 as
  select a4, count(*) as num
  from gcd
  group by a4;
quit;
/*
Purpose:
 Num: Some low frequency
 NOTE: Table WORK.R2_A4 created, with 10 rows and 2 columns.
 K variable.
*/

proc sql;
 create table r2_a6 as
  select a6, count(*) as num
  from gcd
  group by a6;
quit;
/*
Savings account/bonds:
 Num: OK
 NOTE: Table WORK.R2_A6 created, with 5 rows and 2 columns.
 O variable (to convert in numeric format and insert A65 as particular value).
*/

proc sql;
 create table r2_a7 as
  select a7, count(*) as num
  from gcd
  group by a7;
quit;
/*
Present employment since:
 Num: OK
 NOTE: Table WORK.R2_A7 created, with 5 rows and 2 columns.
 O variable (we must convert it in numeric format).
*/

proc sql;
 create table r2_a9 as
  select a9, count(*) as num
  from gcd
  group by a9;
quit;
/*
Personal status and sex:
 Num: OK
 NOTE: Table WORK.R2_A9 created, with 4 rows and 2 columns.
 C variable.
*/

proc sql;
 create table r2_a10 as
  select a10, count(*) as num
  from gcd
  group by a10;
quit;
/*
Other debtors/guarantors:
 Num: OK
 NOTE: Table WORK.R2_A10 created, with 3 rows and 2 columns.
 C variable.
*/

proc sql;
 create table r2_a12 as
  select a12, count(*) as num
  from gcd
  group by a12;
quit;
/*
Property:
 Num: OK
 NOTE: Table WORK.R2_A12 created, with 4 rows and 2 columns.
 C variable.
*/

proc sql;
 create table r2_a14 as
  select a14, count(*) as num
  from gcd
  group by a14;
quit;
/*
Other installment plans:
 Num: OK
 NOTE: Table WORK.R2_A14 created, with 3 rows and 2 columns.
 C variable.
*/

proc sql;
 create table r2_a15 as
  select a15, count(*) as num
  from gcd
  group by a15;
quit;
/*
Housing:
 Num: OK
 NOTE: Table WORK.R2_A15 created, with 3 rows and 2 columns.
 C variable.
*/

proc sql;
 create table r2_a17 as
  select a17, count(*) as num
  from gcd
  group by a17;
quit;
/*
Job:
 Num: A171 with only 22 rows, OK all the other classes
 NOTE: Table WORK.R2_A17 created, with 4 rows and 2 columns.
 K variable.
*/

proc sql;
 create table r2_a19 as
  select a19, count(*) as num
  from gcd
  group by a19;
quit;
/*
Telephone:
 Num: OK
 NOTE: Table WORK.R2_A19 created, with 2 rows and 2 columns.
 C variable.
*/

proc sql;
 create table r2_a20 as
  select a20, count(*) as num
  from gcd
  group by a20;
quit;
/*
Foreign worker:
 Num: OK
 NOTE: Table WORK.R2_A20 created, with 2 rows and 2 columns.
 C variable.
*/


Part two - Modifying data
We proceed with changes to O and R variables.


data gcd2;
set gcd;
 /* We convert a1 (O) in numeric variable */
 if compress(upcase(a1)) = 'A11' then a1b = 1;
  else if compress(upcase(a1)) = 'A12' then a1b = 2;
   else if compress(upcase(a1)) = 'A13' then a1b = 3;
    else if compress(upcase(a1)) = 'A14' then a1b = 4;
     else a1b = 99;
 /* We convert a6 (O) in numeric variable */
 if compress(upcase(a6)) = 'A61' then a6b = 1;
  else if compress(upcase(a6)) = 'A62' then a6b = 2;
   else if compress(upcase(a6)) = 'A63' then a6b = 3;
    else if compress(upcase(a6)) = 'A64' then a6b = 4;
     else if compress(upcase(a6)) = 'A65' then a6b = 5;
      else a6b = 99;
 /* We convert a7 (O) in numeric variable */
 if compress(upcase(a7)) = 'A71' then a7b = 1;
  else if compress(upcase(a7)) = 'A72' then a7b = 2;
   else if compress(upcase(a7)) = 'A73' then a7b = 3;
    else if compress(upcase(a7)) = 'A74' then a7b = 4;
     else if compress(upcase(a7)) = 'A75' then a7b = 5;
      else a7b = 99;
 /* We convert a21 (R) in dummy variable */
 if a21 = 1 /* Good */ then a21b = 0;
  else if a21 = 2 /* Bad */ then a21b = 1;
run;


Part three - Creation of input data
Based on previous dataset, we create the tables required to the procedure.


/* We extract the column names from the table */
proc contents data=gcd2 out=gcd3 noprint;
run;


/* We keep only a part of the output */
data gcd3;
set gcd3;
keep name type format formatl formatd;
run;


/* We add the column concerning the use of the variable */
data gcd3;
set gcd3;
format utilizzo $1.;
 utilizzo = 'n';
/* We value the new column with our choice */
 if compress(name) = 'a1b' then utilizzo = 'o';
 if compress(name) = 'a2' then utilizzo = 'x';
 if compress(name) = 'a3' then utilizzo = 'k';
 if compress(name) = 'a4' then utilizzo = 'k';
 if compress(name) = 'a5' then utilizzo = 'q';
 if compress(name) = 'a6b' then utilizzo = 'o';
 if compress(name) = 'a7b' then utilizzo = 'o';
 if compress(name) = 'a8' then utilizzo = 'o';
 if compress(name) = 'a9' then utilizzo = 'c';
 if compress(name) = 'a10' then utilizzo = 'c';
 if compress(name) = 'a11' then utilizzo = 'o';
 if compress(name) = 'a12' then utilizzo = 'c';
 if compress(name) = 'a13' then utilizzo = 'x';
 if compress(name) = 'a14' then utilizzo = 'c';
 if compress(name) = 'a15' then utilizzo = 'c';
 if compress(name) = 'a16' then utilizzo = 'o';
 if compress(name) = 'a17' then utilizzo = 'k';
 if compress(name) = 'a18' then utilizzo = 'c';
 if compress(name) = 'a19' then utilizzo = 'c';
 if compress(name) = 'a20' then utilizzo = 'c';
 if compress(name) = 'a21b' then utilizzo = 'r';
run;


/* Here we insert the particular values for variables */
data spec;
 length variabile $ 50.;
 length condizione $ 1000.;
 length classe $ 20.;
  variabile = 'a1b';
  condizione = 'a1b = 4';
  classe = 'Missing';
 output;
  variabile = 'a1b';
  condizione = 'a1b = 99';
  classe = 'Anomalo';
 output;
  variabile = 'a6b';
  condizione = 'a6b = 5';
  classe = 'Missing';
 output;
  variabile = 'a6b';
  condizione = 'a6b = 99';
  classe = 'Anomalo';
 output;
  variabile = 'a7b';
  condizione = 'a7b = 99';
  classe = 'Anomalo';
 output;
run;


data escc;
 length var1 $ 50.;
 length var2 $ 50.;
 if _n_ < 1;
run;


/* We create the execution library */
libname loc "C:\ ... PATH ... \Data_In";

/* And we insert input files */
/* Data file */
data loc.db;
set gcd2;
run;

/* File with type of variables */
data loc.db_var;
set gcd3;
run;

/* Particular conditions file */
data loc.db_cond;
set spec;
run;

/* Conditional exclusions file */
data loc.db_esccon;
set escc;
run;


Part four - The call
After running all of the three macro definitions (simpson_c, mod_b_meno_a and classizz), we make the call.
We want to have the possibility to observe the single steps of the process, so we included in the code below some options useful for debugging (mprint, symbolgen and mlogic) and wrote the two logs generated into two distinct text files.
The remaining input parameters are entered according to our intentions.


option mprint symbolgen mlogic;

%classizz(in=loc.db, taglio_correlazione=0.4, distribuzione=binomial, alfa=0.05, passo=15,
	   log_0 = "C:\ ... PATH ... \Log_Out\Out_0.log",
	   output_log = "C:\ ... PATH ... \Log_Out\Out_1.log",
	   max_giri=0, max_format=$5000, simpson=1);

option nomprint nosymbolgen nomlogic;


Part five - The log and the output
The two logs generated are too long (over 3,000 rows and over 300,000, respectively), so they are not reported here.
On a more detailed analysis, you can have a closer look at the procedure. Even if you are not interested in such analysis, we recommend that you search into the logs for errors or warnings: the only anomalies should be those related to some sequence of invalid macro variables as a consequence of tables with no rows (see example below).


MLOGIC(CLASSIZZ):  %LET (variable name is NUORI)
SYMBOLGEN:  Macro variable NUORI resolves to        0
MPRINT(CLASSIZZ):   proc sql noprint;
SYMBOLGEN:  Macro variable V_INSERITA resolves to CL_A1B
SYMBOLGEN:  Macro variable NUORI resolves to 0
SYMBOLGEN:  Macro variable NUORI resolves to 0
SYMBOLGEN:  Macro variable NUORI resolves to 0
SYMBOLGEN:  Macro variable IN resolves to loc.db
MPRINT(CLASSIZZ):   select CL_A1B_c, minimo, massimo into :nuov1 - :nuov0, :nuomi1 - :nuomi0, :nuoma1 
- :nuoma0 from loc.db_nuoco2;
WARNING: INTO Clause :nuov1 thru :nuov0 does not specify a valid sequence of macro variables.
WARNING: INTO Clause :nuomi1 thru :nuomi0 does not specify a valid sequence of macro variables.
WARNING: INTO Clause :nuoma1 thru :nuoma0 does not specify a valid sequence of macro variables.
NOTE: No rows were selected.
MPRINT(CLASSIZZ):   quit;
NOTE: PROCEDURE SQL used (Total process time):
      real time           0.00 seconds
      cpu time            0.00 seconds

Output files (as described in dedicated page) are a lot.
The most interesting ones from a statistical point of view concern the final model, but also summary tables are very useful for applications.
The first group includes the estimated model (dataset &in._mcorr):


Parameter Level1 DF Estimate StdErr LowerWaldCL UpperWaldCL ChiSq ProbChiSq
INTERCEPT   1 0.0871419419 0.6398 -1.1667 1.3410 0.02 0.8917
CL_A1B_C -1 1 -0.6878394348 0.3595 -1.3925 0.0168 3.66 0.0557
CL_A1B_C 1 1 0.9365198021 0.3453 0.2597 1.6133 7.36 0.0067
CL_A1B_C 2 1 0.5423718235 0.3486 -0.1408 1.2256 2.42 0.1197
CL_A1B_C 3 0 0.0000000000 0.0000 0.0000 0.0000 . .
CL_K_A3_C 1 1 -1.6149056559 0.2974 -2.1978 -1.0320 29.48 <.0001
CL_K_A3_C 2 1 -0.9109392644 0.2619 -1.4242 -0.3977 12.10 0.0005
CL_K_A3_C 4 0 0.0000000000 0.0000 0.0000 0.0000 . .
CL_A2_C 1 1 -1.5230586715 0.2740 -2.0600 -0.9861 30.91 <.0001
CL_A2_C 3 1 -0.7443621414 0.1857 -1.1083 -0.3805 16.07 <.0001
CL_A2_C 7 0 0.0000000000 0.0000 0.0000 0.0000 . .
CL_K_A4_C 1 1 -1.0327863506 0.1948 -1.4146 -0.6509 28.10 <.0001
CL_K_A4_C 4 1 -0.6236875600 0.1975 -1.0108 -0.2366 9.97 0.0016
CL_K_A4_C 6 0 0.0000000000 0.0000 0.0000 0.0000 . .
CL_A6B_C -3 1 0.2796669659 0.5191 -0.7378 1.2972 0.29 0.5901
CL_A6B_C 1 1 1.0564306705 0.4790 0.1176 1.9952 4.86 0.0274
CL_A6B_C 4 0 0.0000000000 0.0000 0.0000 0.0000 . .
CL_A13_C 1 1 0.5725908596 0.1914 0.1974 0.9478 8.95 0.0028
CL_A13_C 3 0 0.0000000000 0.0000 0.0000 0.0000 . .
SCALE   0 1.0000000000 0.0000 1.0000 1.0000 _ _


and the measures of goodness of fit (dataset &in._smcorr):


Criterion DF Value ValueDF
Deviance 987 974.5303 0.9874
Scaled Deviance 987 974.5303 0.9874
Pearson Chi-Square 987 992.7954 1.0059
Scaled Pearson X2 987 992.7954 1.0059
Log Likelihood _ -487.2651 _
AIC _ 1000.5303 _


In the second group, we can learn about the choice of variables step by step with the dataset &in._passi (below you find in bold the variable chosen at each step):


Passo Modello
1 CL_A1B
2 CL_A1B CL_K_A3
3 CL_A1B CL_A2 CL_K_A3
4 CL_A1B CL_A2 CL_K_A3 CL_K_A4
5 CL_A1B CL_A2 CL_A6B CL_K_A3 CL_K_A4
6 CL_A13 CL_A1B CL_A2 CL_A6B CL_K_A3 CL_K_A4


If you want to apply the obtained model to new data, you can use the table &in._zgri:


Nome Level1 Kvar_d Estimate Condizione DF Utilizzo
A13 3 25 < A13 <= Max 0.0000000000 25 < A13 0 X
A13 1 Min < A13 <= 25 0.5725908596 A13 <= 25 1 X
A1B 3 3 0.0000000000 A1B = 3 0 O
A1B 2 2 0.5423718235 A1B = 2 1 O
A1B 1 1 0.9365198021 A1B = 1 1 O
A1B -1 Missing -0.6878394348 a1b = 4 1 O
A2 7 24 < A2 <= Max 0.0000000000 24 < A2 0 X
A2 3 10 < A2 <= 24 -0.7443621414 10 < A2 <= 24 1 X
A2 1 Min < A2 <= 10 -1.5230586715 A2 <= 10 1 X
A3 4   0.0000000000 A3 in ('A31', 'A30') 0 K
A3 2   -0.9109392644 A3 in ('A33', 'A32') 1 K
A3 1   -1.6149056559 A3 in ('A34') 1 K
A4 6   0.0000000000 A4 in ('A45', 'A40', 'A410', 'A46') 0 K
A4 4   -0.6236875600 A4 in ('A42', 'A44', 'A49') 1 K
A4 1   -1.0327863506 A4 in ('A48', 'A41', 'A43') 1 K
A6B 4 4 0.0000000000 A6B = 4 0 O
A6B 1 1 <= CL_A6B <= 3 1.0564306705 1 <= A6B <= 3 1 O
A6B -3 Missing 0.2796669659 a6b = 5 1 O
INTERCEPT     0.0871419419 1 1  



Part Six - Possible use of output files: check the accuracy of the model
The tables described above (and the others output datasets) can be used to observe the model found or to apply it to new data.
In the code below, we present one of their possible uses: in particular we'll use table &in._zgri in order to apply the model found on a dataset (to make it easier, it will be the same dataset used in estimation) in order to evaluate accuracy of our predictions.
This test will be supported by the ROC curve of the estimations: the macro that generates it, will not be shown here (try and write it down!).


/* We copy files on work library */
libname locale "C:\ ... PATH ... \Data_In";

data db;
set locale.db_dcorr;
run;

data zgri;
set locale.db_zgri;
run;

libname locale;


/* We take the coefficients of the variables */
proc sql noprint;
 select count(*) into :c
 from zgri;
quit;

proc sql noprint;
 select count(distinct nome) into :cn
 from zgri;
quit;

%let c = &c;
%let cn = &cn;

proc sql noprint;
 select nome, estimate, condizione, utilizzo into
	:nome1 - :nome&c, :stima1 - :stima&c, :cond1 - :cond&c, :util1 - :util&c
 from zgri;
quit;

proc sql noprint;
 select distinct nome into :nom1 - :nom&cn
 from zgri;
quit;



%macro giro(dsin=db);

data &dsin.2;
set &dsin;
/* Inizialization */
 %do i=1 %to &cn;
  &&nom&i.._b = 0;
 %end;
/* Valorization --> N.B.: particulars conditions overwrite normal conditions */
 %do i=1 %to &c;
  %if "&&util&i" = "X" or "&&util&i" = "O" or "&&util&i" = "C" or "&&util&i" = "K" %then %do;
    if &&cond&i then &&nome&i.._b = &&stima&i;
   %end;
  %if "&&util&i" = "Q" %then %do;
    &&nome&i.._b = &&stima&i * &&nome&i;
   %end;
  %if "&&util&i" = "" %then %do;	/* Intercept */
    &&nome&i.._b = &&stima&i;
   %end;
 %end;
/* Now we do the sum */
 xbeta = sum(
	%do i=1 %to &cn;
	  &&nom&i.._b,
	 %end;
	  0);
/* And we calculate the probability value */
 prob = exp(xbeta) / (1 + exp(xbeta));
run;

%mend giro;


%giro(dsin=db);
/*
NOTE: There were 1000 observations read from the data set WORK.DB.
NOTE: The data set WORK.DB2 has 1000 observations and 71 variables.
*/


/*****************************/
/* Check of calculated score */
/*****************************/

data ddbb;
set db2;
 diff = predetti - prob;
 if diff ^= 0;
run;

proc sort data=ddbb;
 by diff;
run;

proc sql;
 select min(diff) format=commax30.15 as min,
	max(diff) format=commax30.15 as max
 from ddbb;
quit;

proc delete data=ddbb;
run;

/************** OK **************/


/* We do the ROC curve for the model */
data db3;
set db2;
 if a21b = 1 then do;
   si = 1;
   no = 0;
  end;
 if a21b = 0 then do;
   si = 0;
   no = 1;
  end;
 peso = 1;
run;
/*
NOTE: There were 1000 observations read from the data set WORK.DB2.
NOTE: The data set WORK.DB3 has 1000 observations and 74 variables.
*/

%let ca = C:\ ... PATH ... \Esempio;

%ROC(in=db3, buoni=no, cattivi=si, stima=prob, peso=peso, punti_bad=alti,
     migliore=vicin_perf, alpha=1.96, cartella_out="&ca", nomefile="ROC");

The resulting ROC curve is:

Curva ROC di esempio

We close our analysis with a comparison between estimated and real cases, using the number suggested in the previous chart or the most simple 50% as threshold to determine target values from probability.


/* We try the 50% threshold */
data db4;
set db3;
 if prob > 0.5 then stim = 1;
  else stim = 0;
run;

proc freq data=db4;
 tables a21b*stim / missing norow nocol nopercent;
run;
/*
		stim
	a21b	0,	1,	Total
	0,	618,	82,	700
	1,	160,	140,	300
	Total	778	222	1000
*/


/* 29.5275% threshold */
data db4;
set db3;
 if prob > 0.295275 then stim = 1;
  else stim = 0;
run;

proc freq data=db4;
 tables a21b*stim / missing norow nocol nopercent;
run;
/*
		stim
	a21b	0,	1,	Total
	0,	511,	189,	700
	1,	75,	225,	300
	Total	586	414	1000
*/

In the first case (50% as threshold), we have a value of sensitivity equal to 140/300 = 46.7% and a specificity of 618/700 = 88.3%.
In the second case (29.5275% as threshold), the sensitivity is at 225/300 = 75% and specificity at 511/700 = 73%.


Part seven - Using table &in._passi to direct the process
When we search for a regression model, we are likely to force some variables to enter or, sometimes, the data is so big that we must stop the search after some hours and restart it some time later.
To help us in these and other cases, we can use the input parameter passi: to conclude the example we'll show how to use the output table &in._passi for that purpose.


Passo Modello
1 CL_A1B
2 CL_A1B CL_K_A3
3 CL_A1B CL_A2 CL_K_A3
4 CL_A1B CL_A2 CL_K_A3 CL_K_A4
5 CL_A1B CL_A2 CL_A6B CL_K_A3 CL_K_A4
6 CL_A13 CL_A1B CL_A2 CL_A6B CL_K_A3 CL_K_A4


In the code below, we'll create the same tables of first round input, with the addition of the &passi table, created from the output table &in._passi (shown above).


/* We assign the first round library */
libname old "C:\ ... PATH ... \Data_In";

/* We assign the actual round library */
libname new "C:\ ... PATH ... \Data_In2";


/* We copy the original dataset */
data new.db;
set old.db;
run;


/* As in the first round, we create input files */

/* We extract the names of the columns from the table */
proc contents data=old.db out=gcd3 noprint;
run;


/* We keep only some variables */
data gcd3;
set gcd3;
keep name type format formatl formatd;
run;


/* We insert the information about use of variables */
data gcd3;
set gcd3;
format utilizzo $1.;
 utilizzo = 'n';
 if compress(name) = 'a1b' then utilizzo = 'o';
 if compress(name) = 'a2' then utilizzo = 'x';
 if compress(name) = 'a3' then utilizzo = 'k';
 if compress(name) = 'a4' then utilizzo = 'k';
 if compress(name) = 'a5' then utilizzo = 'q';
 if compress(name) = 'a6b' then utilizzo = 'o';
 if compress(name) = 'a7b' then utilizzo = 'o';
 if compress(name) = 'a8' then utilizzo = 'o';
 if compress(name) = 'a9' then utilizzo = 'c';
 if compress(name) = 'a10' then utilizzo = 'c';
 if compress(name) = 'a11' then utilizzo = 'o';
 if compress(name) = 'a12' then utilizzo = 'c';
 if compress(name) = 'a13' then utilizzo = 'x';
 if compress(name) = 'a14' then utilizzo = 'c';
 if compress(name) = 'a15' then utilizzo = 'c';
 if compress(name) = 'a16' then utilizzo = 'o';
 if compress(name) = 'a17' then utilizzo = 'k';
 if compress(name) = 'a18' then utilizzo = 'c';
 if compress(name) = 'a19' then utilizzo = 'c';
 if compress(name) = 'a20' then utilizzo = 'c';
 if compress(name) = 'a21b' then utilizzo = 'r';
run;


/* We insert particular values for variables */
data spec;
 length variabile $ 50.;
 length condizione $ 1000.;
 length classe $ 20.;
  variabile = 'a1b';
  condizione = 'a1b = 4';
  classe = 'Missing';
 output;
  variabile = 'a1b';
  condizione = 'a1b = 99';
  classe = 'Anomalo';
 output;
  variabile = 'a6b';
  condizione = 'a6b = 5';
  classe = 'Missing';
 output;
  variabile = 'a6b';
  condizione = 'a6b = 99';
  classe = 'Anomalo';
 output;
  variabile = 'a7b';
  condizione = 'a7b = 99';
  classe = 'Anomalo';
 output;
run;


data escc;
 length var1 $ 50.;
 length var2 $ 50.;
 if _n_ < 1;
run;


/* Dataset with variables and their use */
data new.db_var;
set gcd3;
run;

/* Dataset with particular conditions */
data new.db_cond;
set spec;
run;

/* Dataset of conditional exclusions */
data new.db_esccon;
set escc;
run;


/* We delete temporary files */
proc delete data=gcd3 spec escc;
run;


/* We start from db_passi file (generated in output of first round) and
    we create a new file to direct the steps of the second round. */
data new.passi;
set old.db_passi;
 if passo <= 2;
run;


/* The next step of first round was the insertion of "CL_A2" column in the model.
    We'll try to insert variable "CL_K_A4" */
proc sql;
 insert into new.passi values (3, "CL_A1B CL_K_A3 CL_K_A4");
quit;

/*
N.B.: The variables in this table must not have the original name, but
      the derivated one, as described in type of variables page
 */


/* We deassign the old libraries and we assign a new one */
libname old;
libname new;
libname locale "C:\ ... PATH ... \Data_In2";


/* Now we recall the macro adding in input the parameter "passi" created before */

option mprint symbolgen mlogic;

%classizz(in=locale.db, taglio_correlazione=0.4, distribuzione=binomial, alfa=0.05, passo=15,
 log_0 = "C:\ ... PATH ... \Out_0.log",
 output_log = "C:\ ... PATH ... \Out_1.log",
 max_giri=0, max_format=$5000, simpson=1, passi=locale.passi);

option nomprint nosymbolgen nomlogic;

In this case the procedure takes in input the table locale.passi (below, on the left) and generates a flow similar to the previous one, except for third and fourth step that are inverted (below, on the right).
Note that, in this example, the final model is the same in the two executions, but normally this is not what happens: by taking two different paths we can obtain different models also with the same data.
The procedure favors the variables set in input by user, but don't force them to enter in the model if they are not statistically relevant.


Passo Modello
1 CL_A1B
2 CL_A1B CL_K_A3
3 CL_A1B CL_K_A3 CL_K_A4


Passo Modello
1 CL_A1B
2 CL_A1B CL_K_A3
3 CL_A1B CL_K_A3 CL_K_A4
4 CL_A1B CL_A2 CL_K_A3 CL_K_A4
5 CL_A1B CL_A2 CL_A6B CL_K_A3 CL_K_A4
6 CL_A13 CL_A1B CL_A2 CL_A6B CL_K_A3 CL_K_A4





  Main index     Programs index     Autoreg index  
Vai alla versione Italiana

Creation date: 17 Sep 2010
Translation date: 30 Dec 2012
Last change: 20 Apr 2013

Translation reviewed by Giulia Di Lallo