Part one - Loading and fast data analysis
First of all, we create the input sas table from
data file.
filename stlog 'C:\ ... PATH ... \german.data'; data gcd; infile stlog; input a1 $ a2 a3 $ a4 $ a5 a6 $ a7 $ a8 a9 $ a10 $ a11 a12 $ a13 a14 $ a15 $ a16 a17 $ a18 a19 $ a20 $ a21; run; /* NOTE: 1000 records were read from the infile STLOG. The minimum record length was 77. The maximum record length was 81. NOTE: The data set WORK.GCD has 1000 observations and 21 variables. NOTE: DATA statement used (Total process time): real time 0.14 seconds cpu time 0.04 seconds */ |
/*********** Numeric Variables ***********/ proc sql; create table r_a2 as select min(a2) as min, max(a2) as max, mean(a2) as media, var(a2) as var, count(distinct a2) as num_dist, count(a2) as num from gcd; quit; /* Duration in month: 33 distinct values on 1000 rows, min=4, max=72, mean=20.9 X variable. */ proc sql; create table r_a5 as select min(a5) as min, max(a5) as max, mean(a5) as media, var(a5) as var, count(distinct a5) as num_dist, count(a5) as num from gcd; quit; /* Credit amount: 921 distinct values on 1000 rows, min=250, max=18424, mean=3271.26 Q variable. */ proc sql; create table r_a8 as select min(a8) as min, max(a8) as max, mean(a8) as media, var(a8) as var, count(distinct a8) as num_dist, count(a8) as num from gcd; quit; title 'a8'; proc freq data=gcd notitle; tables a8 / missing; run; title; /* Installment rate in percentage of disposable income : 4 distinct values on 1000 rows, min=1, max=4, mean=2.973 O variable. */ proc sql; create table r_a11 as select min(a11) as min, max(a11) as max, mean(a11) as media, var(a11) as var, count(distinct a11) as num_dist, count(a11) as num from gcd; quit; title 'a11'; proc freq data=gcd notitle; tables a11 / missing; run; title; /* Present residence since : 4 distinct values on 1000 rows, min=1, max=4, mean=2.845 O variable. */ proc sql; create table r_a13 as select min(a13) as min, max(a13) as max, mean(a13) as media, var(a13) as var, count(distinct a13) as num_dist, count(a13) as num from gcd; quit; /* Age in years: 53 distinct values on 1000 rows, min=19, max=75, mean=35.5 X variable. */ proc sql; create table r_a16 as select min(a16) as min, max(a16) as max, mean(a16) as media, var(a16) as var, count(distinct a16) as num_dist, count(a16) as num from gcd; quit; title 'a16'; proc freq data=gcd notitle; tables a16 / missing; run; title; /* Number of existing credits at this bank: 4 distinct values on 1000 rows, min=1, max=4, mean=1.4 O variable. */ proc sql; create table r_a18 as select min(a18) as min, max(a18) as max, mean(a18) as media, var(a18) as var, count(distinct a18) as num_dist, count(a18) as num from gcd; quit; title 'a18'; proc freq data=gcd notitle; tables a18 / missing; run; title; /* Number of people being liable to provide maintenance for: 2 distinct values on 1000 rows, min=1, max=2, mean=1.155 C variable. */ proc sql; create table r_a21 as select min(a21) as min, max(a21) as max, mean(a21) as media, var(a21) as var, count(distinct a21) as num_dist, count(a21) as num from gcd; quit; /* Target variable (1=Good, 2=Bad): 2 distinct values on 1000 rows, min=1, max=2, mean=1.3 R variable. */ /*********** Character Variables ***********/ proc sql; create table r2_a1 as select a1, count(*) as num from gcd group by a1; quit; /* Status of existing checking account: Num: OK NOTE: Table WORK.R2_A1 created, with 4 rows and 2 columns. O variable (to convert in numeric format and insert A14 as particular value). */ proc sql; create table r2_a3 as select a3, count(*) as num from gcd group by a3; quit; /* Credit history: Num: OK NOTE: Table WORK.R2_A3 created, with 5 rows and 2 columns. K variable. */ proc sql; create table r2_a4 as select a4, count(*) as num from gcd group by a4; quit; /* Purpose: Num: Some low frequency NOTE: Table WORK.R2_A4 created, with 10 rows and 2 columns. K variable. */ proc sql; create table r2_a6 as select a6, count(*) as num from gcd group by a6; quit; /* Savings account/bonds: Num: OK NOTE: Table WORK.R2_A6 created, with 5 rows and 2 columns. O variable (to convert in numeric format and insert A65 as particular value). */ proc sql; create table r2_a7 as select a7, count(*) as num from gcd group by a7; quit; /* Present employment since: Num: OK NOTE: Table WORK.R2_A7 created, with 5 rows and 2 columns. O variable (we must convert it in numeric format). */ proc sql; create table r2_a9 as select a9, count(*) as num from gcd group by a9; quit; /* Personal status and sex: Num: OK NOTE: Table WORK.R2_A9 created, with 4 rows and 2 columns. C variable. */ proc sql; create table r2_a10 as select a10, count(*) as num from gcd group by a10; quit; /* Other debtors/guarantors: Num: OK NOTE: Table WORK.R2_A10 created, with 3 rows and 2 columns. C variable. */ proc sql; create table r2_a12 as select a12, count(*) as num from gcd group by a12; quit; /* Property: Num: OK NOTE: Table WORK.R2_A12 created, with 4 rows and 2 columns. C variable. */ proc sql; create table r2_a14 as select a14, count(*) as num from gcd group by a14; quit; /* Other installment plans: Num: OK NOTE: Table WORK.R2_A14 created, with 3 rows and 2 columns. C variable. */ proc sql; create table r2_a15 as select a15, count(*) as num from gcd group by a15; quit; /* Housing: Num: OK NOTE: Table WORK.R2_A15 created, with 3 rows and 2 columns. C variable. */ proc sql; create table r2_a17 as select a17, count(*) as num from gcd group by a17; quit; /* Job: Num: A171 with only 22 rows, OK all the other classes NOTE: Table WORK.R2_A17 created, with 4 rows and 2 columns. K variable. */ proc sql; create table r2_a19 as select a19, count(*) as num from gcd group by a19; quit; /* Telephone: Num: OK NOTE: Table WORK.R2_A19 created, with 2 rows and 2 columns. C variable. */ proc sql; create table r2_a20 as select a20, count(*) as num from gcd group by a20; quit; /* Foreign worker: Num: OK NOTE: Table WORK.R2_A20 created, with 2 rows and 2 columns. C variable. */ |
Part two - Modifying data
We proceed with changes to O and R variables.
data gcd2; set gcd; /* We convert a1 (O) in numeric variable */ if compress(upcase(a1)) = 'A11' then a1b = 1; else if compress(upcase(a1)) = 'A12' then a1b = 2; else if compress(upcase(a1)) = 'A13' then a1b = 3; else if compress(upcase(a1)) = 'A14' then a1b = 4; else a1b = 99; /* We convert a6 (O) in numeric variable */ if compress(upcase(a6)) = 'A61' then a6b = 1; else if compress(upcase(a6)) = 'A62' then a6b = 2; else if compress(upcase(a6)) = 'A63' then a6b = 3; else if compress(upcase(a6)) = 'A64' then a6b = 4; else if compress(upcase(a6)) = 'A65' then a6b = 5; else a6b = 99; /* We convert a7 (O) in numeric variable */ if compress(upcase(a7)) = 'A71' then a7b = 1; else if compress(upcase(a7)) = 'A72' then a7b = 2; else if compress(upcase(a7)) = 'A73' then a7b = 3; else if compress(upcase(a7)) = 'A74' then a7b = 4; else if compress(upcase(a7)) = 'A75' then a7b = 5; else a7b = 99; /* We convert a21 (R) in dummy variable */ if a21 = 1 /* Good */ then a21b = 0; else if a21 = 2 /* Bad */ then a21b = 1; run; |
Part three - Creation of input data
Based on previous dataset, we create the tables required to the procedure.
/* We extract the column names from the table */ proc contents data=gcd2 out=gcd3 noprint; run; /* We keep only a part of the output */ data gcd3; set gcd3; keep name type format formatl formatd; run; /* We add the column concerning the use of the variable */ data gcd3; set gcd3; format utilizzo $1.; utilizzo = 'n'; /* We value the new column with our choice */ if compress(name) = 'a1b' then utilizzo = 'o'; if compress(name) = 'a2' then utilizzo = 'x'; if compress(name) = 'a3' then utilizzo = 'k'; if compress(name) = 'a4' then utilizzo = 'k'; if compress(name) = 'a5' then utilizzo = 'q'; if compress(name) = 'a6b' then utilizzo = 'o'; if compress(name) = 'a7b' then utilizzo = 'o'; if compress(name) = 'a8' then utilizzo = 'o'; if compress(name) = 'a9' then utilizzo = 'c'; if compress(name) = 'a10' then utilizzo = 'c'; if compress(name) = 'a11' then utilizzo = 'o'; if compress(name) = 'a12' then utilizzo = 'c'; if compress(name) = 'a13' then utilizzo = 'x'; if compress(name) = 'a14' then utilizzo = 'c'; if compress(name) = 'a15' then utilizzo = 'c'; if compress(name) = 'a16' then utilizzo = 'o'; if compress(name) = 'a17' then utilizzo = 'k'; if compress(name) = 'a18' then utilizzo = 'c'; if compress(name) = 'a19' then utilizzo = 'c'; if compress(name) = 'a20' then utilizzo = 'c'; if compress(name) = 'a21b' then utilizzo = 'r'; run; /* Here we insert the particular values for variables */ data spec; length variabile $ 50.; length condizione $ 1000.; length classe $ 20.; variabile = 'a1b'; condizione = 'a1b = 4'; classe = 'Missing'; output; variabile = 'a1b'; condizione = 'a1b = 99'; classe = 'Anomalo'; output; variabile = 'a6b'; condizione = 'a6b = 5'; classe = 'Missing'; output; variabile = 'a6b'; condizione = 'a6b = 99'; classe = 'Anomalo'; output; variabile = 'a7b'; condizione = 'a7b = 99'; classe = 'Anomalo'; output; run; data escc; length var1 $ 50.; length var2 $ 50.; if _n_ < 1; run; /* We create the execution library */ libname loc "C:\ ... PATH ... \Data_In"; /* And we insert input files */ /* Data file */ data loc.db; set gcd2; run; /* File with type of variables */ data loc.db_var; set gcd3; run; /* Particular conditions file */ data loc.db_cond; set spec; run; /* Conditional exclusions file */ data loc.db_esccon; set escc; run; |
Part four - The call
After running all of the three macro definitions (simpson_c, mod_b_meno_a
and classizz), we make the call.
We want to have the possibility to observe the single steps of the process, so we included in the code
below some options useful for debugging (mprint, symbolgen and mlogic)
and wrote the two logs generated into two distinct text files.
The remaining input parameters are entered according to our intentions.
option mprint symbolgen mlogic; %classizz(in=loc.db, taglio_correlazione=0.4, distribuzione=binomial, alfa=0.05, passo=15, log_0 = "C:\ ... PATH ... \Log_Out\Out_0.log", output_log = "C:\ ... PATH ... \Log_Out\Out_1.log", max_giri=0, max_format=$5000, simpson=1); option nomprint nosymbolgen nomlogic; |
Part five - The log and the output
The two logs generated are too long (over 3,000 rows and over 300,000, respectively),
so they are not reported here.
On a more detailed analysis, you can have a closer look at the procedure.
Even if you are not interested in such analysis, we recommend that you search into the logs
for errors or warnings: the only
anomalies should be those related to some sequence of invalid macro variables as a consequence of tables
with no rows (see example below).
MLOGIC(CLASSIZZ): %LET (variable name is NUORI) SYMBOLGEN: Macro variable NUORI resolves to 0 MPRINT(CLASSIZZ): proc sql noprint; SYMBOLGEN: Macro variable V_INSERITA resolves to CL_A1B SYMBOLGEN: Macro variable NUORI resolves to 0 SYMBOLGEN: Macro variable NUORI resolves to 0 SYMBOLGEN: Macro variable NUORI resolves to 0 SYMBOLGEN: Macro variable IN resolves to loc.db MPRINT(CLASSIZZ): select CL_A1B_c, minimo, massimo into :nuov1 - :nuov0, :nuomi1 - :nuomi0, :nuoma1 - :nuoma0 from loc.db_nuoco2; WARNING: INTO Clause :nuov1 thru :nuov0 does not specify a valid sequence of macro variables. WARNING: INTO Clause :nuomi1 thru :nuomi0 does not specify a valid sequence of macro variables. WARNING: INTO Clause :nuoma1 thru :nuoma0 does not specify a valid sequence of macro variables. NOTE: No rows were selected. MPRINT(CLASSIZZ): quit; NOTE: PROCEDURE SQL used (Total process time): real time 0.00 seconds cpu time 0.00 seconds |
|
|
|
|
Part Six - Possible use of output files: check the accuracy of the model
The tables described above (and the others output datasets)
can be used to observe the model found or to apply it to new data.
In the code below, we present one of their possible uses: in particular we'll use table
&in._zgri in order to apply the model found on a
dataset (to make it easier, it will be the same dataset used in estimation)
in order to evaluate accuracy of our predictions.
This test will be supported by the
ROC curve
of the estimations: the macro that generates it, will not be shown here (try and write it down!).
/* We copy files on work library */ libname locale "C:\ ... PATH ... \Data_In"; data db; set locale.db_dcorr; run; data zgri; set locale.db_zgri; run; libname locale; /* We take the coefficients of the variables */ proc sql noprint; select count(*) into :c from zgri; quit; proc sql noprint; select count(distinct nome) into :cn from zgri; quit; %let c = &c; %let cn = &cn; proc sql noprint; select nome, estimate, condizione, utilizzo into :nome1 - :nome&c, :stima1 - :stima&c, :cond1 - :cond&c, :util1 - :util&c from zgri; quit; proc sql noprint; select distinct nome into :nom1 - :nom&cn from zgri; quit; %macro giro(dsin=db); data &dsin.2; set &dsin; /* Inizialization */ %do i=1 %to &cn; &&nom&i.._b = 0; %end; /* Valorization --> N.B.: particulars conditions overwrite normal conditions */ %do i=1 %to &c; %if "&&util&i" = "X" or "&&util&i" = "O" or "&&util&i" = "C" or "&&util&i" = "K" %then %do; if &&cond&i then &&nome&i.._b = &&stima&i; %end; %if "&&util&i" = "Q" %then %do; &&nome&i.._b = &&stima&i * &&nome&i; %end; %if "&&util&i" = "" %then %do; /* Intercept */ &&nome&i.._b = &&stima&i; %end; %end; /* Now we do the sum */ xbeta = sum( %do i=1 %to &cn; &&nom&i.._b, %end; 0); /* And we calculate the probability value */ prob = exp(xbeta) / (1 + exp(xbeta)); run; %mend giro; %giro(dsin=db); /* NOTE: There were 1000 observations read from the data set WORK.DB. NOTE: The data set WORK.DB2 has 1000 observations and 71 variables. */ /*****************************/ /* Check of calculated score */ /*****************************/ data ddbb; set db2; diff = predetti - prob; if diff ^= 0; run; proc sort data=ddbb; by diff; run; proc sql; select min(diff) format=commax30.15 as min, max(diff) format=commax30.15 as max from ddbb; quit; proc delete data=ddbb; run; /************** OK **************/ /* We do the ROC curve for the model */ data db3; set db2; if a21b = 1 then do; si = 1; no = 0; end; if a21b = 0 then do; si = 0; no = 1; end; peso = 1; run; /* NOTE: There were 1000 observations read from the data set WORK.DB2. NOTE: The data set WORK.DB3 has 1000 observations and 74 variables. */ %let ca = C:\ ... PATH ... \Esempio; %ROC(in=db3, buoni=no, cattivi=si, stima=prob, peso=peso, punti_bad=alti, migliore=vicin_perf, alpha=1.96, cartella_out="&ca", nomefile="ROC"); |
/* We try the 50% threshold */ data db4; set db3; if prob > 0.5 then stim = 1; else stim = 0; run; proc freq data=db4; tables a21b*stim / missing norow nocol nopercent; run; /* stim a21b 0, 1, Total 0, 618, 82, 700 1, 160, 140, 300 Total 778 222 1000 */ /* 29.5275% threshold */ data db4; set db3; if prob > 0.295275 then stim = 1; else stim = 0; run; proc freq data=db4; tables a21b*stim / missing norow nocol nopercent; run; /* stim a21b 0, 1, Total 0, 511, 189, 700 1, 75, 225, 300 Total 586 414 1000 */ |
Part seven - Using table &in._passi to direct the process
When we search for a regression model, we are likely to force some variables to enter or,
sometimes, the data is so big that we must stop the search after some hours and restart it
some time later.
To help us in these and other cases, we can use the input parameter passi:
to conclude the example we'll show how to use the output table
&in._passi for that purpose.
|
/* We assign the first round library */ libname old "C:\ ... PATH ... \Data_In"; /* We assign the actual round library */ libname new "C:\ ... PATH ... \Data_In2"; /* We copy the original dataset */ data new.db; set old.db; run; /* As in the first round, we create input files */ /* We extract the names of the columns from the table */ proc contents data=old.db out=gcd3 noprint; run; /* We keep only some variables */ data gcd3; set gcd3; keep name type format formatl formatd; run; /* We insert the information about use of variables */ data gcd3; set gcd3; format utilizzo $1.; utilizzo = 'n'; if compress(name) = 'a1b' then utilizzo = 'o'; if compress(name) = 'a2' then utilizzo = 'x'; if compress(name) = 'a3' then utilizzo = 'k'; if compress(name) = 'a4' then utilizzo = 'k'; if compress(name) = 'a5' then utilizzo = 'q'; if compress(name) = 'a6b' then utilizzo = 'o'; if compress(name) = 'a7b' then utilizzo = 'o'; if compress(name) = 'a8' then utilizzo = 'o'; if compress(name) = 'a9' then utilizzo = 'c'; if compress(name) = 'a10' then utilizzo = 'c'; if compress(name) = 'a11' then utilizzo = 'o'; if compress(name) = 'a12' then utilizzo = 'c'; if compress(name) = 'a13' then utilizzo = 'x'; if compress(name) = 'a14' then utilizzo = 'c'; if compress(name) = 'a15' then utilizzo = 'c'; if compress(name) = 'a16' then utilizzo = 'o'; if compress(name) = 'a17' then utilizzo = 'k'; if compress(name) = 'a18' then utilizzo = 'c'; if compress(name) = 'a19' then utilizzo = 'c'; if compress(name) = 'a20' then utilizzo = 'c'; if compress(name) = 'a21b' then utilizzo = 'r'; run; /* We insert particular values for variables */ data spec; length variabile $ 50.; length condizione $ 1000.; length classe $ 20.; variabile = 'a1b'; condizione = 'a1b = 4'; classe = 'Missing'; output; variabile = 'a1b'; condizione = 'a1b = 99'; classe = 'Anomalo'; output; variabile = 'a6b'; condizione = 'a6b = 5'; classe = 'Missing'; output; variabile = 'a6b'; condizione = 'a6b = 99'; classe = 'Anomalo'; output; variabile = 'a7b'; condizione = 'a7b = 99'; classe = 'Anomalo'; output; run; data escc; length var1 $ 50.; length var2 $ 50.; if _n_ < 1; run; /* Dataset with variables and their use */ data new.db_var; set gcd3; run; /* Dataset with particular conditions */ data new.db_cond; set spec; run; /* Dataset of conditional exclusions */ data new.db_esccon; set escc; run; /* We delete temporary files */ proc delete data=gcd3 spec escc; run; /* We start from db_passi file (generated in output of first round) and we create a new file to direct the steps of the second round. */ data new.passi; set old.db_passi; if passo <= 2; run; /* The next step of first round was the insertion of "CL_A2" column in the model. We'll try to insert variable "CL_K_A4" */ proc sql; insert into new.passi values (3, "CL_A1B CL_K_A3 CL_K_A4"); quit; /* N.B.: The variables in this table must not have the original name, but the derivated one, as described in type of variables page */ /* We deassign the old libraries and we assign a new one */ libname old; libname new; libname locale "C:\ ... PATH ... \Data_In2"; /* Now we recall the macro adding in input the parameter "passi" created before */ option mprint symbolgen mlogic; %classizz(in=locale.db, taglio_correlazione=0.4, distribuzione=binomial, alfa=0.05, passo=15, log_0 = "C:\ ... PATH ... \Out_0.log", output_log = "C:\ ... PATH ... \Out_1.log", max_giri=0, max_format=$5000, simpson=1, passi=locale.passi); option nomprint nosymbolgen nomlogic; |
|
|
Main index | Programs index | Autoreg index |
Vai alla versione Italiana |
Creation date: 17 Sep 2010
Translation date: 30 Dec 2012
Last change: 20 Apr 2013
Translation reviewed by
Giulia Di Lallo