2017年3月9日 星期四

SAS macro Ensemble Learning




以bootstrap方法產生多個資料集,
分別為每個資料集建立linear regression模型,
整合多個模型降低mse。

作者:jimmy
日期:2017/3/9
%macro my_bootstrap(data = , b = , obs =);
%do bootstrap_i = 1 %to &b.;
 proc surveyselect data= &data. outall method=urs n=&obs. noprint
                out=BOOTSTRAP_out;
 run;

 data BOOTSTRAP_out&bootstrap_i;
  set BOOTSTRAP_out;
  do hit_i = 1 to NumberHits;
   output;
  end;
 run;
%end;
%mend;
/*proc reg data = mysas.red1;
 model quality = Sulphates pH  volatile_acidity Alcohol / vif all;
quit;*/

%macro my_reg(data = mysas.red1
        , testdata = mysas.red2
        , y = Quality
        , x = Sulphates pH volatile_acidity Alcohol
        , k = 4
        , alpha = 0.05
        );

%let open_d = %sysfunc (open(&data.)); 
%let obs = %sysfunc(attrn(&open_d., nobs));
/*%let p = %sysfunc(attrn(&open_d., nvars));*/
%let close_d = %sysfunc(close(&open_d.));

%let test_open_d = %sysfunc (open(&testdata.)); 
%let test_obs = %sysfunc(attrn(&test_open_d., nobs));
%let test_close_d = %sysfunc(close(&test_open_d.));

%let p = %sysevalf(&k.+1);

 %do i_name = 1 %to &k.; 
  %let x_name&i_name. =  %scan(&x., &i_name., %str( ));
/*  %let x_name_num&i_name. = %length(x_name&i_name.);*/
 %end;
/*%put x_name1 = &x_name1 x_name2 = &x_name2;*/
/*%put x_name_num1 = &x_name_num1 x_name_num2 = &x_name_num2;*/

proc iml;
 use &data.;
  read all var _all_;
 close &data.;
 y = &y.;
 x0 = {[&obs.] 1};
 %do i = 1 %to &k.; 
   x&i. = &&x_name&i.;
  xc = xc || x&i. - mean(x&i.);
  xall = xall || x&i.;
 %end;

 x_matrix = x0` || xall;

 beta_matrix = inv(x_matrix` * x_matrix) * x_matrix` *y;
 beta_matrix_no0 = beta_matrix[2:&p., ];

 h = x_matrix * inv(x_matrix` * x_matrix) * x_matrix`;
 e = (i(&obs.) - h) * y;

/*ANOVA*/
 SS_res = y`*y - beta_matrix` * x_matrix`*y;
 SS_t = y`*y - (sum(y)##2)/&obs.;
 SS_r = SS_t - SS_res;
 MS_res = SS_res/(&obs.- &p.);
 MS_r = SS_r/(&k.);
 f0 = MS_r / MS_res;
 f0_p_value = 1- probf(f0, &k., &obs.-&k.-1); 

 R_square = 1 - SS_res/SS_t;
 R_square_adj = 1 - (SS_res/(&obs.-&p.))/(SS_t/(&obs.-1));


 XX = inv(x_matrix`*x_matrix);
 se_beta_matrix_ = sqrt(ms_res#diag(xx));

 
/*beta t test ci*/
 %do j = 1 %to &p.;
  t0_&j. = beta_matrix[&j., ]/se_beta_matrix_[&j., &j.];
  if t0_&j. >=0 then t0_P_value_&j. =(1-probt(t0_&j., &obs.-&p.))#2;
   else  t0_P_value_&j. =(probt(t0_&j., &obs.-&p.))#2;

  beta_t0 = beta_t0 // t0_&j.;
  beta_t_P_value = beta_t_P_value // t0_P_value_&j.;

  beta_matrix_ci_a_&j. = beta_matrix[&j., ] - tinv((&alpha.)/2, &obs.-&p.) # se_beta_matrix_[&j., &j.];
  beta_matrix_ci_b_&j. = beta_matrix[&j., ] + tinv((&alpha.)/2, &obs.-&p.) # se_beta_matrix_[&j., &j.];
  beta_matrix_ci_a = beta_matrix_ci_a // beta_matrix_ci_a_&j.;
  beta_matrix_ci_b = beta_matrix_ci_b // beta_matrix_ci_b_&j.;

  se_beta_matrix = se_beta_matrix // se_beta_matrix_[&j., &j.];
 %end;
   y_hat = x_matrix* beta_matrix;



/*residual*/
 %do ii = 1 %to &obs.;
  h_ii_&ii. = h[&ii., &ii.];
  h_ii = h_ii // h_ii_&ii.;
 %end;
 /* e = (i(&obs.) - h) * y;*/
 standardized_residual = e/sqrt(ms_res);
 Studentized_residual = e/sqrt(ms_res#(1 - h_ii));
 s2_i = ((&obs.-&p.) # MS_res - (e##2/(1-h_ii))) / (&obs. - &p. -1);
 R_student = e/sqrt(s2_i#(1-h_ii));
 
/*leverage influence*/
/* h_ii  > 2P/n
 cook_s_d >p/n
 |dffits| >2(p/n)^0.5*/

 cook_s_d = ((Studentized_residual##2)/&p.) # (h_ii/(1-h_ii));
 dffits = sqrt(h_ii/(1-h_ii)) # R_student;


 print SS_r SS_res SS_t ms_r ms_res f0 f0_p_value R_square R_square_adj;
 print beta_matrix se_beta_matrix beta_matrix_ci_a beta_matrix_ci_b beta_t0 beta_t_P_value;
 
 print xx;
 /*print h_ii cook_s_d dffits;
 print e standardized_residual Studentized_residual R_student;

 create output_yr var {y y_hat e}; 
 append;       
 close output_yr;
*/



/*vif  
         Sulphates pH  volatile_acidity Alcohol*/
/* %put x_name1 = &x_name1 x_name2 = &x_name2;*/

%do ai = 1 %to &k.;
 vif_y&ai. = &&x_name&ai.;
 %do vi = 1 %to &k.;                        /*1 = 2 3 4     2 = 1 3 4      3 = 1 2 4       4 = 1 2 3 */
  %if  &ai. ^= &vi. %then %do;
    vif_x&vi. = &&x_name&vi.;
   vif_xc&ai. = vif_xc&ai. || vif_x&vi. - mean(vif_x&vi.);
   vif_xall&ai. = vif_xall&ai. || vif_x&vi.;
  %end;
 %end;


 vif_x_matrix&ai. = x0` || vif_xall&ai.;

 vif_beta_matrix&ai. = inv(vif_x_matrix&ai.` * vif_x_matrix&ai.) * vif_x_matrix&ai.` *vif_y&ai.;

 vif_SS_res&ai. = vif_y&ai.`*vif_y&ai. - vif_beta_matrix&ai.` * vif_x_matrix&ai.`*vif_y&ai.;
 vif_SS_t&ai. = vif_y&ai.`*vif_y&ai. - (sum(vif_y&ai.)##2)/&obs.;

 vif_R_square&ai. = 1 - (vif_SS_res&ai./vif_SS_t&ai.);
 VIF_&&x_name&ai. = 1/(1-vif_R_square&ai.);
 print VIF_&&x_name&ai.;
%end;

/*pred
 new_x = {1 8 275};
 new_y_hat = new_x*beta_matrix;
 new_y_hat_ci_a = new_y_hat - tinv((&alpha.)/2, &obs.-&p.) # sqrt(ms_res#(1+new_x`*(XX)*new_x));
 new_y_hat_ci_b = new_y_hat + tinv((&alpha.)/2, &obs.-&p.) # sqrt(ms_res#(1+new_x`*(XX)*new_x));
*/
 use &testdata.;
  read all var _all_;
 close &testdata.;
 test_y = &y.;
 x0 = {[&test_obs.] 1};
 %do test_i = 1 %to &k.; 
  test_x&test_i. = &&x_name&test_i.;
  test_xall = test_xall || test_x&test_i.;
 %end;
 test_x_matrix = x0`||test_xall;
 test_y_pred = test_x_matrix *beta_matrix;
 create output_test_y var {test_y test_y_pred}; 
 append;       
 close output_test_y;
/* print test_y_pred;*/


quit;

/*proc sgplot data = output_yr;*/
/* scatter x = y_hat y = e;*/
/*run;*/
/**/
/*proc sgplot data = output_yr;*/
/* scatter x = y_hat y = R_student;*/
/*run;*/
/**/
/*proc sgplot data = output_yr;*/
/* scatter x = h_ii y = R_student;*/
/*run;*/
/**/
/*proc univariate data=output_yr noprint;*/
/*   qqplot e;*/
/*run;*/
/**/



%mend;

%my_reg()


%macro testbo(data = mysas.red1, testdata = mysas.red2, b = 500, obs = 999 , testobs= 600);
proc datasets noprint;
 delete all_test_y;
quit;

data all_test_y;
run;

%my_bootstrap(data = mysas.red1 , b = &b., obs = &obs.)
%do bost = 1 %to &b.;
%my_reg(data = BOOTSTRAP_out&bost. 
        , testdata = &testdata.
        , y = Quality
        , x =Sulphates pH  volatile_acidity Alcohol
        , k = 4
        , alpha =0.05)

data output_test_y&bost.;
 set output_test_y;
run; 

data all_test_y;
 set output_test_y&bost. all_test_y;
run;

proc datasets noprint;
 delete BOOTSTRAP_out&bost. output_test_y&bost.;
quit;
%end;

data mean_test_y ;
 set all_test_y;
 %do mean_i = 1 %to &testobs.;
   if mod(_n_, &testobs.) = &mean_i.-1 then TEST_Y_PRED&mean_i. = TEST_Y_PRED;
/*   sum_y_pred&mean_i. +TEST_Y_PRED&mean_i.;*/
/*   mean_y_pred&mean_i. = sum_y_pred&mean_i./&b.-1;*/
 %end;
run;



proc iml;
 use mean_test_y;
  read all var _all_;
 close mean_test_y;
 %do pred_i = 1 %to &testobs.;
  mean_pred&pred_i. = mean(TEST_Y_PRED&pred_i.);
  mean_pred_ = mean_pred_ // mean_pred&pred_i. ;
 %end;
 r_mean_pred_ =mean_pred_[2:&testobs., ]// mean_pred_[1, ];
 create mean_y_pred var {mean_pred_ r_mean_pred_}; 
 append;       
 close mean_y_pred;
quit;

data mean_y_pred_testdata;
 merge &testdata. mean_y_pred;
 drop mean_pred_;
run;
 /* 
data mean_test_y2;
 set mean_test_y;
 if _n_ = (&b.-1) * &testobs.;
 keep mean_y_pred1 - mean_y_pred&testobs.;
run;

proc transpose data=mean_test_y2 out=mean_test_y2;
    var mean_y_pred1 - mean_y_pred&testobs.;
run; 

data y_pred;
 merge &testdata. mean_test_y2;
run;*/

%mend;


%testbo()







0 意見:

張貼留言