2019年7月9日 星期二

SAS MACRO 初步檢查文字變數內是否有亂碼

%macro Garbled(DS=,VAR=,VARn=);
/*===================================================================
作者:JIMMY
最後修改:20190712
DS:輸入要檢查的DATASET
VAR:輸入該DATASET 內要檢查的所有文字變數      | 分隔   如: X | Y
VARn:輸入了多少個VAR
此程式會對DS的VAR進行COMPRESSKEEP英文與數字,然後與COMPRESSKEEP前的DS比對,如壓縮前後不一樣則存到WORK.VAR(變數名稱)
===================================================================*/

%do i = 1 %to  &VARn.; 
    %let VAR&i. = %qscan(%bquote(&VAR.),&i.,%str(|)); 
 data tmp_DS;
  set &DS.;
   &&VAR&i.. = compress(&&VAR&i.. ,'1234567890',"kfs");
    n=_n_;
 run;
 
 data &&VAR&i..  (keep= &&VAR&i..   compressGarbled   n);
  merge &DS.   tmp_DS  (rename=(&&VAR&i.. = compressGarbled));
  if  &&VAR&i..^=compressGarbled;
 run;

 %let open_d = %sysfunc (open( &&VAR&i.. )); 
 %let tmpds = %sysfunc(attrn(&open_d., nobs));
 %let close_d = %sysfunc(close(&open_d.));
 %if &tmpds. < 1 %then %do;
  proc datasets lib=work  mtype=(data view) nolist nowarn;
  delete &&VAR&i.. ;
  run;quit;
 %end;

%end;
proc datasets lib=work  mtype=(data view) nolist nowarn;
delete tmp_DS;
run;quit;

%mend;
%Garbled(DS=test,VAR=X | Y, VARn=2)

例子:
建立一個有亂碼的DATASET
X長度只有1,而DATALINES內有中文的話勢必會變成亂碼
data test ;
length x $ 1.  ;
input X $ ;
Y= 'A';
datalines;
a
b
1

;
run;
%Garbled(DS=test,VAR=X | Y, VARn=2)

執行後,會產生以變數名稱命名的WORK DATASET,如該變數無亂碼則該DATASET不產生(被刪除)。

Read More

2019年7月8日 星期一

SAS DATASET 透過 ODS 產生EXCEL

ods _all_ close;
ods excel file="E:\jimmy\excel.xlsx"   options (sheet_name="sheet1") STYLE= HTMLBLUE ;
proc print data=MST_PREP.ACCOUNT_TRANSACTIONS;
run;
ods excel options(sheet_name="sheet2");
proc print data=MST_PREP.ACCOUNT_TRANSACTIONS;
run;
ods excel close;



紅框處視需求改寫成MACRO變數,就可以自動OUTPUT了。
STYLE= 可參考下圖改變OUTPUT的格式。
ODS OUTPUT 也可參考另外一篇(連結)。


Read More

2019年4月9日 星期二

偵測多個SAS LOG狀態

前半段MACRO來源:识别文件夹下指定文件类型及名称

將所有偵測LOG後的結果存入DB,

目前設定:


一、執行它時,[SAS_JOB_log]會先清空,會抓
"F:\SASWork\SAS\Config\Lev1\SASApp\BatchServer\Logs\DB_LOG\"下的LOG檔名稱包含"AGP.%" "C2C.%" "L2C.%" "LCK.%" '%VA_Report%'  '%WATCH%'LOG,會先抓取今天日期的LOG,如果沒抓到就會抓前一天的LOG,最多到前天。

二、那些LOG要寫入DB
會先判斷ERROR_n1, ERROR_n2, ERROR_n2
n1:該行出現 "ERROR:" 且不出現"MPRINT" '%put ERROR:'
n2: 該行出現了"ERROR: SAS ended due to errors" 或是 "ERROR: Errors        printed on page"
n3:該行出現了NOTE: The SAS System used

三、寫到資料庫[SAS_JOB_log]的欄位[log] ,[job_nm] ,[status] ,[date]
[log]:LOG檔該行的LOG訊息。
[job_nm]:LOG檔的完整名稱。
[date]:執行當下的日期。
[status]:如最下面三張圖
如果該JOB除現ERROR: status就會記錄ABORT, 如果LOG訊息有多行在DB就會記錄多行如圖: (n1-8n2行寫入DB,如果沒有n2就寫入n1上下8)

結果:
AGP.01.battest_07MAR19134632.2190001.log



[SAS_JOB_log]




設定LOG路徑
與需要納入的LOG檔案名稱


建立一個暫存的資料夾存放LOG,因為可能有正在執行中的SAS程式,
其LOG會鎖住,不能讀取。
所以將全部LOG複製新的一份到暫存進行偵測。

最後再把所有暫存的LOG刪除。





 %macro sas_job_log();
%MACRO FIND_SAS(DIRNAME,TYPE);/*参数有两个:路径,文件类型后缀*/
LIBNAME SASJOB "&SASJOB_PATH";
%PUT %STR(----------->DIRNAME=&DIRNAME);
%PUT %STR(----------->TYPE=&TYPE);
DATA SASJOB.DIRFILES; 
RC=FILENAME("DIR","&DIRNAME");/*把&DIRNAME值传给文件引用符“DIR"*/ 
OPENFILE=DOPEN("DIR");/*得到路径标示符OPENFILE,DOPEN是打开directory的sas内置函数*/
IF OPENFILE>0 THEN DO;/*如果OPENFILE>0表示正确打开路径*/ 
NUMMEM=DNUM(OPENFILE);/*得到路径标示符OPENFILE中member的个数nummem*/ 
DO II=1 TO NUMMEM; 
NAME=DREAD(OPENFILE,II);/*用DREAD依次读取每个文件的名字到NAME*/

FILEPATH="&DIRNAME"||NAME;
fnum=COMPRESS(NAME,".xml","");
fnum=COMPRESS(fnum,"user_case","");

OUTPUT;/*依次输出*/
END;
END;
KEEP NAME filepath fnum;/*只保留filepath、fnum列*/
RUN;

PROC SORT DATA=SASJOB.DIRFILES;/*按照NAME排序*/
BY DESCENDING NAME;
%IF &TYPE^=ALL %THEN %DO;/*是否过滤特定的文件类型&TYPE*/ 
WHERE INDEX(UPCASE(NAME),UPCASE(".&TYPE"));/*Y,则通过检索NAME是否包含&TYPE*/
%END;
RUN;

/*PROC PRINT DATA=SASJOB.DIRFILES;*/
/*RUN;*/

%MEND FIND_SAS;

%MACRO DIR(SASJOB_PATH,TYPE,DATASET_NUM);
LIBNAME SASJOB "&SASJOB_PATH";
%LET SASJOB_PATH=&SASJOB_PATH.;
%FIND_SAS(&SASJOB_PATH,&TYPE);

DATA TEMP(KEEP=NAME_SAS NAME_LOG);
SET SASJOB.DIRFILES;
NAME_SAS=COMPRESS(NAME);
NAME_LOG=COMPRESS(TRANWRD(NAME,'.sas','.log'),);
IF NAME_SAS ^='dirfiles.sas7bdat' AND NAME_SAS ^='dirfiles.sas7bdat.lck';



RUN;
PROC SORT DATA=TEMP OUT=SASJOB.DIRFILES;
BY NAME_SAS;
RUN;

data _null_;
 td_p  =  today();
 td_1_p=today()-1;
 td = put(td_p, date7.);
 td_1 = put(td_1_p, date7.);
 call symputx("td",td);
 call symputx("td_1",td_1);
run;

data SASJOB.DIRFILES;
 set SASJOB.DIRFILES;
      x_obs=1;
 if  find(NAME_LOG , "&td.") >0    then do;
  date = input("&td.", date9.);
  Real_job_nm= SUBSTR(NAME_SAS,1, 14);
  output;
 end;
/*    前1天*/
 if  find(NAME_LOG , "&td_1.") >0  then do;
  date = input("&td_1.", date9.);
    Real_job_nm= SUBSTR(NAME_SAS,1, 14);
  output;
 end;
                  
run;

data tmp_X_obs;
 x_obs = 1;
run;
proc append base = SASJOB.DIRFILES   data =tmp_X_obs;
run;

%let open_d = %sysfunc (open(SASJOB.DIRFILES)); 
%let DIRFILES_n = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));

%if  &DIRFILES_n. =1 %then %do;
 data SASJOB.DIRFILES;
  set TEMP;
  where  NAME_LOG  ?  "&td_1."  
                    and  (NAME_LOG like "AGP.%"  or   NAME_LOG like "C2C.%"
         or   NAME_LOG like "L2C.%"  or   NAME_LOG like "LCK.%"   
      or  NAME_LOG  like '%VA_Report%'  or   NAME_LOG like '%WATCH%');
 run;
%end;
 
 
%MEND DIR;
%DIR(D:\SAS\Config\Lev1\SASApp\BatchServer\Logs,log,0);


%x_fcf_twn_ini(log=T,ea=Y);
%fcf_get_runasofdate;

 
proc sql noprint;
 select distinct job_nm 
 into : lag_job_nm  separated by "|" 
 from sasjob.tmp_lag_log
 where status in("NO CONNECTION", "ABORT");

 create table tmp_lag_job as
 select distinct job_nm
 from sasjob.tmp_lag_log
 where status in("NO CONNECTION", "ABORT");
quit;


/*%put &lag_job_nm.;   出ERROR的只要一次*/
%let open_d = %sysfunc (open(tmp_lag_job)); 
%let LAG_JOB_OBS = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));

%do L= 1 %to &LAG_JOB_OBS.;
  %let lag_job_nm&L. = %qscan(%bquote(&lag_job_nm.),&L.,%str(|)); 
 data sasjob.DIRFILES;
  set sasjob.DIRFILES;
  where  NAME_LOG ^= "&&lag_job_nm&L..";
 run;
%end;
 

data tmp_log;
 if 0 then set  seg_kc.SAS_JOB_log;
run;
 
proc datasets library=seg_kc noprint;
 delete SAS_JOB_log ;
run;

data seg_kc.SAS_JOB_log  ;
 if 0 then set tmp_log;
 if  job_nm ^= "";
run;

proc datasets library=work noprint;
 delete tmp_log;
run;





 
proc sql  noprint;
 create table tmp_job_n as
 select NAME_LOG as my_logtime
 from sasjob.DIRFILES
 where (NAME_LOG like "AGP.%"  or   NAME_LOG like "C2C.%"
        or   NAME_LOG like "L2C.%"  or   NAME_LOG like "LCK.%"   
     or  NAME_LOG  like '%VA_Report%'  or   NAME_LOG like '%WATCH%');

 select my_logtime
 into : my_logtime  separated by "\"
 from tmp_job_n;
quit;
%let open_d = %sysfunc (open(work.tmp_job_n)); 
%let job_n = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));




/*%put &my_logtime.;*/


/*%let etls_jobName=LCK.02.SOURCE_CHECK;*/
/*%let t=1;*/
/**/
/*%let etls_jobName = AGP.05.CDD_MAIN_PROCESS;*/
/*%let t=14JAN19143028.3110001;*/
/**/
/**/
/*%let etls_jobName=RERUNAGP;*/
/*%let t=1;*/
/**/
/**/
/*%put &etls_jobName._&t..log;*/
/* */

x 'md "F:\SASWork\SAS\Config\Lev1\SASApp\BatchServer\Logs\DB_LOG\"  ';
x 'xcopy  "D:\SAS\Config\Lev1\SASApp\BatchServer\Logs\*.*"      "F:\SASWork\SAS\Config\Lev1\SASApp\BatchServer\Logs\DB_LOG "/Y'   ;

%do  jon_num = 1  %to  &job_n.;

 %let jon_num&jon_num. = %scan(&my_logtime., &jon_num.,%str(\));

 data SAS_JOB_log_tmp1 ;
  infile "F:\SASWork\SAS\Config\Lev1\SASApp\BatchServer\Logs\DB_LOG\&&jon_num&jon_num.."  dsd missover dlm='09'x;
  length log1 log2 log3 $3000.;
  input log1 $  log2 $;
  retain error_n   error_n2 error_n3;
  if find(log1, "The SAS System          20") =0 ;
  if find(log1, "Create Time=20") =0 ;
  if missing(log1)=0; 
  

        if  find(log1, "ERROR:")  >0  and  find(log1, "MPRINT")=0    and find(log1, '%put ERROR:')=0       then  do ;
        log3 = log1;
           error_n=_n_;
    output;
   end;

         if    find(log1, "ERROR: SAS ended due to errors")>0   or   find(log1, "ERROR: Errors printed on page") >0     then  do ;
        log3 = log1;
           error_n2=_n_;
    output;
   end; 

    if   find(log1, "NOTE: The SAS System used:")>0 then do;
         error_n3=_n_;
      output;
    end;

 run;


%let open_d = %sysfunc (open(work.SAS_JOB_log_tmp1)); 
%let SASLOG_obs = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));
%if &SASLOG_obs. > 0 %then %do;
 data _null_;
  set SAS_JOB_log_tmp1;

  if  _n_ =  1 then do ;
    call symputx("error_n",error_n) ;
  end;

  call symputx("error_n2",error_n2);
   call symputx("error_n3",error_n3);
/*    %put  &error_n.  &error_n2.    &error_n3.;*/
 run;
 

 data SAS_JOB_log_TMP2 (  drop=log1);
  infile "F:\SASWork\SAS\Config\Lev1\SASApp\BatchServer\Logs\DB_LOG\&&jon_num&jon_num.."  dsd missover dlm='09'x;
  length log1  log  $3000.   job_nm $80.;
  input log1 $  log $ job_nm$;
  job_nm= "&&jon_num&jon_num..";
  date = input(put(&runasofdate.,8.),yymmdd8.) ;
  if find(log1, "The SAS System          20") =0;
     if find(log1, "Create Time=20") =0 ;
    if missing(log1)=0; 
  format  date yymmdd8.;
/*     if  find(log1, "NOTE: The SAS System used:")  >0   and     find(log1, "ERROR:")  >0    then  do;*/
/*    if  find(log1, 'MPRINT(SVCREG):   put "ERROR:')  =0  or   find(log1, "MPRINT(SVCREG):   put 'ERROR:")  =0 then do;*/
/*     log3="RUNOK"; */
/*     output;*/
/*    end;*/
/*  end;*/
if &error_n2.^=. then do;                  
  if  &error_n2.  >= _n_       >= &error_n. - 8    then do;              
       log = log1;
   if (find(log,"關閉一個現存的連線") >0  or find(log,"無法開啟 SQL Server 連線") >0 )   and
        (find(job_nm, "AGP.01.I")>0     or  find(job_nm, "AGP.03.I")>0     or  find(job_nm, "LCK")>0      or   find(job_nm, "L2C")>0)  
                   then do;
     status = "NO CONNECTION";
     output;
   end; else do;
      status = "ABORT";
      output;
     end;
  end;
end;  else do;
  if  &error_n.+8 >= _n_       >= &error_n. - 8   then do;        
       log = log1;
   if find(log,"關閉一個現存的連線") >0  or find(log,"無法開啟 SQL Server 連線") >0      and
         (find(job_nm, "AGP.01.I")>0     or  find(job_nm, "AGP.03.I")>0     or  find(job_nm, "LCK")>0      or   find(job_nm, "L2C")>0)  
               then do;
     status = "NO CONNECTION";
     output;
   end; else do;       
              status = "ABORT";
               output;
      end;
  end;
end;




 if _n_ = &error_n3.; 
   log = "RUNOK";
   status = "RUNOK";
   output;


run;
 %end; %else  %do;
 data SAS_JOB_log_TMP2 ;
   length job_nm $80.  ;
  log = 'RUNNING';
  status = 'RUNNING';
  job_nm= "&&jon_num&jon_num..";
  date = input(put(&runasofdate.,8.),yymmdd8.) ;

  format  date yymmdd8.;
 run;
 %end;


 %let open_d = %sysfunc (open(work.SAS_JOB_log_TMP2)); 
%let noc = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));


data SAS_JOB_log_TMP2 ;
 set SAS_JOB_log_TMP2;
 XN = _N_;
 do _n_ = 1 to  &noc.;
  if  lag(status) = "NO CONNECTION"   then  status = "NO CONNECTION";
  if  lag(status) = "ABORT"   then  status = "ABORT";
 end;
 if log = "RUNOK" and (status= "ABORT"  or  status = "NO CONNECTION") then delete;
run;


 proc sort data=SAS_JOB_log_TMP2;by descending status  ;run;

data SAS_JOB_log_TMP2 ;
 set SAS_JOB_log_TMP2;
 do _n_ = 1 to  &noc.;
  if  lag(status) = "NO CONNECTION"   then  status = "NO CONNECTION";
  if  lag(status) = "ABORT"   then  status = "ABORT";
 end;
run;
 proc sort data=SAS_JOB_log_TMP2;by   XN  ;run;





 proc append base=sasjob.tmp_lag_log  data=SAS_JOB_log_TMP2  FORCE;
 run;
 proc append base=seg_kc.SAS_JOB_log  data=SAS_JOB_log_TMP2  FORCE;
 run;
%end;

proc sort data=sasjob.tmp_lag_log nodupkey;
by job_nm;
run;
 


x ' rd "F:\SASWork\SAS\Config\Lev1\SASApp\BatchServer\Logs\DB_LOG\" /s /q';
%mend;

%sas_job_log()


Read More

2019年1月12日 星期六

比較兩個 SAS DATA 變數的屬性 MACRO

先將兩個SAS DATA的屬性output到sit_var與uat_var,把欄位屬性變成觀測值。
如下:

data DB1_DS1;
length y $ 13.;
input x y z;
format x ddmmyy.;
datalines;
1 1 1
;
run;

data DB2_DS1;
length y $ 26.;
input x y z;
format x date9.;
datalines;
1 1 1
;
run;

ods trace on / listing;
 proc contents data=DB1_DS1;
ods output Variables= sit_var;
run;
ods trace off;

ods trace on / listing;
 proc contents data=DB2_DS1;
ods output Variables= uat_var;
run;
ods trace off;

在執行 %CK_VALUE 或  proc compare
/*====================================================
DATA1     DB_A 資料集1    work.也要輸入
DATA2    DB_B 資料集2
BY  KEY
ALL_VAR 全部變數名稱 | 分隔
VAR_NUM 幾個變數
JIMMY CHEN
======================================================*/
%macro CK_VALUE(DB1_DATA=,
   DB2_DATA=,
   BY =,
   ALL_VAR = ,
   VAR_NUM=);
%let DATA1 = %scan(&DB1_DATA., 2,%str(.));
%let DATA2 = %scan(&DB2_DATA., 2,%str(.));

proc sort data = &DB1_DATA. out = DB1_REN nodupkey; by &BY.;run;
proc sort data = &DB2_DATA. out = DB2_REN nodupkey; by &BY.;run;

%do d = 1 %to 2;
 %do VAR_N = 1 %to &VAR_NUM.;
  %let VAR&VAR_N. = %scan(&ALL_VAR., &VAR_N.,%str(|));
/*  %put  &&VAR&VAR_N.. ;*/
/*  %put  &&VAR&VAR_N.._&&DATA&D..;*/
  data DB&d._REN;
   length &&VAR&VAR_N.._ck $30.;
   set DB&d._REN(rename =(&&VAR&VAR_N.. =&&VAR&VAR_N.._DB&d.));
  run;
 %end;
%end;                                  

%do VAR_N = 1 %to &VAR_NUM.;
 %let VAR&VAR_N. = %scan(&ALL_VAR., &VAR_N.,%str(|));
 data CK_&&VAR&VAR_N.. (keep =&BY. &&VAR&VAR_N.._ck);
  merge DB1_REN
     DB2_REN;
  by &BY.;
  if  &&VAR&VAR_N.._DB1 =  &&VAR&VAR_N.._DB2 then &&VAR&VAR_N.._ck = "OK";
   else  &&VAR&VAR_N.._ck = "NG";
  if missing(&&VAR&VAR_N.._DB1) then &&VAR&VAR_N.._ck = "NOT FOUND &DB1_DATA.";
  if missing(&&VAR&VAR_N.._DB2) then &&VAR&VAR_N.._ck = "NOT FOUND &DB2_DATA.";
  if missing(&&VAR&VAR_N.._DB1)  and  missing(&&VAR&VAR_N.._DB2)  then &&VAR&VAR_N.._ck = "NOT FOUND"; 

 run;
%end;


data CK_VALUE_ALL;
  merge DB1_REN
      DB2_REN;
  by &BY.;
run;

%do VAR_N = 1 %to &VAR_NUM.;
 %let VAR&VAR_N. = %scan(&ALL_VAR., &VAR_N.,%str(|));
 data  CK_VALUE_ALL;
  merge  CK_VALUE_ALL
     CK_&&VAR&VAR_N..;
    by &BY.;
 run;

 proc freq data = CK_VALUE_ALL;
  title "檢核觀測值: &&VAR&VAR_N..";
  table &&VAR&VAR_N.._ck;
 run;title;

proc print data = Ck_value_all;
 where &&VAR&VAR_N.._ck = "NG";
run;
%end;

%mend;

              
 

%CK_VALUE(DB1_DATA=work.sit_var,
  DB2_DATA=work.uat_var,
  BY=Variable,
  ALL_VAR=Type | Len| Format,
VAR_NUM=3)


結果:






Read More

2018年12月20日 星期四

ERROR: Unable to transcode data to/from UCS-2 encoding.

可試試此MACRO(連結)
或使用KCOUNT() 找中文 檢查長度夠不夠

以下舊文
















append失敗時,可能為以下情況:

1.非NULL欄位,觀測值為NULL。
2.文字欄位長度不夠,產生亂碼。
    (數值欄位也有可能,SAS大多為8,但如append到SQL DB就須注意)
3.亂碼。

不過可以先去檢查程式碼內是否有使用  TEST_VAR =  "內有中文" ,之類的引號內有中文,
要將編碼改成 UTF-8,下圖方式即可修改編碼: 將程式碼另存選擇編碼覆蓋即可!

可使用KCOUNT() 找中文

如是資料原本身就給出亂碼的話,
可試試此MACRO(連結)













以下MACRO為一筆一筆的APPEND,看哪幾筆資料會發生ERROR,再去檢查。
 

%macro Debug_append(base=,
   data= );
%let open_d = %sysfunc (open(&data.)); 
%let  obs = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));
 
%do _n = 1  %to &obs.;
 data _data_n&_n.;
  set &data.;
  if _n_ = &_n.;
 run;

 %put ==================================================================;
 %put 第 &_n. 筆觀測值異常;
 %put 1. 非NULL欄位   為  NULL;
 %put 2.文字欄位長度不夠 產生亂碼 ;
 %put 3.亂碼;
 %put ==================================================================;

 %let open_d = %sysfunc (open(_data_n&_n.)); 
 %let  obs2 = %sysfunc(attrn(&open_d., nobs));
 %let close_d = %sysfunc(close(&open_d.));
 %if &obs2. > 0 %then %do;
  proc append base=&base.  data=_data_n&_n.   force;
  run;
 %end;
%end;

%mend;

%Debug_append(base=db_core.FSC_ENTITY_WATCH_LIST_DIM_Jimmy,
   data=dailprep._0304_ETLS_NEWRECORDS22222 )
Read More

2018年12月14日 星期五

SOURCE_CHECK_2


SAS AML 的 SOURCE_CHECK 已經有資料筆數、NULL、UNIQUE與LOOKUP的檢核了

SOURCE_CHECK_2 檢核觀測值的MACRO。

範例:
data DB_A;
input Pty $ score  number;
datalines;
A 100 1
B  90 2
C  80 4
C .  .
D  50 5
F  72 6
G . .
;
run;

data DB_B;
input Pty $ score number;
datalines;
A 100 1
B 90 2
C 82 4
D 51 6
E  59 7
G . .
;
run;

%CK_VALUE(DB1_DATA=work.DB_A,
DB2_DATA=work.DB_B,
BY=Pty,
ALL_VAR=score | number,
VAR_NUM=2)
結果:
NG:兩邊DB,VAR的觀測值不一樣。
NOT FOUND 兩邊MISSING。
OK:一至。











/*====================================================
DATA1     DB_A 資料集1    work.也要輸入
DATA2    DB_B 資料集2
BY  KEY
ALL_VAR 全部變數名稱 | 分隔
VAR_NUM 幾個變數
JIMMY CHEN
======================================================*/
%macro CK_VALUE(DB1_DATA=,
   DB2_DATA=,
   BY =,
   ALL_VAR = ,
   VAR_NUM=);
%let DATA1 = %scan(&DB1_DATA., 2,%str(.));
%let DATA2 = %scan(&DB2_DATA., 2,%str(.));

proc sort data = &DB1_DATA. out = DB1_REN nodupkey; by &BY.;run;
proc sort data = &DB2_DATA. out = DB2_REN nodupkey; by &BY.;run;

%do d = 1 %to 2;
 %do VAR_N = 1 %to &VAR_NUM.;
  %let VAR&VAR_N. = %scan(&ALL_VAR., &VAR_N.,%str(|));
/*  %put  &&VAR&VAR_N.. ;*/
/*  %put  &&VAR&VAR_N.._&&DATA&D..;*/
  data DB&d._REN;
   length &&VAR&VAR_N.._ck $30.;
   set DB&d._REN(rename =(&&VAR&VAR_N.. =&&VAR&VAR_N.._DB&d.));
  run;
 %end;
%end;                                  

%do VAR_N = 1 %to &VAR_NUM.;
 %let VAR&VAR_N. = %scan(&ALL_VAR., &VAR_N.,%str(|));
 data CK_&&VAR&VAR_N.. (keep =&BY. &&VAR&VAR_N.._ck);
  merge DB1_REN
     DB2_REN;
  by &BY.;
  if  &&VAR&VAR_N.._DB1 =  &&VAR&VAR_N.._DB2 then &&VAR&VAR_N.._ck = "OK";
   else  &&VAR&VAR_N.._ck = "NG";
  if missing(&&VAR&VAR_N.._DB1) then &&VAR&VAR_N.._ck = "NOT FOUND &DB1_DATA.";
  if missing(&&VAR&VAR_N.._DB2) then &&VAR&VAR_N.._ck = "NOT FOUND &DB2_DATA.";
  if missing(&&VAR&VAR_N.._DB1)  and  missing(&&VAR&VAR_N.._DB2)  then &&VAR&VAR_N.._ck = "NOT FOUND"; 

 run;
%end;


data CK_VALUE_ALL;
  merge DB1_REN
      DB2_REN;
  by &BY.;
run;

%do VAR_N = 1 %to &VAR_NUM.;
 %let VAR&VAR_N. = %scan(&ALL_VAR., &VAR_N.,%str(|));
 data  CK_VALUE_ALL;
  merge  CK_VALUE_ALL
     CK_&&VAR&VAR_N..;
    by &BY.;
 run;

 proc freq data = CK_VALUE_ALL;
  title "檢核觀測值: &&VAR&VAR_N..";
  table &&VAR&VAR_N.._ck;
 run;title;

proc print data = Ck_value_all;
 where &&VAR&VAR_N.._ck = "NG";
run;
%end;

%mend;

%CK_VALUE(DB1_DATA=amlland.Account_trade_extract,
  DB2_DATA=keepok.account_trade_extract,
  BY=TRANSACTION_REFERENCE_NUMBER,
  ALL_VAR=PRINCIPAL_AMOUNT_IN_BASE_CCY  | ACCOUNT_NUMBER, 
  VAR_NUM=2)

     
              
 
Read More

2018年10月26日 星期五

SAS Macro Meta-Analysis 統合分析


%meta_rr
%meta_or
%meta_rd
%my_meta_norm

%ForestMacro 
%PubBias











%macro pseudo_rsq(log_lh=, log_lh_i=, k=, obs=);
proc iml;
 value = 1-((&log_lh.)/&log_lh_i.);
/* cox_snell_rsq = 1 - (&log_lh_i./&log_lh.)##(2/&obs.);*/
 title "McFadden's  R-Squared";
 print value;
 title;
quit;
%mend;
%macro rsq(data = , p=);
%let open_d = %sysfunc (open(&data.)); 
%let obs = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));
proc iml;
 use &data.;
  read all var _all_;
 close  &data.;
 SSres = sum((rri-pred)##2);
/* SSt = sum(rri##2) - &obs.#(mean(rri)##2);*/
 sst = sum((rri-mean(rri))##2);
 ADJ_R_SQ = 1 - ((ssres/(&obs.-&p.)) / (sst/(&obs.-1)));
 print   ADJ_R_SQ;
quit;
%mend;



%macro meta_rr(data = , ai = , bi = , ci = , di = ,references=);
data new_data dorp_data;
 set &data.;
 if &ai. = 0 or &bi. = 0 or &ci. = 0 or &di. = 0 then do;
  output dorp_data;
  delete;
 end;

 if ai = bi and ci = di then do;
  output dorp_data;
  delete;
 end;
 output new_data;
run;

proc iml;
 use new_data;
  read all var _all_;
 close new_data;   

%let open_d = %sysfunc (open(new_data)); 
%let obs_ = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));
%let obs_1 = %sysevalf(&obs_ - 1);
 n1 = &ai.+&ci.;
 n2 = &bi.+&di.;
 m1 = &ai.+&bi.;
 m2 = &ci.+&di.;
 N_total = &ai. +&bi.+&ci.+&di.; 
               /*99% 2.576*/
 eer = &ai. / n1;           /*98% 2.326*/
 cer = &bi. / n2;
 nnt = 1/abs(eer-cer);
 nnt_ci_l = 1/((eer-cer)+1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnt_ci_u = 1/((eer-cer)-1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnh = 1/abs(eer-cer);
 nnh_ci_l = 1/((eer-cer)+1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnh_ci_u = 1/((eer-cer)-1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));

/*RR*/
 pti = &ai./(&ai.+&ci.);
 pci = &bi./(&bi.+&di.);
 rri = pti/pci;
 ln_rri = log(rri);
 var_ln_rri = (1-pti)/(n1#pti) + (1-pci)/(n2#pci);
 wi = 1/var_ln_rri;
 rr_pooled = exp(sum(wi#log(rri))/sum(wi));
 se_ln_rr_pooled = sqrt(1/(sum(wi)));
 se_ln_rr=((1-pti)/(&ai.+&ci.)/pti+(1-pci)/(&bi.+&di.)/pci)##0.5;
 rri_ci_l = exp(log(rri)-(1.96#se_ln_rr));
 rri_ci_u = exp(log(rri)+(1.96#se_ln_rr));
 rr_pooled_ci_l = exp(log(rr_pooled)-(1.96#se_ln_rr_pooled));
 rr_pooled_ci_u = exp(log(rr_pooled)+(1.96#se_ln_rr_pooled));
 print rri rri_ci_l rri_ci_u rr_pooled rr_pooled_ci_l rr_pooled_ci_u se_ln_rr_pooled ; 

 Zrr = log(rr_pooled)/se_ln_rr_pooled; /*ho: rr=1      reject h0 if |zrr|>=1.96*/
/* zrr_pv2 = 1 - cdf("Normal", Zrr); */
 zrr_pv = (1-probnorm(abs(Zrr)))#2;
 print zrr zrr_pv;

 Qrr = sum(wi#((log(rri)-log(rr_pooled))##2));/*ho: rr1=rr2....=rr  reject h0 qrr_pv>qrr_pv_005*/
/* qrr_pv_005 = cinv(0.95, 8);*/
 qrr_pv = (1-probchi(Qrr, &obs_1.));
 print Qrr  qrr_pv;

 i2 = (Qrr - &obs_1.)/ qrr;
 print i2;

/* Random Effect */
 t_2 =  max(0, (Qrr-(&obs_1.))   /  (sum(wi)- (sum(wi##2)) /sum(wi))) ;
 wi_ = 1/(var_ln_rri+t_2);
 theta_ = sum(wi_#rri) / sum(wi_);
 se_theta_ = 1 / sqrt(sum(wi_));
 theta__ci_l = theta_ - 1.96 # se_theta_;
 theta__ci_u = theta_ + 1.96 # se_theta_;

 r_pooled_rr = exp(sum(wi_#log(rri))/sum(wi_));
 se_r_pooled_rr = 1 / sqrt(sum(wi_));
 r_pooled_rr_ci_l = r_pooled_rr - 1.96#se_r_pooled_rr;
 r_pooled_rr_ci_u = r_pooled_rr + 1.96#se_r_pooled_rr;

 print t_2 wi_ theta_ se_theta_ theta__ci_l theta__ci_u ;
 print r_pooled_rr se_r_pooled_rr r_pooled_rr_ci_l r_pooled_rr_ci_u;

/*theta_的有效性檢驗,    h0 : theta_ =0         */
 utheta_ = ((sum(wi_ # rri))##2  ) / sum(wi_);
 utheta__pv = (1-probchi(utheta_, 1));

 zutheta_ = theta_ / se_theta_;
 zutheta__pv = (1-probnorm(abs(zutheta_)))#2;
  print utheta_ utheta__pv  zutheta_ zutheta__pv;


 median_ln_rri = median(ln_rri);
 wi_pct = wi/sum(wi);
 sum_w = sum(wi_pct);
 
 create output var {rri rri_ci_l rri_ci_u rr_pooled rr_pooled_ci_l rr_pooled_ci_u se_ln_rr  zrr Qrr 
 qrr_pv ln_rri wi  n1 n2 median_ln_rri i2 wi_pct wi_}; 
  append;       
 close output;
 GroupId =3;
  create pooled var {rr_pooled rr_pooled_ci_l rr_pooled_ci_u GroupId}; 
  append;       
 close output;
 create ran_pooled var {r_pooled_rr  r_pooled_rr_ci_l r_pooled_rr_ci_u GroupId}; 
  append;       
 close output;
quit;title;

data output2;
 merge output new_data;
 obs = _n_;
 GroupId = 1;
run;

data output3;
 set output2    
   pooled (rename=(RR_POOLED=rri   rr_pooled_ci_l=rri_ci_l rr_pooled_ci_u=rri_ci_u))
   ran_pooled (rename=(r_pooled_rr=rri   r_pooled_rr_ci_l=rri_ci_l   r_pooled_rr_ci_u=rri_ci_u));
 if _n_ = &obs_.+1 then &references. = "Pooled_F";
 if _n_ = &obs_.+2 then &references. = "Pooled_R";
run;


proc sgplot data = output2;
 HIGHLOW y=obs HIGH=rri_ci_u LOW=rri_ci_l / close = rri ;
 REFLINE 1 / label = "1" AXIS= X;
  yaxis REVERSE;
quit;
proc print data = output2(keep = &references. rri rri_ci_u rri_ci_l);
run;
/*
proc sgplot data = output2;
 STEP x = obs y= wi / DATALABEL = wi;
quit;
*/
proc sgplot data = output2;
 scatter y = se_ln_rr x = ln_rri /  datalabel = obs;
  yaxis REVERSE;
 refline median_ln_rri / axis = x;
quit;
%mend;


%macro meta_or(data = , ai = , bi = , ci = , di = ,references=);

data new_data dorp_data;
 set &data.;
 if &ai. = 0 or &bi. = 0 or &ci. = 0 or &di. = 0 then do;
  output dorp_data;
  delete;
 end;
  if ai = bi and ci = di then do;
  output dorp_data;
  delete;
 end;
 output new_data;
run;

proc iml;
 use new_data;
  read all var _all_;
 close new_data;   

%let open_d = %sysfunc (open(new_data)); 
%let obs_ = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));
%let obs_1 = %sysevalf(&obs_ - 1);
 n1 = &ai.+&ci.;
 n2 = &bi.+&di.;
 m1 = &ai.+&bi.;
 m2 = &ci.+&di.;
 N_total = &ai. +&bi.+&ci.+&di.; 
               /*99% 2.576*/
 eer = &ai. / n1;           /*98% 2.326*/
 cer = &bi. / n2;
 nnt = 1/abs(eer-cer);
 nnt_ci_l = 1/((eer-cer)+1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnt_ci_u = 1/((eer-cer)-1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnh = 1/abs(eer-cer);
 nnh_ci_l = 1/((eer-cer)+1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnh_ci_u = 1/((eer-cer)-1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));


/*OR*/
 oddsti = &ai./&ci.;
 oddsci = &bi./&di.;
 ori = oddsti/oddsci;
 /*Woolf's Logit estimate of OR*/
 var_ln_ori = 1/&ai. + 1/&bi. + 1/&ci. + 1/&di.;
 wi = 1 / var_ln_ori;
 ln_orw = sum(wi#log(ori))/sum(wi);
 orw = exp(ln_orw);
 ln_ori = log(ori);
 se_ln_or = sqrt(var_ln_ori);
 ln_ori_ci_l = ln_ori - 1.96#sqrt(var_ln_ori);
 ln_ori_ci_u = ln_ori + 1.96#sqrt(var_ln_ori);
 ori_ci_l = exp(ln_ori_ci_l);
 ori_ci_u = exp(ln_ori_ci_u);
 orw_ci_l = exp(ln_orw-1.96#sqrt(1/sum(wi)));
 orw_ci_u = exp(ln_orw+1.96#sqrt(1/sum(wi)));
 print ori ori_ci_l ori_ci_u  ln_ori ln_ori_ci_l ln_ori_ci_u orw orw_ci_l orw_ci_u;
 /* orw顯著性 h0:ORw=OR0 (e.g. or0=1)*/
 or0=1;
 Uw = sum(wi)#((ln_orw-log(or0))##2);
 Uw_pv = (1-probchi(Uw, 1));
 print Uw Uw_pv;
 /*ori同質性檢定 H0: or1=or2=...=or*/
 Qw = sum(wi#(log(ori)##2)) - sum(wi)#(ln_orw##2);
 Qw_pv = (1-probchi(Qw, &obs_1.));
 print Qw Qw_pv;
 i2 = (qw - &obs_1. )/ qw;
 print i2;
/* Random Effect */
 t_2 =  max(0, (Qw-(&obs_1.))   /  (sum(wi)- (sum(wi##2)) /sum(wi))) ;
 wi_ = 1/(var_ln_ori+t_2);
 theta_ = sum(wi_#ori) / sum(wi_);
 se_theta_ = 1 / sqrt(sum(wi_));
 theta__ci_l = theta_ - 1.96 # se_theta_;
 theta__ci_u = theta_ + 1.96 # se_theta_;

 r_pooled_or = exp(sum(wi_#log(ori))/sum(wi_));
 se_r_pooled_or = 1 / sqrt(sum(wi_));
 r_pooled_or_ci_l = r_pooled_or - 1.96#se_r_pooled_or;
 r_pooled_or_ci_u = r_pooled_or + 1.96#se_r_pooled_or;

 print t_2 wi_ theta_ se_theta_ theta__ci_l theta__ci_u ;
 print r_pooled_or se_r_pooled_or r_pooled_or_ci_l r_pooled_or_ci_u;

/*theta_的有效性檢驗,    h0 : theta_ =0         */
 utheta_ = ((sum(wi_ # ori))##2  ) / sum(wi_);
 utheta__pv = (1-probchi(utheta_, 1));

 zutheta_ = theta_ / se_theta_;
 zutheta__pv = (1-probnorm(abs(zutheta_)))#2;
  print utheta_ utheta__pv  zutheta_ zutheta__pv;
 median_ln_ori = median(ln_ori);
  wi_pct = wi/sum(wi);
 create output var { i2
       ori ori_ci_l ori_ci_u  ln_ori ln_ori_ci_l ln_ori_ci_u orw orw_ci_l orw_ci_u se_ln_or 
Uw Uw_pv Qw
       Qw_pv wi median_ln_ori n1 n2 wi_pct wi_}; 
  append;       
 close output;
 GroupId=3;
  create pooled var {orw orw_ci_l orw_ci_u GroupId}; 
  append;       
 close output;
  create ran_pooled var {r_pooled_or  r_pooled_or_ci_l r_pooled_or_ci_u GroupId}; 
  append;       
 close output;
quit;title;

data output2;
 merge output new_data;
 obs = _n_;
 GroupId = 1;
run;

data output3;
 set output2    
   pooled (rename=(orw=ori   orw_ci_l=ori_ci_l orw_ci_u=ori_ci_u))
   ran_pooled (rename=(r_pooled_or=ori   r_pooled_or_ci_l=ori_ci_l   r_pooled_or_ci_u=ori_ci_u));
 if _n_ = &obs_.+1 then &references. = "Pooled_F";
 if _n_ = &obs_.+2 then &references. = "Pooled_R";
run;



proc sgplot data = output2;
 HIGHLOW y=obs HIGH=ori_ci_u LOW=ori_ci_l / close = ori;
 REFLINE 1 / label = "1" AXIS= X;
  yaxis REVERSE;
quit;
proc print data = output2(keep = &references. ori ori_ci_u ori_ci_l);
run;
/*
proc sgplot data = output2;
 STEP x = obs y= wi_or / DATALABEL = wi_or;
quit;
*/
proc sgplot data = output2;
 scatter y = se_ln_or x = ln_ori /  datalabel = obs;
 yaxis REVERSE;
 refline median_ln_ori / axis = x; 
quit;
%mend;



%macro meta_rd(data = , ai = , bi = , ci = , di = ,references=);

data new_data dorp_data;
 set &data.;

 if ai = 0 and bi = 0 then do;
  output dorp_data;
  delete;
 end;

 if ai = bi and ci = di then do;
  output dorp_data;
  delete;
 end;

 output new_data;
run;

proc iml;
 use new_data;
  read all var _all_;
 close new_data;   

%let open_d = %sysfunc (open(new_data)); 
%let obs_ = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));
%let obs_1 = %sysevalf(&obs_ - 1);
 n1 = &ai.+&ci.;
 n2 = &bi.+&di.;
 m1 = &ai.+&bi.;
 m2 = &ci.+&di.;
 N_total = &ai. +&bi.+&ci.+&di.; 
               /*99% 2.576*/
 eer = &ai. / n1;           /*98% 2.326*/
 cer = &bi. / n2;
 nnt = 1/abs(eer-cer);
 nnt_ci_l = 1/((eer-cer)+1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnt_ci_u = 1/((eer-cer)-1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnh = 1/abs(eer-cer);
 nnh_ci_l = 1/((eer-cer)+1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 nnh_ci_u = 1/((eer-cer)-1.96*sqrt((eer#(1-eer)/n1)+(cer#(1-cer)/n2)));
 

/*RD*/
 pti = &ai./(&ai.+&ci.);
 pci = &bi./(&bi.+&di.);
 RDi = pti - pci;
 var_rdi = (pti#(1-pti))/n1 + (pci#(1-pci))/n2;
 se_rd = sqrt(var_rdi);
 wi = 1/var_rdi;
 RD_pooled = sum(wi#rdi) / sum(wi);
 var_rd_p =  1/sum(wi);
 se_rd_p = 1/sqrt(sum(wi));
 RDi_ci_l = RDi - 1.96#sqrt(var_rdi);
 RDi_ci_u = RDi + 1.96#sqrt(var_rdi);
 RDp_ci_l = RD_pooled - 1.96#se_rd_p;
 RDp_ci_u = RD_pooled + 1.96#se_rd_p;
 print rdi RDi_ci_l RDi_ci_u RD_pooled RDp_ci_l RDp_ci_u;
 /* rd顯著性 h0:rd=0  if |zrd|>1.96 reject ho*/
 Zrd = RD_pooled/se_rd_p;
 zrd_pv = (1-probnorm(abs(Zrd)))#2;
 print zrd zrd_pv;
 /* rd同質性 h0:rd1=rd2...*/
 Qrd = sum(wi#((RDi-RD_pooled)##2));
 Qrd_pv = (1-probchi(Qrd, &obs_1.));
 print Qrd Qrd_pv;
 i2 = (qrd - &obs_1.) / qrd;
 print i2;
/* Random Effect */
 t_2 =  max(0, (Qrd-(&obs_1.))   /  (sum(wi)- (sum(wi##2)) /sum(wi))) ;
 wi_ = 1/(var_rdi+t_2);
 theta_ = sum(wi#rdi) / sum(rdi);
 se_theta_ = 1 / sqrt(sum(wi_));
 theta__ci_l = theta_ - 1.96 # se_theta_;
 theta__ci_u = theta_ + 1.96 # se_theta_;

 r_pooled_rd = exp(sum(wi_#rdi)/sum(wi_));
 se_r_pooled_rd = 1 / sqrt(sum(wi_));
 r_pooled_rd_ci_l = r_pooled_rd - 1.96#se_r_pooled_rd;
 r_pooled_rd_ci_u = r_pooled_rd + 1.96#se_r_pooled_rd;

 print t_2 wi_ theta_ se_theta_ theta__ci_l theta__ci_u ;
 print r_pooled_rd se_r_pooled_rd r_pooled_rd_ci_l r_pooled_rd_ci_u;

/*theta_的有效性檢驗,    h0 : theta_ =0         */
 utheta_ = ((sum(wi_ # rdi))##2  ) / sum(wi_);
 utheta__pv = (1-probchi(utheta_, 1));

 zutheta_ = theta_ / se_theta_;
 zutheta__pv = (1-probnorm(abs(zutheta_)))#2;
  print utheta_ utheta__pv  zutheta_ zutheta__pv;

 median_rdi = median(rdi);
  wi_pct = wi/sum(wi);
 create output var {
      i2  Qw_pv rdi RDi_ci_l RDi_ci_u RD_pooled RDp_ci_l RDp_ci_u se_rd zrd  Qrd Qrd_pv
 wi median_rdi n1 n2 wi_pct}; 
  append;       
 close output;
 GroupId=3;
  create pooled var {RD_pooled RDp_ci_l RDp_ci_u GroupId}; 
  append;       
 close output;
  create ran_pooled var { r_pooled_rd  r_pooled_rd_ci_l r_pooled_rd_ci_u GroupId}; 
  append;       
 close output;



quit;title;

data output2;
 merge output new_data;
 obs = _n_;
 GroupId = 1;
run;

data output3;
 set output2    
   pooled (rename=(RD_pooled=rdi   RDp_ci_l=rdi_ci_l RDp_ci_u=rdi_ci_u))
   ran_pooled (rename=(r_pooled_rd=rdi   r_pooled_rd_ci_l=rdi_ci_l   r_pooled_rd_ci_u=rdi_ci_u));
 if _n_ = &obs_.+1 then &references. = "Pooled_F";
 if _n_ = &obs_.+2 then &references. = "Pooled_R";
run;


proc sgplot data = output2;
 HIGHLOW y=obs HIGH=rdi_ci_u LOW=rdi_ci_l / close = rdi;
 REFLINE 0 / label = "0" AXIS= X;
  yaxis REVERSE;
quit;
proc print data = output2(keep = &references. rdi rdi_ci_u rdi_ci_l);
run;
/*
proc sgplot data = output2;
 STEP x = obs y= wi / DATALABEL = wi;
quit;
*/
proc sgplot data = output2;
 scatter y = se_rd x = rdi  / datalabel = obs;
 yaxis REVERSE;
 refline median_rdi / axis = x;
quit;


%mend;

%macro my_meta_norm(data = , NT = , YT = , ST = , NC = , YC = , SC = , references=);
%let open_d = %sysfunc (open(&data.)); 
%let obs_ = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));
%let obs_1 = %sysevalf(&obs_ - 1);
proc iml;
 use &data.;
  read all var _all_;
 close &data.; 
/* si = sqrt(st##2+sc##2-2*0.7*st*sc) /sqrt(2*(1-0.7)); */
/* si = sqrt(((nt-1)#((st)##2) + (nc-1)#((sc)##2))/(nt+nc-2));*/
/* esi = (yt - yc)/si;*/
/* proc iml;
 yt = {54 48 40};
 yc = {71 72 75};
 st={14 15 11};
 sc = {6 8 8};
 n = {36 39 37};
 nt = {36 39 37};
 nc = {36 39 37};
 si = sqrt(st##2+sc##2-2#0.7#st#sc) /sqrt(2#(1-0.7));
 esi = (yt - yc)/si;
 se_esi = sqrt((1/n)+(esi##2)/(2#n)) # sqrt(2#(1-0.7));
  create d1 var {yt yc st sc n si esi se_esi nt nc}; 
  append;       
 close d1;

quit;*/
/* m = nt+nc -2;*/
/* m = nt -1;*/
/* cm = 1 - (3/(4#m-1));*/
/* esui = cm#esi;*/
/* var_esui = (nt+nc)/(nt#nc)  +  ((esui)##2)/(2#(nt+nc));*/
/* se_esui = sqrt(var_esui);*/
 esui_ci_l = esui - 1.96 #se_esui;
 esui_ci_u = esui + 1.96 #se_esui;
 print si esi esui var_esui esui_ci_l esui_ci_u;






 wi = 1/var_esui;
 pooled_es = sum(wi#esui) / sum(wi);
 se_pooled_es = 1 / sqrt(sum(wi));


 pooled_es_ci_l = pooled_es - 1.96#se_pooled_es;
 pooled_es_ci_u = pooled_es + 1.96#se_pooled_es;
 
 print wi pooled_es se_pooled_es pooled_es_ci_l pooled_es_ci_u;
/* print wi2 pooled_es2 se_pooled_es2;*/

/*ES的顯著性檢驗, */
/*      i.e. Testing H0: Effect Size = 0*/

 ues = ((sum(wi#esui))##2) / sum(wi);
 ues_pv = (1-probchi(ues, 1));
 print ues ues_pv;


median_esui = median(esui);
/*ES的同質性檢驗, */
/*      i.e. Testing H0: = = = */
 qes = sum(wi# ((esui-pooled_es)##2));
 qes_pv = (1-probchi(qes, &obs_1.));
 print qes qes_pv;

 i2 = (qes -&obs_1.)/qes;
 print i2;
  wi_pct = wi/sum(wi);



/* Random Effect Model*/
 t_2 =  max(0, (qes-(&obs_1.))   /  (sum(wi)- (sum(wi##2)) /sum(wi))) ;
 wi_ = 1/(var_esui+t_2);
 wi__ = (wi_/sum(wi_) )*100;
 theta_ = sum(wi_#esui) / sum(wi_);
 se_theta_ = 1 / sqrt(sum(wi_));
 theta__ci_l = theta_ - 1.96 # se_theta_;
 theta__ci_u = theta_ + 1.96 # se_theta_;

 r_pooled_es = sum(wi_#esui) / sum(wi_);
 se_r_pooled_es = 1 / sqrt(sum(wi_));
 r_pooled_es_ci_l = pooled_es - 1.96#se_r_pooled_es;
 r_pooled_es_ci_u = pooled_es + 1.96#se_r_pooled_es;

 print t_2 wi_ wi__ theta_ se_theta_ theta__ci_l theta__ci_u ;
 print r_pooled_es se_r_pooled_es r_pooled_es_ci_l r_pooled_es_ci_u;
/*r ES的顯著性檢驗, */
/*      i.e. Testing H0: Effect Size = 0*/

 ues = ((sum(wi_#esui))##2) / sum(wi);
 ues_pv = (1-probchi(ues, 1));
 print ues ues_pv;



/*r ES的同質性檢驗, */
/*      i.e. Testing H0: = = = */
 qes = sum(wi_# ((esui-pooled_es)##2));
 qes_pv = (1-probchi(qes, &obs_1.));
 print qes qes_pv;

 i2 = (qes -&obs_1.)/qes;
 print i2;
  wi_pct = wi/sum(wi);
/*theta_的有效性檢驗,    h0 : theta_ =0           */
 utheta_ = ((sum(wi_ # esui))##2  ) / sum(wi_);
 utheta__pv = (1-probchi(utheta_, 1));

 zutheta_ = theta_ / se_theta_;
 zutheta__pv = (1-probnorm(abs(zutheta_)))#2;
  print utheta_ utheta__pv  zutheta_ zutheta__pv;

 GroupId=3;
  create pooled var {pooled_es  pooled_es_ci_l pooled_es_ci_u GroupId}; 
  append;       
 close pooled;
  create ran_pooled var {r_pooled_es  r_pooled_es_ci_l r_pooled_es_ci_u GroupId}; 
  append;       
 close ran_pooled;


;

  create output var { i2 si esi esui var_esui se_esui esui_ci_l esui_ci_u  pooled_es se_pooled_es pooled_es_ci_l pooled_es_ci_u ues ues_pv  qes qes_pv
          median_esui wi_ wi__  r_pooled_es se_r_pooled_es r_pooled_es_ci_l r_pooled_es_ci_u }; 
  append;       
 close output;


quit;

data output2;
 merge output (rename=(esui=smd esui_CI_L=lcl esui_CI_u=ucl WI__ = weigth_pct))                        &data.;
 obs = _n_;
 GroupId = 1;
  smd_ = Round(smd, 0.01);
run;/*
data output3;
 set output2    
   pooled (rename=(pooled_es=esui   pooled_es_ci_l=esui_ci_l pooled_es_ci_u=esui_ci_u))
   ran_pooled (rename=(r_pooled_es=esui   r_pooled_es_ci_l=esui_ci_l   r_pooled_es_ci_u=esui_ci_u));
 if _n_ = &obs_.+1 then &references. = "Pooled_F";
 if _n_ = &obs_.+2 then &references. = "Pooled_R";
run;
*/
proc sgplot data = output2;
 HIGHLOW y=obs HIGH=esui_ci_u LOW=esui_ci_l / close = esui ;
 REFLINE 0 / label = "0" AXIS= X;
  yaxis REVERSE;
quit;
proc print data = output2(keep = &references. esui esui_ci_l esui_ci_u wi);
run;

proc sgplot data = output2;
 scatter y = se_esui x = esui / datalabel = obs;
 refline median_esui / axis = x; 
 yaxis REVERSE;
quit;


%mend;

%macro symsize;
data _null_;
 set metadat;
  retain sizeh1-sizeh%eval(&n_stud+2)
  fontv1-fontv%eval(&n_stud+2); 
  length fontv1-fontv%eval(&n_stud+2) $20.; 
  array sizes sizeh1-sizeh%eval(&n_stud+2); 
  array fvs $ fontv1-fontv%eval(&n_stud+2); 
  do i=1 to (&n_stud)+2; 
  if i=y then do; 
  sizes{i}=size*11/&maxsize; 
  fvs{i}='font=specialu v=K'; 
  if i=&n_stud+2 then output; 
  end;
 end;
%do i=1 %to (&n_stud)+2;
 call symput("sh&i",trim(left(put(sizeh&i,6.2))));
 call symput("fv&i",trim(left(put(fontv&i,20.))));
 %global sh&i fv&i;
%end;
run; 
%mend symsize;
%macro doall (study);
%do %while (&study le (&n_stud+2));
 if y=&study then do;
 xx&study=lower95;
 yy&study=y+0.2; output;
 yy&study=y-0.2; output;
 yy&study=y;
 output;
 xx&study=upper95; output;
 yy&study=y+0.2;output;
 yy&study=y-0.2; output;
 end;
%let study=%eval(&study+1);
%end;
%mend doall; 
%macro syms;
%do i=1 %to (&n_stud)+2;
 symbol&i &&fv&i l=1 interpol=none h=&&sh&i color=black;
%end;
%mend syms;
%macro plotpts;
%do i=1 %to (&n_stud + 2);
 yy&i*xx&i=%eval(&n_stud+3)
%end;
%do i=1 %to (&n_stud + 2);
 y&i*x&i=&i
%end;
%mend plotpts;
%Macro PubBias(N1=size1,N2=size2,di=effsize,studyID=study,ModelType=1,Dataset=MetaPub);
proc iml; 
use &Dataset;
 read all var{&N1} into n1_vec;
 read all var{&N2} into n2_vec;
 read all var{&di} into di_vec;
 read all var{&n1 &n2} into n_vec;
 k= nrow(di_vec); 
 start mean_d(di_vec,var_di,tau2,d_mean,resum_wt,vi_star,d_SE,upper95,lower95);
 k = nrow(di_vec);
 d_mean = 0;
 resum_wt = 0;
 vi_star = J(k,1,0);
 do i = 1 to k;
 d_mean = d_mean + di_vec[i,1]/(var_di[i,1]+tau2);
 resum_wt = resum_wt + (var_di[i,1]+tau2)##-1;
 vi_star[i,1] = var_di[i,1]+tau2;
 end;
 d_mean = d_mean/resum_wt;
 d_SE = SQRT(resum_wt##-1);
 upper95 = d_mean + 1.96#d_SE;
 lower95 = d_mean - 1.96#d_SE;
finish; 
start calcq(di_vec,n_vec,qq,prob_qq1,var_di);
 k = nrow(di_vec);
 var_di=J(k,1,0);
 do i = 1 to k;
 var_di[i,1] = ((n_vec[i,1]+n_vec[i,2])/(n_vec[i,1]#n_vec[i,2])) +
((di_vec[i,1]##2)/(2#(n_vec[i,1]+n_vec[i,2])));
 end;
 d_plus = 0;
 fesum_wt = 0;
 do i = 1 to k;
 d_plus = d_plus + di_vec[i,1]/var_di[i,1];
 fesum_wt = fesum_wt + var_di[i,1]##-1;
 end;
 d_plus = d_plus/fesum_wt;
 QQ = 0;
 do i = 1 to k;
 QQ = QQ + ((di_vec[i,1] - d_plus)##2/var_di[i,1]);
 end;
 prob_qq1 = 1 - PROBCHI(QQ,k-1);
finish; 
start calcreg(di_vec,n_vec,X_Matrix,vi,B_wls,SE_B,B_ols,SE_B_ols);
 k = nrow(di_vec);
 X = J(k,1,1)||X_Matrix;
 B_wls = INV(X`*DIAG(vi)*X)*X`*DIAG(vi)*di_vec;
 cov_b = INV(X`*DIAG(vi)*X);
 SE_B = SQRT(vecdiag(cov_b));
 B_ols =INV(X`*X)*X`*di_vec;
 cov_b = INV(X`*X);
 SE_B_ols = SQRT(vecdiag(cov_b));
finish; 
START KENDALL(A,B,N,T_AB,untie_A,untie_B,VART_AB,Z_TEST);
 DOM_MTXA = J(N,N,0);
 ties_A = 0;
 counts_A = 0;
 do i = 1 to N;
 do j = 1 to N;
 if A[i,1] > A[j,1] then do;
 DOM_MTXA[i,j] = 1;
 end;
 if A[i,1] < A[j,1] then do;
 DOM_MTXA[i,j] = -1;
 end;
 if A[i,1] = A[j,1] then do;
 ties_A = ties_A + 1;
 end;
 counts_A = counts_A + 1; 
 end;
 end;
 untie_A = 1 - (ties_A - N)/(counts_A - N);
 DOM_MTXB = J(N,N,0);
 ties_B = 0;
 counts_B = 0;
 do i = 1 to N;
 do j = 1 to N;
 if B[i,1] > B[j,1] then do;
 DOM_MTXB[i,j] = 1;
 end;
 if B[i,1] < B[j,1] then do;
 DOM_MTXB[i,j] = -1;
 end;
 if B[i,1] = B[j,1] then do;
 ties_B = ties_B + 1;
 end;
 counts_B = counts_B + 1;
 end;
 end;
 untie_B = 1 - (ties_B - N)/(counts_B - N);
 DOM_MTXD = J(N,N,0);
 do i = 1 to N;
 do j = 1 to N;
 DOM_MTXD[i,j] = DOM_MTXA[i,j]#DOM_MTXB[i,j];
 end;
 end;
 MTXD_sum = DOM_MTXD[,+];
 T_AB = MTXD_sum[+,] #(1/(n#(n-1)));
MTX_F = J(N,1,0);
do i = 1 to N;
 MTX_F[i,1] = (MTXD_sum [i,1]/(n-1));
 end;
MTX_G = J(N,1,0);
 do i = 1 to N;
 MTX_G[i,1] = (MTX_F[i,1] - T_AB)##2;
 end;
 MTXG_sum = 1/(n-1)#(MTX_G[+,]);
MTX_H = J(N,N,0);
 do i = 1 to N;
 do j = 1 to N;
 MTX_H[i,j] = DOM_MTXD[i,j]#DOM_MTXD[i,j];
 end;
 end;
 MTXH_sum = MTX_H[+,+];
 NUMER_AB = MTXH_sum - ((n) # (n-1) # (T_AB#T_AB));
 DENOM_AB = n#(n-1) -1;
 VART_AB = NUMER_AB/DENOM_AB; 
  vart_ab = ((4#(n-2)#(MTXG_sum))+(2#VART_AB)) / (n#(n-1));
IF vart_ab > 0 THEN DO ;
 Z_TEST = (T_AB /SQRT(vart_ab));
end;
if vart_ab =0 then do;
 Z_TEST = 5.00;
END;
FINISH; 
 run calcq(di_vec,n_vec,qq,prob_qq1,var_di);
 CC = (J(1,K,1)*var_di##-1) - ((J(1,K,1)*var_di##-2) /
 (J(1,K,1)*var_di##-1));
 Tau2 = (QQ - (K - 1)) / cc;
 if tau2 < 0 then tau2 = 0;
 if &modeltype=0 then tau2=0; 
 run mean_d(di_vec,var_di,tau2,d_mean,resum_wt,vi_star,d_SE,upper95,lower95);
 vi = J(k,1,0);
 vi_inv = j(k,1,0);
 mean = 0;
 weight = 0;
 t_star = J(k,1,0);
 Vi_stand = J(k,1,0);
 dev_di = J(k,1,0);
 Root_Vi=J(k,1,0);
 Egger_z = J(k,1,0);
 REJ_TRIM = J(3,1,0);
 do i = 1 to k;
 vi[i,1] = vi_star[i,1];
 vi_inv[i,1] = vi[i,1]##-1;
 mean = d_mean;
 weight = resum_wt;
 dev_di[i,1] = ABS(di_vec[i,1] - mean);
 Vi_stand[i,1] = vi[i,1] - (weight##-1);
 t_star[i,1] = (di_vec[i,1] - mean)/SQRT(Vi_stand[i,1]);
 Root_Vi[i,1] = vi[i,1]##-0.5;
 Egger_z[i,1] = di_vec[i,1]#Root_Vi[i,1];
 end;
 run calcreg(Egger_z,n_vec,Root_Vi,vi_inv,B_wls,SE_B,B_ols,SE_B_ols);
 eggt_ols = B_ols[1,1]/SE_B_ols[1,1]; 
 eggPR_tOLS =2#(1-probt(abs(eggt_ols),k-2)); 
  run KENDALL(t_star,vi,k,T_X1Y,UT_A,UT_B,VART_X1Y,Z_TEST);
 BeggV_z = Z_TEST;
 BeggVpr_z =2#(1-probnorm(abs(Z_test)));
  total_n = n_vec * J(2,1,1);
 run KENDALL(t_star,total_n,k,T_X1Y,UT_A,UT_B,VART_X1Y,Z_TEST);
 BeggN_z = Z_TEST;
 BeggNpr_z =2#(1-probnorm(abs(Z_test))); 
  run calcreg(di_vec,n_vec,total_n,vi_inv,B_wls,SE_B,B_ols,SE_B_ols);
 Funt_WLS = B_wls[2,1]/SE_B[2,1];
 FunPR_tWLS =2#(1-probt(abs(Funt_WLS),k-2)); 
  dev_rank = rank(dev_di);
 do i = 1 to k;
 if di_vec[i,1] < mean then dev_rank[i,1] = -1#dev_rank[i,1];
 end;
 r=-1#MIN(dev_rank);
 gamma=k-r;
 ro=gamma-1;
 r2=MAX(dev_rank);
 gamma2=k-r2;
 ro2=gamma2-1;
 if ro > 3 then REJ_TRIM[1,1] = REJ_TRIM[1,1] +1;
 if ro2 > 3 then REJ_TRIM[2,1] = REJ_TRIM[2,1] +1; 
 if (ro > 3 | ro2 > 3) then REJ_TRIM[3,1] = REJ_TRIM[3,1] +1;
 Right=REJ_TRIM[1,1]; If Right=1 then Right='Yes'; Else Right='No';
 Left=REJ_TRIM[2,1];If Left=1 then Left='Yes'; Else Left='No';
 Both=REJ_TRIM[3,1];If Both=1 then Both='Yes'; Else Both='No';
file print;
put
@1'-------------------------------------------------------------------------------------'//
@1 'Meta-Analysis: Descriptive Information'/
@1 '--------------------------------------'//
@5'Number of Studies'@60 k//
@5'Mean Effect Size' @60 d_mean 10.4/
@7'Confidence Band'/
@10 '95% Lower Limit' @60lower95 10.4/
@10'95% Upper Limit'@60 upper95 10.4//
@5 'Test for Homogeneity'@60'Chi Square'@75'Probability'/
@5 '--------------------' @60'----------'@75'-----------'/  
@5'Q test'@62 QQ 8.4 @75 prob_qq1 10.4//
@5'Random Effects Variance Component'@62 Tau2 8.4///
@1'--------------------------------------------------------------------------------------'//
@1 'Meta-Analysis: Tests of Publication Bias'/
@1 '----------------------------------------'//
@5 'Egger Regression' @60't value'@75'Probability'/
@5 '----------------' @60'-------'@75'-----------'/
@5 'Egger OLS Regression' @60 eggt_OLS 7.4 @75 eggPR_tOLS 10.4///
@5 'Begg Rank Correlation' @60'z value'@75'Probability'/
@5 '---------------------' @60'-------'@75'-----------'/
@5 'Begg Rank Correlation (Predictor=Variance)' @60 beggV_z 7.4 @75 beggVpr_z 10.4/
@5 'Begg Rank Correlation (Predictor=Sample Size)' @60 beggN_z 7.4 @75 beggNpr_z 10.4///
@5 'Funnel Plot Regression' @60't value'@75'Probability'/
@5 '----------------------' @60'-------'@75'-----------'/
@5 'Funnel Plot WLS Regression' @60 Funt_WLS 7.4 @75 FunPR_tWLS 10.4///
@5 'Trim and Fill'@60'Publication Bias Present'/
@5 '-------------'@60'------------------------'/
@7 'Right Tail' @70 Right/
@7 'Left Tail' @70 Left/
@7 'Both Tails' @70 Both/
@1'--------------------------------------------------------------------------------------'//;
 sum_n = n_vec*J(2,1,1);
 total_n = sum(sum_n);
 outvector = d_mean||lower95||upper95||total_n;
 create j1 from outvector;
 append from outvector;
quit;
data meta2;
 set &Dataset;
 SE = SQRT((&n1 + &n2) / (&n1 * &n2) + &di**2 / (2*(&n1 + &n2)));
 lower95 = &di - 1.96*SE;
 upper95 = &di + 1.96*SE;
 size = &n1 + &n2;
 y = _n_;
data total1;
 set j1;
 rename col1 = &di col2 = lower95 col3 = upper95 col4 = size;
 length &StudyID $ 12;
 y = 99;
 &StudyID = 'Total';
data total2;
 y = 98;
data total;
 set total1 total2;
 axis1 label=(height=2.9
 font='Zapf' 'Effect Size')
 minor=none
 value=(height=2.5 font='Zapf');
 axis2 label=(height=2.9 angle = 90
 font='Zapf' 'Sample Size')
 minor=none
 value=(height=2.5 font='Zapf');
 title1 font = 'Zapf' height = 2.9 'Funnel Plot' ;
 symbol1 interpol=none
 value=circle  
 height=3
 cv=black
 ci=black
 co=black
 width=2;
/*proc gplot data=meta2;*/
/* plot size*&di /haxis=axis1 vaxis=axis2 frame ;*/
/*run;*/
proc means data=meta2 noprint;
 var size;
 output out=meanout sum=sum max=max;
data _null_;
set meanout end=eof;
if eof then do;
 call symput("n_stud",trim(left(put(_freq_,8.))));
 call symput("n_subs",trim(left(put(sum,8.))));
 call symput("maxsize",trim(left(put(max,8.))));
end;
proc sort data=meta2; by &di upper95;
run;
data metadat;
set total(in=a) meta2(in=b);
 length metastr $ 200;
 retain metastr studcnt;
 metanum=y;
 y=_n_;
 if _n_=1 then do;
 metastr="0=' ' %eval(&n_stud+3)=' '";
 studcnt=&n_stud+2;
 end;
 else studcnt=studcnt-1;
 do i=1 to %eval(&n_stud+2);
 if i=studcnt then metastr=" " || trim(left(metastr)) || ' ' || trim(left(y))
||"='" || trim(left(&StudyID)) || "' ";
 end;
 if &StudyID=' ' then y=.;
 yo=lower95; xo=upper95;
 output metadat;
 if _n_=%eval(&n_stud+2) then do;
 call symput("metastr",trim(metastr));
 end;
run;
proc format;
 value stfmt &metastr;
run;
data alldata;
 set metadat;
 %doall (1)
 %symsize
 %syms
data final(drop=i);
set alldata;
 array xarray{*} x1-x%eval(&n_stud+2);
 array yarray{*} y1-y%eval(&n_stud+2); 
 do i=1 to (&n_stud)+2;
 if i=y then do;
 xarray{i}=&di;
 yarray{i}=y;
 end;
 end;
format yy1-yy%eval(&n_stud+2) stfmt.;
run;
 axis1 label=(height=2.9
 font='Zapf' 'Effect Size')
 minor=none
 value=(height=2.5 font='Zapf');
 axis2 label=(height=2.9 angle = 90
 font='Zapf' 'Study')
 minor=none
 order=( 0 to %eval(&n_stud+3) by 1)
 value=(height=2.5 font='Zapf');
 /*title1 font = 'Zapf' height = 2.9 'Forest Plot' ;
proc gplot data=final;
plot
%plotpts / overlay haxis=axis1 vaxis=axis2 frame href=0;
symbol%eval(&n_stud+3) f=marker v=none l=1 w=1 i=join;
symbol1 font=marker v=P l=1 h=2.7 interpol=none;
run;*/
title;
quit;
%Mend PubBias; 


%macro ForestMacro (
       Data=,         /*--Data Set Name (Required)--*/
       Study=,          /*--Variable name for Study (Required)--*/ 
       OddsRatio=,      /*--Variable name for Odds Ratio (Required)--*/ 
       LCL=,            /*--Variable name for Lower Confidence Limit (Required)--*/ 
       UCL=,            /*--Variable name for Upper Confidence Limit (Required)--*/ 
       Group=GroupId,          /*--Variable name for Study Type--*/ 
       Weight=,         /*--Variable name for Study Weight in %--*/
    StatCol1=,       /*--Variable name for Stat Column 1--*/
    StatCol2=,       /*--Variable name for Stat Column 2--*/
    StatCol3=,       /*--Variable name for Stat Column 3--*/
    StatCol4=,       /*--Variable name for Stat Column 4--*/
    DisplayCols=YES, /*--Display the columns for OR, LCL, UCL & Weight--*/
    WtFactor=,       /*--Multiplier factor for Study Weights--*/
                        /*--If not provided WtFactor is computed internally--*/
    Bands=YES,       /*--Draw Horizontal Alternating Bands--*/
    Borders=NO,      /*--Draw Borders--*/
    GraphWalls=NO,   /*--Draw Filled Walls behind the Graph--*/
    StatWalls=NO,    /*--Draw Filled Walls behind the Statistics Tables--*/
    Width=6.4in,     /*--Default width of the graph in pixels--*/
    Height=,         /*--Default height of the graph is computed based on number of observations--*/
    LabelColWidth=0.2,                /*--Fractional width for Label Column--*/
    Label1=Favors Treatment,          /*--Favorable Label--*/
    Label2=Favors Control,            /*--Unfavorable Label--*/
    PlotTitle=Odds Ratio and 95% CL,  /*--Plot Title--*/
    FootNote=,                        /*--Graph Footnote--*/
    Title2=,                          /*--Graph title2--*/
    Title=Impact of Treatment on Mortality by Study  /*--Graph Title--*/ 
);

%local  WeightVar MarkerSize GraphColWidth StatColWidth Border DisplaySecondary GraphWallDisplay StatWallDisplay;
%local  OddsLabel LowerLabel UpperLabel WeightLabel SLabel1 SLabel2 SLabel3 SLabel4;
%local  GraphHeight Ratio RowHeight HeaderHeight Nobs;

/*--Data, Study, OddsRatio, LCL and UCL are required   --*/
/*--Group is optional                                  --*/
/*--Terminatethese required parameters are not supplied--*/
%if %length(&Data) eq 0 %then %do;
%put The parameter 'Data' is required - Forest Macro Terminated.;
%goto finished;
%end;
%else %if %length(&Study) eq 0 %then %do;
%put The parameter 'Study' is required - Forest Macro Terminated.;
%goto finished;
%end;
%else %if %length(&LCL) eq 0 %then %do;
%put The parameter 'LCL' is required - Forest Macro Terminated.;
%goto finished;
%end;
%else %if %length(&UCL) eq 0 %then %do;
%put The parameter 'UCL' is required - Forest Macro Terminated.;
%goto finished;
%end;
%else %if %length(&OddsRatio) eq 0 %then %do;
%put The parameter 'OddsRatio' is required - Forest Macro Terminated.;
%goto finished;
%end;

/*--Initialize GraphHeight, Height per row and Height for other graph items--*/
%let GraphHeight=&Height;
%let RowHeight=22;
%let HeaderHeight=100;
%if %length(&Footnote) ne 0 %then %do;
  %let HeaderHeight=115;
%end;

/*--If the Weight column is not provided, use equal weights, and suppress display of Weight stat--*/
%if &Weight eq %then %do;
  %let WeightVar = _Weight;
  %let MarkerSize = 7;
%end; 
%else %do;
  %let WeightVar=&Weight;
  %let MarkerSize = 0;
%end;

/*--Set up GTL options for borders--*/
%let DisplaySecondary = displaysecondary=none;
%let Borders=%upcase(&Borders);
%if &Borders eq YES or &Borders eq Y %then %do;
  %let Border = line;
  %let DisplaySecondary = displaysecondary=(line);
%end;

/*--Set up GTL options for GraphWall Display--*/
%let GraphWallDisplay = walldisplay=none;
%let GraphWalls=%upcase(&GraphWalls);
%if &GraphWalls eq YES or &GraphWalls eq Y %then %do;
  %let GraphWallDisplay = walldisplay=(fill);
%end; 

/*--Set up GTL options for StatWall Display--*/
%let StatWallDisplay = walldisplay=none;
%let StatWalls=%upcase(&StatWalls);
%if &StatWalls eq YES or &StatWalls eq Y %then %do;
  %let StatWallDisplay = walldisplay=(fill);
%end;

/*--Create Label Columns for standard and additional columns--*/

/*--Load Stat Column Label or name into macro for label column value--*/
%let dsid=%sysfunc(open(&Data));
%if &dsid %then %do;
    
    %let Nobs=%sysfunc(attrn(&dsid, nlobs));
 %if &Nobs eq 0 %then %do;
      %put The Data Set &Data has no observations - Forest Macro Terminated.;
      %let rc=%sysfunc(close(&dsid)); 
      %goto finished;
    %end;

    %if &Nobs gt 150 %then %do;
      %put The Data Set &Data has over 100 observations - Forest Macro Terminated.;
      %let rc=%sysfunc(close(&dsid)); 
      %goto finished;
    %end;

    /*--Count the number of stat columns--*/
    %let idx=0;
 
 /*--Column display information for the OddsRatio column--*/
 %let DisplayCols=%upcase(&DisplayCols);

    %if &DisplayCols eq YES or &DisplayCols eq Y %then %do;
      %let OddsLabel=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,&OddsRatio))));
     %if %length(&OddsLabel) eq 0 %then %let OddsLabel=&OddsRatio;
   %let idx= %eval(&idx+1);

      %let LowerLabel=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,&LCL))));
     %if %length(&LowerLabel) eq 0 %then %let LowerLabel=&LCL;
   %let idx= %eval(&idx+1);

      %let UpperLabel=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,&UCL))));
     %if %length(&UpperLabel) eq 0 %then %let UpperLabel=&UCL;
   %let idx= %eval(&idx+1);

      %if &Weight ne %then %do;
        %let WeightLabel=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,&Weight))));
       %if %length(&WeightLabel) eq 0 %then %let WeightLabel=&Weight;
     %let idx= %eval(&idx+1);
   %end;
    %end;

 /*--Additional columns to be displayed--*/
    %if %length(&StatCol1) ne 0 %then %do;
      %let SLabel1=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,&StatCol1))));
     %if %length(&SLabel1) eq 0 %then %let SLabel1=&StatCol1;
   %let idx= %eval(&idx+1);
    %end;

    %if %length(&StatCol2) ne 0 %then %do;
      %let SLabel2=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,&StatCol2))));
     %if %length(&SLabel2) eq 0 %then %let SLabel2=&StatCol2;
   %let idx= %eval(&idx+1);
    %end;

    %if %length(&StatCol3) ne 0 %then %do;
      %let SLabel3=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,&StatCol3))));
     %if %length(&SLabel3) eq 0 %then %let SLabel3=&StatCol3;
   %let idx= %eval(&idx+1);
    %end;

    %if %length(&StatCol4) ne 0 %then %do;
      %let SLabel4=%sysfunc(varlabel(&dsid, %sysfunc(varnum(&dsid,&StatCol4))));
     %if %length(&SLabel4) eq 0 %then %let SLabel4=&StatCol4;
   %let idx= %eval(&idx+1);
    %end;

    %let rc=%sysfunc(close(&dsid)); 

 /*--Set column weights based on number of stat columns--*/
    %let StatColWidth=%sysevalf(&idx * 0.075);
    %let GraphColWidth= %sysevalf(1.0 - &LabelColWidth - &StatColWidth);
%end;
%else %do;
    %put The data set &Data does not exist - Forest Macro Terminated.;
    %goto finished;
%end;

/*--Compute Weight Factor if not provided   --*/
/*--Estimate height of graph if not provided--*/
data _null_;
  set &Data end=last;
  retain totalweight 0;
  totalweight+wt;

  if last then do;
    %if &wtFactor eq %then %do;
      if totalweight <= 0 then totalweight=1;
      call symput ('wtFactor', 1 / totalweight);
    %end;
 /*--Estimate Ratio of Plot height by Graph Height--*/
 call symput ('Ratio', (_N_* &RowHeight)/(_N_* &RowHeight + &HeaderHeight));

    /*--Estimate the optimal height of the graph based on obs count--*/
    %if &Height eq %then %do;
   call symput ('GraphHeight', _N_ * &RowHeight + &HeaderHeight);
 %end;
  end;
run;

/*--Append a PX only if this internally estimated--*/
%if &Height eq %then %do;
%let GraphHeight=&GraphHeight.px;
%end;

/*--Process Data--.3*/
data _forest;
  set &Data;
  format _wt  ;

  _ObsId=_N_;

  %if &Weight eq %then %do;
    &WeightVar=0;
  %end;

  label _wt=&WeightLabel;

  /*--If Group column is provided--*/
  %if %length(&Group) ne 0 %then %do;
    /*--Group=1 (Study) values will be drawn without a group role--*/
    if &group=1 then do;
   _wt=&WeightVar / 100;
   _grp=10;
      _1 = &OddsRatio;
      _lcl1=&LCL; 
      _ucl1=&UCL;
      /*--Compute marker width--*/
      _x1=&OddsRatio / (10 ** (&WeightVar*&WtFactor/2));
      _x2=&OddsRatio * (10 ** (&WeightVar*&WtFactor/2));
    end;
  /*--Group=2 & 3 (SubGroup and Overall) values will be drawn with groupindex=2 & 3--*/
    else if &group > 1 then do;
      _grp=&group;
      _2 = &OddsRatio;
    end;
  %end;
  %else %do;
    _wt=&WeightVar / 100;
    _grp=10;
    _1 = &OddsRatio;
    _lcl1=&LCL; 
    _ucl1=&UCL;
    /*--Compute marker width--*/
    _x1=&OddsRatio / (10 ** (&WeightVar*&WtFactor/2));
    _x2=&OddsRatio * (10 ** (&WeightVar*&WtFactor/2));
  %end;

  /*--Create label columns for standard and additional statistic--*/
  %if %length(&OddsRatio) ne 0 %then %do;
    _OddsRatioLabel = symget('OddsLabel');
  %end;

  %if %length(&LCL) ne 0 %then %do;
    _LowerLabel = symget('LowerLabel');
  %end;

  %if %length(&UCL) ne 0 %then %do;
    _UpperLabel = symget('UpperLabel');
  %end;

  %if %length(&Weight) ne 0 %then %do;
    _WeightLabel = symget('WeightLabel');
  %end;

  %if %length(&StatCol1) ne 0 %then %do;
    _StatColLabel1 = symget('SLabel1');
 _StatCol1 = &StatCol1;
  %end;

  %if %length(&StatCol2) ne 0 %then %do;
    _StatColLabel2 = symget('SLabel2');
 _StatCol2 = &StatCol2;
  %end;

  %if %length(&StatCol3) ne 0 %then %do;
    _StatColLabel3 = symget('SLabel3');
 _StatCol3 = &StatCol3;
  %end;

  %if %length(&StatCol4) ne 0 %then %do;
    _StatColLabel4 = symget('SLabel4');
 _StatCol4 = &StatCol4;
  %end;
 
  run;

/*--Reverse the order to avoid putting axis reverse--*/
proc sort data=_forest out=_forest;
  by descending _ObsId;
  run;

/*--Add sequence numbers to each observation--*/
data _forest;
  set _forest;
  studyvalue=_n_;
run;

/*--Output values and formatted strings to data set--*/
data _forestFormat;
  set _forest end=last;
  keep label start end fmtname type hlo;
  retain fmtname '_Study' type 'n';
  label=&Study;
  start=studyvalue;
  end=studyvalue;
  output;
  if last then do;
    hlo='O';
 label='Other';
 output;
  end;
  run;

/*--Create Format from data set--*/
proc format library=work cntlin=_forestFormat;
  run;

/*--Apply format to study values--*/
/*--Compute width of box proportional to weight in log scale--*/
data _forest;
  format studyvalue _study.;
  set _forest;
  %let Bands=%upcase(&Bands);
  %if &Bands eq YES or &Bands eq Y %then %do;
    if mod(studyvalue, 2) = 0 then _StudyRef=StudyValue;
  %end;
  run;

/*--Compute top and bottom offsets--*/
data _null_;
  pct=&Ratio/nobs;
  thk=pct* 0.9 *100;
  call symputx("pct", pct);
  call symputx("pct2", 2*pct);
  call symputx("RefThickness", thk);
  call symputx("count", nobs);
  set _forest nobs=nobs;
run;

/*title;*/
/*options nodate nonumber;*/

/*--Define GTL template for graph--*/
proc template;
  define statgraph ForestMacro;
    begingraph / designwidth=&Width designheight=&GraphHeight;
   entrytitle "&Title";
   entryfootnote halign=left "&FootNote";
   %if %length(&title2) ne 0 %then %do;
     entrytitle "&title2" / textattrs=graphLabelText;
      %end;
   layout lattice / columns=3 columnweights=(&LabelColWidth &GraphColWidth &StatColWidth) columngutter=0
                       rowdatarange=union;
        /*--Column # 1 contains the Study Labels using Secondary Y axis--*/
        layout overlay / walldisplay=none x2axisopts=(display=none)
                         yaxisopts=(linearopts=(tickvaluesequence=(start=1 end=&count increment=1))
                                    offsetmin=&pct2 offsetmax=&pct display=none
                                    displaysecondary=(tickvalues &border));
    scatterplot y=studyvalue x=_1 / yaxis=Y xaxis=X2 markerattrs=(size=0) includemissinggroup=true;
    scatterplot y=studyvalue x=_1 / yaxis=Y xaxis=X2 markerattrs=(size=0) includemissinggroup=true;
  endlayout;
     /*--Column # 2 contains the graph--*/
        layout overlay / &GraphWallDisplay border=false
                       
                         yaxisopts=(linearopts=(tickvaluesequence=(start=1 end=&count increment=1))
                                    offsetmin=&pct2 offsetmax=&pct display=none);

    /*--Draw alternating bands using referenceline--*/
          %if &Bands eq YES or &Bands eq Y %then %do;
            referenceline y=_StudyRef / lineattrs=(thickness=&RefThickness.PCT) datatransparency=0.9;
    %end;

          /*--Draw Markers for SubGroup and Overall values--*/
    %if %length(&Group) ne 0 %then %do;
            scatterplot y=studyvalue x=_2 / markerattrs=(symbol=diamondfilled size=10) group=_grp 
                      includemissinggroup=true index=_grp;
    %end;
          /*--Draw OddsRatio and Limits for Study Values--*/
          scatterplot y=studyvalue x=_1 / xerrorupper=_ucl1 xerrorlower=_lcl1 
                  markerattrs=graphdata1(symbol=squarefilled size=&MarkerSize);

    /*--Draw box representing the weight of the study--*/
          vectorplot y=studyvalue x=_x2 xorigin=_x1 yorigin=studyvalue / lineattrs=GraphData1(thickness=8) 
                 arrowheads=false;

          /*--Draw Reference lines and labels--*/
      
    referenceline x=0;

    entry halign=left  "&Label1" / valign=bottom;
    entry halign=right "&Label2" / valign=bottom;
  endlayout;

     /*--Column # 2 contains the statistics data--*/
        layout overlay / &StatWallDisplay border=false
                         x2axisopts=(display=(tickvalues &border) displaysecondary=(line))
                         yaxisopts=(linearopts=(tickvaluesequence=(start=1 end=&count increment=1))
                                    offsetmin=&pct2 offsetmax=&pct 
                                    display=none &DisplaySecondary.);
          /*--Draw alternating bands using referenceline--*/
          %if &Bands eq YES %then %do;
            referenceline y=_StudyRef / lineattrs=(thickness=&RefThickness.PCT) datatransparency=0.9;
    %end;

          /*--Draw standard statistics columns--*/
          %if &DisplayCols eq YES or &DisplayCols eq Y %then %do;
            scatterplot y=studyvalue x=_OddsRatioLabel / markercharacter=&OddsRatio xaxis=x2;
   scatterplot y=studyvalue x=_LowerLabel / markercharacter=&LCL xaxis=x2;
   scatterplot y=studyvalue x=_UpperLabel / markercharacter=&UCL xaxis=x2;
   %if &Weight ne %then %do;
              scatterplot y=studyvalue x=_WeightLabel / markercharacter=_wt xaxis=x2;
   %end;
    %end;

    /*--Draw additional statistics columns--*/
          %if %length(&StatCol1) ne 0 %then %do;
            scatterplot y=studyvalue x=_StatColLabel1 / markercharacter=&StatCol1 xaxis=x2;
    %end;

    %if %length(&StatCol2) ne 0 %then %do;
            scatterplot y=studyvalue x=_StatColLabel2 / markercharacter=&StatCol2 xaxis=x2;
    %end;

          %if %length(&StatCol3) ne 0 %then %do;
            scatterplot y=studyvalue x=_StatColLabel3 / markercharacter=&StatCol3 xaxis=x2;
    %end;

          %if %length(&StatCol4) ne 0 %then %do;
            scatterplot y=studyvalue x=_StatColLabel4 / markercharacter=&StatCol4 xaxis=x2;
    %end;

  endlayout;
   endlayout;
 endgraph;
  end;
run;

proc sgrender data=_forest template=ForestMacro description='Forest Plot';
  run;

%finished:

title;
FootNote;
%mend ForestMacro;











Read More

2017年10月27日 星期五

分組

%macro my_cn2(data =, y =, x =, c_x =, l_r=, z =);

ods trace on / listing; 
proc contents data =  &data.;
 ods output Variables = contents_;
run;
ods trace off;

data contents_;
 set contents_;
 where upcase(Variable) = "%upcase(&y.)";
 call symputx("type", type);
run;

%if "&type." = "Num" %then %do;
 data temp_data;
  set &data.;
  y_ = put(&y. ,z&z..);
 run;
%end; %else %do;
  data temp_data;
   set &data.;
   y_ = &y.;
  run;
%end;


proc sql;
 create table class_ as
  select  distinct  y_
  from temp_data;
quit;

%let data_class = class_;
%let open_d = %sysfunc (open(&data_class.));
%let row_n = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));

proc sql noprint;
 select  y_
  into :y1 - :y&row_n.
  from class_;
quit;

data _null_;
 comb_ = comb(&row_n.,2);
 call symputx("comb_", comb_);
run;

proc iml;
 allcomb_ = allcomb(&row_n., 2);
 create allcomb var {allcomb_}; 
 append;       
 close allcomb;
quit;

%let data__ = allcomb;
%let open_d = %sysfunc (open(&data__.));
%let row_nn = %sysfunc(attrn(&open_d., nobs));
%let close_d = %sysfunc(close(&open_d.));

proc sql noprint;
 select  allcomb_
  into :r2n_1 - :r2n_&row_nn.
  from allcomb;
quit;



%do comb_i = 1 %to &row_nn. %by 2;
%let comb_i_ = %sysevalf(&comb_i. +1);
%let comb__ = 1;/*66*/
/*%put &comb_i. &comb_i_.;*/
/*data comb_&comb__.;*/
data _&&&&y&&r2n_&comb_i...vs_&&&&y&&r2n_&comb_i_...;
 set temp_data;
  where y_= "&&&&y&&r2n_&comb_i..." or y_ = "&&&&y&&r2n_&comb_i_...";
run;


 %if "&l_r." = "t" %then %do; 
  proc logistic data = _&&&&y&&r2n_&comb_i...vs_&&&&y&&r2n_&comb_i_... plots=all;
   class &c_x.;
   model y_ = &x. &c_x.;
   output out = o__&&&&y&&r2n_&comb_i...vs_&&&&y&&r2n_&comb_i_... PREDICTED= p PREDPROBS= p2;
  quit;
 %end;
 %let comb__ = %sysevalf(&comb__. +1);/*66*/
%end;

%mend;
%my_cn2(data = mysas.unk, y = final, z=1)


data 資料集
y 類別變數
z 類別變數為數值型態時且類別總類小於10 z=1 小於100 z=2 ....  文字形態時可不輸入


4類別分組後結果

Read More

2017年7月4日 星期二

sas macro logistic regression real adaboost

data comb_data;
length comb_x $ 100;
input comb_x $ @@;
/*輸入所有x變數名稱,方便隨機抽取,放入不同模型裡*/
datalines;
area da hfs i0 pa500
;
run;

%macro logistic_real_adabost(data = ,
y = ,
        y_testing=,
x =  ,
ab = ,
training_obs=,
class_x= ,
n_x=);
/*ab為要有幾個模型*/
/*n_x為模型內最多有幾個變數*/
data ck_pv_out1;
set &data.;
w1 = 1/&training_obs.;
run;



%do lrab = 1 %to &ab.;
%let lrab_1 = %sysevalf(&lrab. + 1);


data _null_;
u=rand('uniform');   
max=&n_x.;   
var_n=ceil(max*u);
call symputx("var_n", var_n);
run;
%put &var_n.;
proc surveyselect data= comb_data  n=&var_n. noprint out=comb_out;
run;

proc sql noprint;
select comb_x
into :comb_xx  separated by " "
from comb_out;
quit;

%put &comb_xx.;



proc logistic data = ck_pv_out&lrab. desc noprint;
class      &class_x.    ;
model &y. = &comb_xx.;
output out = logistic_out
PREDICTED= p&lrab.;
weight w&lrab.;
run;





data ck_pv;
set logistic_out;
where &y. ^=.;
if p >0.5 then p_h = 1;
else if p < 0.5 then p_h = 0;
else p_h =.;
if &y. ^= p_h then error = 1;
else error = 0;
run;

proc iml;
use ck_pv;
read all var _all_;
close ck_pv;



fm = (1/2) # log((p&lrab.)/(1-p&lrab.));

temp = exp((-&y.)#fm);

w_&lrab_1. = w&lrab. # temp;

w&lrab_1. = w_&lrab_1./sum(w_&lrab_1.);

g_x&lrab. = fm;

create ck_pv_out&lrab_1. var {&y. &x. w&lrab_1.};
append;     
close ck_pv_out&lrab_1.;
create ck_pv_out_gx&lrab.  var {g_x&lrab.};
append;     
close ck_pv_out_gx&lrab.;
quit;

%end;


data all;
merge  &data.
ck_pv_out_gx1-ck_pv_out_gx&ab.;
x = sum(of g_x1-g_x&ab.);
if x >0 then tf = 1;
else tf = -1;
run;


data training testing;
set all;
if Selected =0 then output training;
else output testing;
run;

proc freq data = training;
table &y. * tf / nocol nopercent ;
title "整體學習 結果 training";
run;title;

proc freq data = testing;
table &y_testing. * tf/ nocol nopercent ;
title "整體學習 結果 testing";
run;title;

%do delete_ = 1 %to &ab.;
proc datasets noprint;
delete ck_pv_out&delete_.  ck_pv_out_gx&delete_. ;
quit;
%end;
%mend;
/*
以real_adabost
%logistic_real_adabost(data = mysas.breast_2,
y = Classcar,
                                                        y_testing=Classcar_test,
x = area da hfs i0 pa500 p max_ip a_da,
ab = 10,
training_obs=55,
class_x= ,
n_x=4)

=================================================


以逐步迴歸
proc logistic data = mysas.breast_2 noprint;
model Classcar =  area da hfs i0 pa500 p max_ip a_da
/ selection = stepwise;
output out = logistic_out2   PREDICTED= p;
quit;

data logistic_out2;
set logistic_out2;
if p2 >= 0.5 then p2=1;
else p2=-1;
run;
data training2 testing2;
set logistic_out2;
if Selected =0 then output training2;
else output testing2;
run;
proc freq data = training2;
table Classcar * p2 / nocol nopercent ;
title "逐步迴歸 結果 training";
run;
proc freq data = testing2;
table Classcar_test * p2/ nocol nopercent ;
title "逐步迴歸 結果 testing";
run;title;

*/

data set: https://archive.ics.uci.edu/ml/datasets/Breast+Tissue
Y:Classcar (惡性腫瘤)
I0 Impedivity (ohm) at zero frequency
PA500 phase angle at 500 KHz
HFS high-frequency slope of phase angle
DA impedance distance between spectral ends
AREA area under spectrum
A/DA area normalized by DA
MAX IP maximum of the spectrum
DR distance between I0 and real part of the maximum frequency point
P length of the spectral curve





Read More

2017年3月9日 星期四

2016年10月5日 星期三

統計學習(statistical learning)


  • 作業一(線性迴歸 複習)





===============================================

Model Selection

- R square
- adjusted r squared

- AIC SBC
書名:医学研究中的logistic回归分析及SAS实现和医学案例统计
- CP

書名:introduction to linear regression analysis
書名:introduction to linear regression analysis






























       一般來說,小CP值的模型會比較好。

- Forward
       先建立只有一個解釋變數的模型,這個變數是所有變數中最顯著的,在放入剩餘變數中最顯著的變數,直到剩餘變數沒有顯著的。

- Backward
       先建立一個包含所有解釋變數的模型,在拿去最不顯著的變數,直到模型內變數都為顯著。

- Stepwise
       先建立只有一個解釋變數的模型,這個變數是所有變數中最顯著的,在放入剩餘變數中最顯著的變數。加入新變數時,如前一個變數變為不顯著則可替除。

Comparison with the Maximal Model

- deviance

 
       判斷 logistic regression model 擬合程度。

書名:introduction to linear regression analysis

書名:introduction to linear regression analysis
       飽和模型為估計值與觀測值完全相等,為不合理的一件事,但可以拿現有模型與飽和模型進行比較,來評估現有模型擬合數據的充分程度。

       deviance值越大,表示現有模型與飽和模型差別越大,擬合效果差。
       deviance服從卡方分配。






Diagnosis Methods
- Residual Analysis
書名:introduction to linear regression analysis
RESIDUALS


STANDARDIZED RESIDUALS

大於3可能有問題


 PRESS RESIDUALS




很大可能有問題

大於2可能有問題


殘差圖
        
           如果殘差是獨立同分布,且服從N(0,sigma^2),那麼各點應該隨機的散佈如左上。
           
           右上圖,殘差隨著解釋變數呈現二次形狀,可以嘗試在迴歸模型上添加二次項的變數。

           左下圖,說明了殘差的變異數不是相等的,隨著解釋變數增大,變異數也增大。這種情況下可以嘗試將Y做轉換。

           右下圖,說明了殘差不是獨立的。

書名:SAS深入解析


Leverage and Influence Analysis


COOK'S D

Di > 1 可能為影響點 或 Di > p/n 可能為影響點。 

書名:introduction to linear regression analysis


書名:introduction to linear regression analysis


DFFITS     DFBETAS

|DFBETASj,i| > 2 / sqrt(n) 可能為影響點。
|DFFITSi| > 2*sqrt(p/n) 可能為影響點。

書名:introduction to linear regression analysis
書名:introduction to linear regression analysis









covratio

COVRATIOi > 1+3p/n 或 COVRATIOi  < 1-3p/n 可能為影響點。   
書名:introduction to linear regression analysis























- Check for multicollinearity

r12表示x1與x2的相關係數,如趨近到1,則C會趨近到無限大。
var(b-hat) = C * MSres。

書名:introduction to linear regression analysis

書名:introduction to linear regression analysis


迴歸sas程式碼點此








=========================================================================
  • 作業二(Wine Quality)



以直方圖與散佈圖顯示資料
macro程式碼點此
%mynormalcorr(mydata = red, y = quality,x = fixed_acidity
   volatile_acidity
    citric_acid
    residual_sugar
    chlorides
    free_sulfur_dioxide
    total_sulfur_dioxide
    density
    pH
    sulphates
    alcohol)
雖然response為1到10的level,但並非每個level都有觀測值,所以無法進行multinomial logistic regression,只好當成連續變數。


















變數間應該不會有共線性問題。




%myreg(data = red, y = Quality, x =  fixed_acidity
   volatile_acidity
    citric_acid
    residual_sugar
    chlorides
    free_sulfur_dioxide
    total_sulfur_dioxide
    density
    pH
    sulphates
    alcohol,
 option = selection = stepwise)

















不管變數如何選擇,都會出現這個情形,可能是我把response當成連續的關係,但有的level沒有觀測值也不能做multinomial logistic regression
或者response要進行轉換吧。






我使用stepwise方法進行變數選擇,但R平方只有0.35。


也有使用過其他方法但R平方都在0.2到0.3。









變數都為顯著,也沒共線性問題。

chlorides   free_sulfur_dioxide   total_sulfur_dioxide 可以看出這三張圖是有問題的,所以我決定拿掉。






但R平方還是一樣低








而且有非常多的觀測值可能為影響點。


我先以DFFITS為指標有超出的,以及outlier直接刪除試試看。






data test2_2;
	merge test2 outlier;
	if 1 <= _n_ <=3 then delete;
run;

data red2;
	merge red test2_2;
	if y3 ^= . then delete;
	if RsByLevGroup = "Outlier" then delete;
	keep Quality volatile_acidity pH sulphates alcohol;
run;



%myreg(data = red2, y = Quality, x =  volatile_acidity pH sulphates alcohol)








R平方變成0.44,還是很低。






Read More