/* 
  Macro: %mi_combine
  Author: Paul von Hippel
  Version: 1.0
  Date: July 19, 2017
  Summary: This macro combines the multiple point estimates and standard errors produced by
            analyzing multiply imputed (MI) datasets as though they were complete.
           It outputs a single MI point estimate and SE estimate and confidence interval,
            as well as an estimate and confidence interval for the fraction of missing information.
           It estimates the number of imputations that would be needed to produce an SE 
            estimate that is replicable according to the criterion specified by the argument
            target_sd_se.
           This macro can be used to implement a two-stage procedure for selecting an appropriate number of imputations,
            as described in my article "How many imputations do you need?"
           An implementation of the two-step procedure that uses this macro is available
            in the file  
 */
  

%macro mi_combine (inests=, /* Input dataset, which contains M point estimates and SEs 
                               produced by analyzing multiply imputed (MI) datasets 
                               as though they were complete. */
                   outests=mi_ests, /* Output dataset, containing
                                        a single MI point estimate, SE estimate, and confidence interval (CI),
                                        an estimate and CI for the fraction of missing information              
                                        a recommendation for the number of imputations that would be needed 
                                         to produce an SE estimate that is replicable 
                                         according to the criterion specified by the argument target_sd_se. */
                   est=, /* Column in the input dataset which contains point estimates. */
                   se=, /* Column in the input dataset which contains SE estimates. */
                   label=, /* Column in the input dataset which indicates what parameter the 
                               point estimate and SE estimate refer to 
                               -- e.g., the intercept, and slope of X1 and X2 in a regression model. */
                   target_sd_se=.01, /* A target for the SD of an SE estimate across all possible
                                         sets of M imputations. */
                   df_comp=100000, /* The degrees of freedom that the estimates would have 
                                       if the data were complete 
                                       -- e.g., df_comp=n-1 for a simple linear regression.
                                      This is only necessary for small datasets and is used 
                                       to calculate the small-sample MI df formula of Barnard & Rubin (1999).
                                    */
                   confidence=.95, /* Confidence level for all confidence intervals in the output */
                   print=1, /* 1 if you want the results printed to the Output or Results window.
                               0 if you don't. */
                   df_min=3 /* Puts a lower bound of 3 on the estimated degrees of freedom. 
                                See von Hippel (2015) for justification. */
);
/* Steps performed:
     1. Copy the input into work.mi_input, adding the squared SE and df_comp.
     2. Use PROC MEANS, by parameter label, to get the mean, variance and count
        of the point estimates and the mean of the squared SEs.
     3. Combine these into the MI point estimate, SE, df, t, p and CI.
     4. Add a CI for the fraction of missing information and the recommended
        number of imputations.
     5. Store the largest recommendation in the global macro variable
        recommended_imputations, and print the output dataset if print=1.
   Side effects: creates or overwrites work.mi_input and the dataset named by outests. */

/* Keep helper macro variables local so that a caller's variables of the same
   name are not overwritten. */
%local to_keep sql_opt;

data work.mi_input;
 set &inests;
 SESq = &se**2; /* Squared SE = estimated sampling variance within one imputed dataset */
 df_comp = &df_comp;
run;
proc sort data=work.mi_input;
  by &label;
run;

/* PROC MEANS must produce its Summary table for ODS OUTPUT to capture it, so
   NOPRINT cannot be used here. When print is not 1, the displayed output is
   suppressed with ODS EXCLUDE ALL instead; the ODS OUTPUT dataset is still
   created. */
%if &print ne 1 %then %do;
ods exclude all;
%end;
proc means data=work.mi_input mean var n;
 var &est SESq df_comp;
 by &label;
 ods output Summary=&outests;
run;
%if &print ne 1 %then %do;
ods exclude none; /* Restores the default of displaying all output objects */
%end;

data &outests;
 set &outests;
 Est = &est._mean;
 imputations = &est._n;
 within_var = SESq_Mean;
 between_var = (1+1/imputations) * &est._Var;
 total_var = within_var + between_var;
 SE = sqrt (total_var);
 t = Est / SE;
 frac_missing = between_var / total_var; /* Can be 0 or 1 in rare cases */
 df_comp = df_comp_mean;
 df_obs = (df_comp+1)/(df_comp+3) * (1-frac_missing) * df_comp;
 if frac_missing=0 then do;
  df_rubin=5000;
  df=df_obs;
 end;
 else do;
  df_rubin = (imputations - 1) / frac_missing**2; /* Rubin (1987). OK for large samples. */
  df = (1/df_rubin + 1/df_obs)**-1; /* Barnard & Rubin (1999). Better, esp. in small samples */
  df = max (&df_min, df); /* Lower bound. von Hippel (2015) */
 end;
 p = 2 * (1-probt(abs(t),df));
 ci_half_width = se * tinv (1-(1-&confidence)/2, df);
 lcl = est - ci_half_width;
 ucl = est + ci_half_width;
 confidence = &confidence;
 %let to_keep = &label imputations est se df confidence lcl ucl t p frac_missing;
 keep &to_keep /* df_comp */ ;
run;

/* RETAIN before SET only fixes the column order of the output dataset. */
data &outests;
 retain &to_keep;
 set &outests;
run;

data &outests;
 set &outests;
 /* The CI for the fraction of missing information is built on the logit scale
    and transformed back. The logit is undefined when frac_missing is 0 or 1
    (or missing); in that case the CI limits and the recommendation are left
    missing without evaluating log(0) or dividing by zero. */
 if 0 < frac_missing < 1 then
  logit_frac_missing = log (frac_missing / (1-frac_missing));
 else logit_frac_missing = .;
 logit_SE = sqrt (2 / imputations);
 z = QUANTILE('NORMAL', 1 - (1-&confidence)/2);
 if not missing (logit_frac_missing) then do;
  logit_lcl = logit_frac_missing - z * logit_SE;
  logit_ucl = logit_frac_missing + z * logit_SE;
  frac_missing_lcl = logistic (logit_lcl);
  frac_missing_ucl = logistic (logit_ucl);
 end;
 else call missing (logit_lcl, logit_ucl, frac_missing_lcl, frac_missing_ucl);
 format frac_missing: 4.2;
 format df 6.0;
 target_SD_SE = &target_SD_se;
 target_cv_se = &target_SD_se / se; /* Target SD of the SE, relative to the SE itself */
 /* Uses the upper confidence limit of the fraction of missing information. */
 if not missing (frac_missing_ucl) then
  recommended_imputations = ceil (1 + (1/2) * (frac_missing_ucl / target_cv_se)**2);
 drop logit_: z;
run;

/* Return the largest recommendation across parameters in a global macro variable. */
%global recommended_imputations;
%if &print ne 1 %then %let sql_opt = noprint; /* Do not display the query result unless printing was requested */
proc sql &sql_opt;
  select max(recommended_imputations) into :recommended_imputations
  from &outests;
quit;
/* SELECT ... INTO pads the number with leading blanks; reassigning it with %LET strips them. */
%let recommended_imputations = &recommended_imputations;

%if &print=1 %then %do;
proc print data=&outests;
run;
%end;
%mend;

/*References*/
/**/
/*Barnard, J., & Rubin, D. B. (1999). Small-sample degrees of freedom with multiple imputation. */
/*                     Biometrika, 86(4), 948-955. https://doi.org/10.1093/biomet/86.4.948*/
/*von Hippel, P.T. (2015). New confidence intervals and bias calculations show that */
/*    maximum likelihood can beat multiple imputation in small samples. */
/*    Structural Equation Modeling, 23(3): 423-437. https://arxiv.org/abs/1307.5875.*/
