** This code creates 1000 bootstrapped samples from an
artificial pseudo-set of data from the EPIC gender data set.
From each of the 1000 bootstrapped samples we will calculate the median
value of the therapeutic intervention scoring system (TISS-28) for
males and females and then take their difference. We will
subsequently calculate the 2.5th and 97.5th percentiles of those
differences as a 95 percent confidence interval for the gender-based
median difference. ;
** The main contribution here is to show that bootstrapped datasets
are easily created using proc surveyselect as shown immediately below,
where the input dataset is named and then random sampling with
replacement (method=urs, unrestricted random sampling) is invoked to
create the bootstrapped datasets, which are stored together as one big
dataset called BootSets;
* Draw the bootstrap replicates into one stacked dataset, BootSets.
  NOPRINT suppresses the procedure output in every open ODS destination
  and, unlike wrapping the step in ODS LISTING CLOSE / ODS LISTING,
  leaves the state of the LISTING destination untouched.
  NOTE(review): n=100 presumably equals the number of records in
  SimpBootDataMurphy - confirm, or consider samprate=1 so that each
  replicate always matches the size of the input dataset;
proc surveyselect data=SimpBootDataMurphy
    noprint
    method=urs    /* unrestricted random sampling, i.e. with replacement */
    n=100         /* records drawn per replicate */
    rep=1000      /* number of bootstrap replicates */
    outhits       /* write one record per hit; without it a record drawn
                     several times appears once with a NumberHits count */
    seed=041561   /* fixed seed so the resampling is reproducible */
    out=BootSets;
run;
* Order the stacked resamples by Replicate, the variable that
  identifies which bootstrap sample each record belongs to;
proc sort data=BootSets;
    by replicate;
run;

* Split the resamples by gender in a single pass over BootSets.
  Records whose male value is neither 1 nor 0 are written to
  neither dataset;
data BootMaleSubGroups BootFemaleSubGroups;
    set BootSets;
    if male = 1 then output BootMaleSubGroups;
    else if male = 0 then output BootFemaleSubGroups;
run;
* Median TISS-28 of the male records within each replicate.
  NOPRINT means nothing is displayed - the medians are only saved
  to MaleMedians, one row per replicate;
title 'Medians from Male Subgroups of 1000 Resamplings ';
proc means data=BootMaleSubGroups noprint;
    by replicate;
    var tiss28;
    output out=MaleMedians median=MaleMedian;
run;

* Same computation for the female records, saved to FemaleMedians;
title 'Medians from Female Subgroups of 1000 Resamplings ';
proc means data=BootFemaleSubGroups noprint;
    by replicate;
    var tiss28;
    output out=FemaleMedians median=FemaleMedian;
run;
* Pair the male and female medians of each replicate and take
  their difference (male minus female). A replicate present in only
  one of the two inputs gets a missing MedianDiff;
data AllMedians;
    merge MaleMedians
          FemaleMedians;
    by replicate;
    MedianDiff = MaleMedian - FemaleMedian;
run;
* Summarize the bootstrap median differences. Because the OUTPUT
  statement names no statistic keywords, MedDiffStats receives the
  default statistics (N, MIN, MAX, MEAN, STD), one row per statistic;
title 'Mean Estimate of Median Differences (Male - Female)';
proc means data=AllMedians;
    var MedianDiff;
    output out=MedDiffStats;
run;

* Show the saved summary statistics;
proc print data=MedDiffStats;
run;
* Percentile-method bootstrap confidence interval: the 2.5th and
  97.5th percentiles of the median differences are saved to the
  dataset percentiles, in variables named with the prefix percent;
title 'Bootstrap 95% CI for (Male Median - Female Median)';
proc univariate data=AllMedians;
    var MedianDiff;
    output out=percentiles
           pctlpre=percent
           pctlpts=2.5, 97.5;
run;

* Show the two confidence limits;
proc print data=percentiles;
run;
* Histogram of the bootstrap distribution of the median difference.
  NOPRINT suppresses the tabular output so only the plot is produced;
title 'Histogram of MaleMedian minus FemaleMedian';
title2 'from 1000 Bootstrap Samples';
proc univariate data=AllMedians noprint;
    var MedianDiff;
    histogram MedianDiff / cfill  = blue
                           cframe = ligr;
run;
** Hope this saves you as much time as it has me! ;