/* Start a CAS session and assign librefs for all available caslibs so that
   public.bigdatapcr can be referenced below. */
cas;
caslib _all_ assign;

/***********************************************************************/
/******      IMPLEMENT OLS REGRESSION AND SCORE TEST DATA      *********/
/***********************************************************************/
/* Fit OLS on the training rows (test=0). The model label "OLS" becomes
   _MODEL_ in the OUTEST= table, which PROC SCORE uses as the name of the
   predicted column. */
proc reg data=public.bigdatapcr(datalimit=all where=(test=0)) outest=OLS;
    OLS: model target = v1--v120 / vif;
run;
quit; /* PROC REG is interactive; end it explicitly */

/* Score the hold-out rows (test=1) with the OLS coefficients. */
proc score data=public.bigdatapcr(datalimit=all where=(test=1))
           score=OLS type=parms predict out=out;
    var v1--v120;
run;

/***********************************************************************/
/********     IMPLEMENT PRINCIPAL COMPONENTS REGRESSION       **********/
/***********************************************************************/

/***********************************************************************/
/************** USE CV TO DETERMINE OPTIMAL # COMPONENTS ***************/
/***********************************************************************/
/* Cross-validate on the same predictors (v1--v120) that the final model
   uses, so the chosen number of components applies to that model. */
proc pls data=public.bigdatapcr(datalimit=all where=(test=0))
         method=pcr cv=random(niter=2 seed=100816);
    model target = v1--v120;
run;

/***********************************************************************/
/************** CREATE MODEL USING DECIDED # COMPONENTS  ***************/
/***********************************************************************/
/* PROC PLS cannot write scoring parameters directly, so capture the
   ParameterEstimates ODS table (produced by the SOLUTION option) and
   reshape it for PROC SCORE in the next step. */
ods output parameterestimates=PCR_estimates;
proc pls data=public.bigdatapcr(datalimit=all where=(test=0))
         method=pcr nfac=15;
    model target = v1--v120 / solution;
run;

/***********************************************************************/
/* TRANSFORM THE ODS OUTPUT TABLE INTO A TABLE SUITABLE FOR PROC SCORE */
/***********************************************************************/
/* One row per parameter -> one column per parameter, named from RowName.
   Write to a separate table instead of overwriting the input in place. */
proc transpose data=PCR_estimates out=PCR_estimates_wide;
    id rowname;
    var target;
run;

/* Add the identifying columns PROC SCORE expects; _model_ names the
   predicted column, _depvar_ records the actual response variable. */
data PCR_estimates;
    set PCR_estimates_wide;
    _type_   = 'parms';
    _model_  = 'PCR';
    _depvar_ = 'target';
    drop _name_;
run;

/***********************************************************************/
/*********        SCORE THE TEST DATA WITH PCR MODEL     ***************/
/***********************************************************************/
/* Score the already OLS-scored hold-out table so predictions accumulate
   side by side in one data set. */
proc score data=out score=PCR_estimates type=parms predict out=out2;
    var v1--v120;
run;

/***********************************************************************/
/********     IMPLEMENT PARTIAL LEAST SQUARES REGRESSION      **********/
/***********************************************************************/

/***********************************************************************/
/************** USE CV TO DETERMINE OPTIMAL # COMPONENTS ***************/
/***********************************************************************/
/* Same predictor set as the fitted PLS model below. */
proc pls data=public.bigdatapcr(datalimit=all where=(test=0))
         method=pls cv=random(niter=2 seed=100816);
    model target = v1--v120;
run;

/***********************************************************************/
/************** CREATE MODEL USING DECIDED # COMPONENTS  ***************/
/***********************************************************************/
/* As with PCR: capture the ParameterEstimates ODS table so it can be
   reshaped into a PROC SCORE coefficient table. */
ods output parameterestimates=PLS_estimates;
proc pls data=public.bigdatapcr(datalimit=all where=(test=0))
         method=pls nfac=15;
    model target = v1--v120 / solution;
run;

/***********************************************************************/
/* TRANSFORM THE ODS OUTPUT TABLE INTO A TABLE SUITABLE FOR PROC SCORE */
/***********************************************************************/
/* PLS_estimates holds one row per parameter (RowName) with the estimate
   in the column named after the response. Pivot it to a single row with
   one column per parameter, writing to a separate table rather than
   overwriting the input in place. */
proc transpose data=PLS_estimates out=PLS_estimates_wide;
    id rowname;
    var target;
run;

/* Add the identifying columns PROC SCORE expects; _model_ names the
   predicted column, _depvar_ records the actual response variable. */
data PLS_estimates;
    set PLS_estimates_wide;
    _type_   = 'parms';
    _model_  = 'PLS';
    _depvar_ = 'target';
    drop _name_;
run;

/***********************************************************************/
/*********        SCORE THE TEST DATA WITH PLS MODEL     ***************/
/***********************************************************************/
/* out2 already carries the OLS and PCR predictions; out3 adds PLS. */
proc score data=out2 score=PLS_estimates type=parms predict out=out3;
    var v1--v120;
run;

/***********************************************************************/
/********   COMPARE THE THREE SOLUTIONS (OLS, PCR, PLS) ON THE   *******/
/********   HOLD-OUT DATA. PCR WITH 15 FACTORS MUCH BETTER THAN  *******/
/********   OLS WITH 120                                         *******/
/***********************************************************************/
/* Correlation of each model's prediction with the observed response. */
proc corr data=out3;
    var PCR ols PLS;
    with target;
run;

/* Root mean squared error of each model on the hold-out rows. */
proc sql;
    select sqrt(mean((OLS - target)**2)) as OLS,
           sqrt(mean((PCR - target)**2)) as PCR,
           sqrt(mean((PLS - target)**2)) as PLS
    from out3;
quit; /* PROC SQL ends with QUIT; RUN has no effect here */

/* Observed response against each prediction, with a fitted line. */
proc sgscatter data=out3;
    plot target*(pcr ols pls) / reg;
run;