/* Start a CAS session using the default session settings. */
cas;
/* Assign SAS librefs for the available caslibs so that CAS tables can be
   referenced as libref.table; presumably this is what makes the PUBLIC
   libref used by the steps below available (confirm on your server). */
caslib _all_ assign;

/* Build WORK.HITTERS from PUBLIC.HITTERS, dropping every row whose
   salary is the ordinary numeric missing value (.). */
DATA hitters;
    SET public.hitters;
    IF salary EQ . THEN DELETE;
RUN;
/********************************************************************/
/********************************************************************/
/*************  PRINCIPAL COMPONENT REGRESSION   ********************/
/********************************************************************/
/********************************************************************/
/*Determine the number of components to use via cross-validation.
Since using all components is equivalent to using all variables exactly 
as they are, it can be easy to overfit by using too many components.*/

/* Principal component regression of salary on the hitters predictors,
   with random-subset cross-validation (10 iterations, seed 13) so the
   number of components can be chosen from the CV results. */
PROC PLS
    DATA   = hitters
    METHOD = PCR
    CV     = RANDOM(NITER=10 SEED=13);
    CLASS league division;
    MODEL salary =
        years walks runs rbi putouts league hmrun
        hits errors division cwalks cruns crbi chmrun
        catbat atbat assists;
RUN;
/*Once we determine the number of factors via cross-validation, we'll
finalize our model using all the training data.*/

/* Final principal component regression fit on all of WORK.HITTERS with
   the number of components fixed at 4; SOLUTION requests the
   coefficients of the final predictive model. */
PROC PLS
    DATA   = hitters
    METHOD = PCR
    NFAC   = 4;
    CLASS league division;
    MODEL salary =
        years walks runs rbi putouts league hmrun
        hits errors division cwalks cruns crbi chmrun
        catbat atbat assists
        / SOLUTION;
RUN;

/********************************************************************/
/********************************************************************/
/*********************  PARTIAL LEAST SQUARES   *********************/
/********************************************************************/
/********************************************************************/
/*The SVD algorithm is the most accurate, but it may crawl on very large
datasets. For large datasets, use the default NIPALS algorithm instead.
If performance is still too slow, consider loosening the convergence
criterion (tolerance), e.g. epsilon=10^(-5), or decreasing maxiter.*/
/********************************************************************/
/* Partial least squares regression of salary using the SVD algorithm,
   with random-subset cross-validation (10 iterations, seed 13) so the
   number of factors can be chosen from the CV results. */
PROC PLS
    DATA   = hitters
    METHOD = PLS(ALGORITHM=SVD)
    CV     = RANDOM(NITER=10 SEED=13);
    CLASS league division;
    MODEL salary =
        years walks runs rbi putouts league hmrun
        hits errors division cwalks cruns crbi chmrun
        catbat atbat assists;
RUN;

/*Once we determine the number of factors via cross-validation, we'll
finalize our model using all the training data.*/

/* Final partial least squares fit (SVD algorithm) on all of WORK.HITTERS
   with the number of factors fixed at 2; SOLUTION requests the
   coefficients of the final predictive model. */
PROC PLS
    DATA   = hitters
    METHOD = PLS(ALGORITHM=SVD)
    NFAC   = 2;
    CLASS league division;
    MODEL salary =
        years walks runs rbi putouts league hmrun
        hits errors division cwalks cruns crbi chmrun
        catbat atbat assists
        / SOLUTION;
RUN;

/********************************************************************/
/********************************************************************/
/************************  VARIABLE CLUSTERING   ********************/
/********************************************************************/
/********************************************************************/


/* As you increase the MAXEIGEN= option, the clusters become coarser; */
/* in other words, the number of clusters found should decrease. */

/* Variable clustering of the numeric hitters predictors with
   MAXEIGEN=0.7. Note that this step reads PUBLIC.HITTERS directly
   rather than the filtered WORK.HITTERS copy. */
PROC VARCLUS
    DATA     = public.hitters
    MAXEIGEN = 0.7;
    VAR
        years walks runs rbi putouts hmrun
        hits errors cwalks cruns crbi chmrun
        catbat atbat assists;
RUN;

/********************************************************************/
/********************************************************************/
/*                     Try with the IPIP survey data:               */
/********************************************************************/
/********************************************************************/

/* Variable clustering of the IPIP survey items with MAXEIGEN=2.
   The double-dash list e1--o10 selects every variable positioned from
   e1 through o10 in the table's column order. */
PROC VARCLUS
    DATA     = public.ipip
    MAXEIGEN = 2;
    VAR e1--o10;
RUN;