# This set of routines can do a simple CA and present results in various types of plots. # We have code for graph plots, joint plots, transformation plots, regression plots and # benzecri distance plots. # # version 1.2 2005-01-27 # version 1.3 2005-11-25 (benzecri plot, outfile, chi-square decomposition) # version 1.4 2005-11-25 (added canonical correspondence analysis) # version 1.5 2006-01-15 (added reconstitution of order zero) # # ca() is either simple correspondence analysis or canonical correspondence analysis, i.e. # correspondence analysis with linear constraints on row and/or column scores ca<-function(tab, colcov=FALSE, rowcov=FALSE, pdim=2, scale="BEN", save=TRUE, regresplot=FALSE, regresjoint=TRUE, regresdim=1, transplot=FALSE, transjoint=FALSE, transcol1="red", transcol2="green", transdim=1, rowplot=FALSE, rowcol="black", rowlab=TRUE, rowdims=c(1,2), colplot=FALSE, colcol="black", collab=TRUE, coldims=c(1,2), jointplot=FALSE, jointlines=1, jointdims=c(1,2), jointcol1="red", jointcol2="green", jointlab1=TRUE, jointlab2=TRUE, benzplot=FALSE, outwrite=TRUE) { name<-deparse(substitute(tab)); n<-dim(tab)[1]; m<-dim(tab)[2] tau<-sum(tab); tab<-as.matrix(tab); r<-rowSums(tab); c<-colSums(tab); qdim<-pdim+1 r<-ifelse(r==0,1,r); c<-ifelse(c==0,1,c); ROW<-FALSE; COL<-FALSE z<-tab/sqrt(outer(r,c)); chisq<-tau*(sum(z^2)-1) if (is.matrix(rowcov)) { xr<-cbind(rep(1,n),rowcov) cx<-crossprod(xr,r*xr); nr<-dim(xr)[2]; or<-symSqrt(cx,inv=TRUE); ROW<-TRUE } if (is.matrix(colcov)) { yr<-cbind(rep(1,m),colcov) cy<-crossprod(yr,c*yr); mr<-dim(yr)[2]; oc<-symSqrt(cy,inv=TRUE); COL<-TRUE } if (ROW) z<-or%*%crossprod(xr,tab) else z<-tab/sqrt(r) if (COL) z<-z%*%yr%*%oc else z<-z/outer(rep(1,dim(z)[1]),sqrt(c)) sv<-svd(z,nu=qdim,nv=qdim); sval<-tau*((sv$d[-1])^2) l<-(sv$d)[2:qdim]; if (ROW) x<-(xr%*%or%*%(sv$u))[,-1] else x<-((sv$u)/sqrt(r))[,-1] if (COL) y<-(yr%*%oc%*%(sv$v))[,-1] else y<-((sv$v)/sqrt(c))[,-1] if (scale=="YCX") y<-y*outer(rep(1,m),l) if (scale=="XCY") x<-x*outer(rep(1,n),l) if (scale=="GOO") { y<-y*outer(rep(1,m),sqrt(l)) x<-x*outer(rep(1,n),sqrt(l)) } if (scale=="BEN") { y<-y*outer(rep(1,m),l) x<-x*outer(rep(1,n),l) } if (regresplot) regres_plot(tab,name,x[,regresdim],y[,regresdim],regresjoint) if (transplot) trans_plot(name,x[,transdim],y[,transdim],transjoint,transcol1,transcol2) if (rowplot) row_plot(name,x[,rowdims],if (rowlab) labels(tab)[[1]] else rep("*",n),rowcol) if (colplot) col_plot(name,y[,coldims],if (collab) labels(tab)[[2]] else rep("*",m),colcol) if (jointplot) joint_plot(tab,name,x[,jointdims],y[,jointdims],if (jointlab1) labels(tab)[[1]] else rep("*",n), if (jointlab2) labels(tab)[[2]] else rep("*",m), jointlines, jointcol1, jointcol2) if (benzplot) { cc<-z%*%t(z); if (ROW) v<-xr%*%or%*%cc%*%or%*%t(xr) else v<-cc/sqrt(outer(r,r)) benzplot_row(v,tau,name,x) cc<-crossprod(z); if (COL) v<-yr%*%oc%*%cc%*%oc%*%t(yr) else v<-cc/sqrt(outer(c,c)) benzplot_col(v,tau,name,y) } if (outwrite) { outfile<-file(paste(name,"_out",".txt",sep=""),"w") write_score_matrix(x,r,"Row Scores",labels(tab)[[1]],outfile) write_score_matrix(y,c,"Column Scores",labels(tab)[[2]],outfile) write_chisq_table(sval,chisq,outfile) close(outfile) } if (save) list(x=x,y=y,l=l) } # rlines() makes a regression plot for a table and a set of scores. rlines<-function(tab,x,y) { tau<-sum(tab); pr<-tab/tau; r<-rowSums(pr); c<-colSums(pr) n<-dim(tab)[1]; m<-dim(tab)[2] xave<-as.vector(as.matrix(pr)%*%y)/r yave<-as.vector(x%*%as.matrix(pr))/c z<-c(x,y); plot(z,z,type="n",xlab="x",ylab="y") points(x,xave,type="l") points(yave,y,type="l") abline(v=x); abline(h=y) for (i in 1:n) text(rep(x[i],m),y,as.character(tab[i,]),cex=.8) } # regres_plot() makes a before and after regression plot. regres_plot<-function(tab,name,x,y,joint) { n<-length(x); m<-length(y) if (joint) pdf(paste(name,"_regres",".pdf",sep="")) else pdf(paste(name,"_before",".pdf",sep="")) rlines(tab,1:n,1:m) if (!joint) { dev.off() pdf(paste(name,"_after",".pdf",sep="")) } rlines(tab,x,y) dev.off() } # trans_plot() makes row and column or row/column transformation plots. trans_plot<-function(name,x,y,row,col,joint,col1,col2) { n<-length(x); m<-length(y) pdf(paste(name,"_trans",".pdf",sep="")) if (joint) { plot(c(1:n,1:n),c(x,y),type="n",xlab="number",ylab="score") lines(1:n,x,col=col1) } else plot(1:n,x,type="l",col=col1,xlab="number",ylab="score") points(1:n,x,pch=row) if (!joint) plot(1:m,y,type="l",col=col2,xlab="number",ylab="score") else lines(1:m,y,col=col2) points(1:m,y,pch=col) dev.off() } # row_plot() makes a labeled or unlabeled plot of the row scores row_plot<-function(name,x,rows,col) { n<-dim(x)[1] pdf(paste(name,"_rows",".pdf",sep="")) plot(x,type="n") text(x,rows,col=col) dev.off() } # col_plot makes a labeled or unlabeled plot of the column scores col_plot<-function(name,y,cols,col) { n<-dim(y)[1] pdf(paste(name,"_cols",".pdf",sep="")) plot(y,type="n") text(y,cols,col=col) dev.off() } # joint_plot() makes a labelled or unlabeled biplot, using one of four different scalings. # The graph plot can be drawn in, with line thickness indicating strength of pull. joint_plot<-function(tab,name,x,y,rows,cols, wlines,col1,col2){ n<-dim(tab)[1]; m<-dim(tab)[2] rg<-round(wlines*tab/max(tab)) pdf(paste(name,"_joint",".pdf",sep="")) plot(rbind(x,y),type="n") text(x,rows,col=col1) text(y,cols,col=col2) if (wlines > 0) for (i in 1:n) for (j in 1:m) if (rg[i,j] > 0) lines(c(x[i,1],y[j,1]),c(x[i,2],y[j,2]),lwd=rg[i,j]) dev.off() } benzplot_row<-function(v,nn,name,x) { w<-diag(v); do<-nn*(outer(w,w,"+")-2*v); dm<-max(do); dim<-dim(x)[2]; n<-dim(x)[1] c<-x%*%t(x); w<-diag(c); dr<-nn*(outer(w,w,"+")-2*c) pdf(paste(name,"_benz_row",".pdf",sep="")) plot(do,dr,xlim=c(0,dm),ylim=c(0,dm),xlab="observed",ylab="fitted", main=paste("Benzecri Distances Rows",name,"p =",as.character(dim),sep=" "), axes=FALSE) abline(0,1); abline(h=0); abline(v=0) for (i in 1:n) for (j in 1:i) lines(c(do[i,j],do[i,j]),c(0,dr[i,j])) dev.off() } benzplot_col<-function(v,nn,name,y) { w<-diag(v); do<-nn*(outer(w,w,"+")-2*v); dm<-max(do); dim<-dim(y)[2]; m<-dim(y)[1] c<-y%*%t(y); w<-diag(c); dr<-nn*(outer(w,w,"+")-2*c) pdf(paste(name,"_benz_col",".pdf",sep="")) plot(do,dr,xlim=c(0,dm),ylim=c(0,dm),xlab="observed",ylab="fitted", main=paste("Benzecri Distances Cols",name,"p =",as.character(dim),sep=" "), axes=FALSE) abline(0,1); abline(h=0); abline(v=0) for (i in 1:m) for (j in 1:i) lines(c(do[i,j],do[i,j]),c(0,dr[i,j])) dev.off() } write_score_matrix<-function(x,r,header,labs,ofile) { s<-NULL; sl<-35+10*dim(x)[2]; for (i in 1:sl) s<-paste(s,"*",sep="") u<-NULL; sl<-35+10*dim(x)[2]; for (i in 1:sl) u<-paste(u,"-",sep="") cat(formatC(s,format="s"),"\n",file=ofile) cat(header,"\n",file=ofile) cat(formatC(u,format="s"),"\n",file=ofile) for (k in 1:dim(x)[1]) cat(formatC(labs[k],format="s",width=10,flag="-"),formatC(r[k],width=5)," *** ", formatC(x[k,],digits=6,width=10,format="f")," *** ", formatC(sum(r[k]*x[k,]^2),digits=6,width=10,format="f"),"\n",sep="",file=ofile) cat(formatC(u,format="s"),"\n",file=ofile) cat(formatC(" ",format="s",width=15)," *** ", formatC(apply(r*(x^2),2,sum),digits=6,width=10,format="f")," *** ", formatC(sum(r*(x^2)),digits=6,width=10,format="f"),"\n",sep="",file=ofile) cat(formatC(s,format="s"),"\n\n\n",file=ofile) } write_chisq_table<-function(sval,cs,ofile) { s<-NULL; sl<-55; for (i in 1:sl) s<-paste(s,"*",sep="") u<-NULL; sl<-55; for (i in 1:sl) u<-paste(u,"-",sep="") prob<-sval/cs; pcum<-cumsum(prob) cat(formatC(s,format="s"),"\n",file=ofile) cat("Chi-square Decomposition\n",file=ofile) cat(formatC(u,format="s"),"\n",file=ofile) for (k in 1:length(sval)) cat(formatC("component",format="s",width=10,flag="-"),formatC(k,width=5)," *** ", formatC(sval[k],digits=6,width=15,format="f"), formatC(prob[k],digits=6,width=10,format="f"), formatC(pcum[k],digits=6,width=10,format="f"),"\n",sep="",file=ofile) cat(formatC(u,format="s"),"\n",file=ofile) cat(formatC("chisquare",format="s",width=15,flag="-")," *** ", formatC(cs,digits=6,width=15,format="f"),"\n",sep="",file=ofile) cat(formatC(s,format="s"),"\n\n\n",file=ofile) } symSqrt<-function(cc,inv=FALSE) { e<-eigen(cc); ev<-e$values; ind<-which(ev>0); lbd<-rep(0,length(ev)) if (inv) lbd[ind]<-1/sqrt(ev[ind]) else lbd[ind]<-sqrt(ev[ind]) kk<-e$vectors; return(kk%*%(lbd*t(kk))) } reconstitute<-function(tab,order=0,eps=1e-6,verbose=FALSE) { x0<-ifelse(is.na(tab),0,tab); xold<-x0; fold<--Inf repeat { rx<-rowSums(xold); cx<-colSums(xold); tx<-sum(xold) y<-outer(rx,cx)/tx xnew<-ifelse(is.na(tab),y,tab) fnew<-sum(x0*log(ifelse(y==0,1,y))) if (verbose) print(c(fold,fnew)) if ((fnew - fold) < eps) break() xold<-xnew; fold<-fnew } return(xnew) }