########Required packages#####3 library(GenomicFeatures) library(rtracklayer) library(Rsamtools) library(GenomicRanges) library(edgeR) library(Mfuzz) library(DESeq) library(DESeq2) library(org.Hs.eg.db) library(goseq) library(SPIA) library(ggplot2) library(GO.db) library(RColorBrewer) library(igraph) library(RCytoscape) ############parallel computing########### library(foreach) library(parallel) library(multicore) library(doMC) ######set number of cores for parallel computing##################################################### ######a majority of the functions require at least 2 cores########################################### ######we typically run with 20 cores out of 24 and we have 48GB of RAM############################### ######and some functions are too memory intensive so we run less cores during those analyses######### registerDoMC(cores=20) ######################################################## ######create databases and objects need for analysis##### ######################################################## ###########create annotation file objects and databases###### human19<-makeTranscriptDbFromUCSC("hg19","knownGene") h19e<-exons(human19) h19eg<-exonsBy(human19, "gene") load("lincRNAbase") RNAs<-c(h19eg,lincRNA) #############make an object of gene lengths######## IDS<-names(RNAs) genelengths<-foreach(i=1:length(IDS),.combine='c') %dopar% { y<-RNAs[i] y<-y@unlistData y<-y@ranges w<-y@width sum(w) } names(genelengths)<-IDS #############make an object of exon lengths######## IDS.ex<-h19e@elementMetadata$exon_id y<-h19e@ranges exonlengths<-y@width names(exonlengths)<-IDS.ex ##############create an index for exon to gene identification ################### blue<-foreach(i=1:length(h19eg)) %dopar%{ x<-h19eg[[i]] x<-x@elementMetadata x<-x[,1]} names(blue)<-names(h19eg) exon.to.gene.ids<-foreach(i=1:length(blue), .combine='rbind') %dopar% { y<-cbind(blue[[i]],rep(names(blue[i]),length(blue[[i]])))} y<-exon.to.gene.ids[,2] names(y)<-exon.to.gene.ids[,1] exon.to.gene.ids<-y #######create disease database for R from http://diseases.jensenlab.org/Entity?textmining=10&type1=9606&type2=-26&id1=ENSP00000362296 ####### ########of gene disease associations with a z-score higer than or equal to 1.952790 which is equivalent to p<0.5###### x<-read.delim("disease.tsv", header=F) y<-which(x[,5]>=1.952790) x<-x[y,] y<-as.character(unique(x[,2])) xs<-x[,2] z<-as.character(x[,4]) w<-foreach(i=1:length(y)) %dopar% { m<-which(xs==y[i]) z[m]} names(w)<-y disease<-w z<-names(SYMBOLS) names(z)<-SYMBOLS names(disease)<-z[names(disease)] #######Create a marker list from zeng et al (Cell, 2012) paper out of allen brain institute########## #######gleaned from figure 6 and made into a .csv############### #########converted matrix into a list of list####### allen<-read.delim("allen Layer markers.csv",header=TRUE) rownames(allen)<-allen[,1] allen<-allen[,2:7] markers<-foreach(i=1:dim(allen)[1]) %dopar% { x<-allen[i,] y<-which(x>0) colnames(allen)[y]} names(markers)<-rownames(allen) #########make a disease to gene list############### disease.names<-goseq:::reversemapping(disease) ################################################# ######make co-ciatation network################ ################################################# ###get genes to pmids##### y<-as.list(org.Hs.egPMID) x<-intersect(names(y),names(h19eg)) PMID<-foreach(i=1:length(x)) %dopar% { xs<-unlist(y[which(x[i]==names(y))])} names(PMID)<-x ######remove genes without citations###### x<-unlist(PMID) y<-unique(x) genes.per.article<-foreach(i=1:length(y),.combine='c') %dopar% { length(which(y[i]==x))} names(genes.per.article)<-y z<-c(names(genes.per.article[which(genes.per.article<=1)]),names(genes.per.article[which(genes.per.article>1000)])) x<-foreach(i=1:length(PMID)) %dopar% { setdiff(PMID[[i]],z)} names(x)<-names(PMID) PMID.short<-x ##########make vector of papers per gene#################### PMID.len<-foreach(i=1:length(PMID.short),.combine='c') %dopar% { length(PMID.short[[i]])} names(PMID.len)<-names(PMID.short) y<-names(h19eg) x<-intersect(names(PMID.len),names(h19eg)) b<-setdiff(y,x) z<-(rep(0,length(b))) names(z)<-b PMID.len<-c(PMID.len,z) ######create vector of genes per paper####### x<-goseq:::reversemapping(PMID.short) paper.len<-foreach(i=1:length(x),.combine='c') %dopar% { length(x[[i]])} names(paper.len)<-names(x) ################################################## y<-names(h19eg) green<-names(paper.len) x<-paper.len/1000 weight<-1-x connections<-foreach(b=1:length(y),.combine='rbind') %dopar% { blue<-PMID.short[[y[b]]] x<-foreach(i=1:length(y),.combine='c') %do% { z<-intersect(blue,unlist(PMID.short[y[[i]]])) sum(weight[z]) } } rownames(connections)<-y colnames(connections)<-y z<-foreach(i=1:length(y),.combine='c') %dopar% { connections[i,i]} names(z)<-rownames(connections) b<-which(z>0.997) connect<-connections[b,b] for(b in 1:dim(connect)[1]) { connect[b,b]<-0} connect.gr<-graph.adjacency(connect,"undirected",weighted=TRUE,add.rownames=TRUE) ###################################### #####Functions defined by user######## ###################################### ######################################################### ##########functions for enrichment testing############### ######################################################### #########uses goseq package to find enrichment of diseases in a list of gene vectors (entrez ids) with an FDR<=0.05##### #######needs an annotation object produced by 'exonsBy'######## #######needs a vector of the gene lengths##### ######needs a disease database ##### dis.cluster.enrich<-function(list.gene.vec,disease,anno.obj){ y<-names(anno.obj) genlen<-genelengths[sort(y)] magic<-foreach(i=1:length(list.gene.vec)) %dopar%{ x<-intersect(list.gene.vec[[i]],y) z<-setdiff(y,x) zs<-rep(0,length(z)) xs<-rep(1,length(x)) names(zs)<-z names(xs)<-x zt<-c(zs,xs) zt<-zt[sort(y)] pwf<-nullp(zt,bias.data=genlen,plot.fit=FALSE) blue<-goseq(pwf,gene2cat=disease) FDR<-p.adjust(blue[,2],"fdr") green<-cbind(blue[which(FDR<=0.05),1:2],FDR[which(FDR<=0.05)]) } } ###########function to get genes driving a disease from your gene vector (entrez ids) and write table to file################ ##########requires a matrix of the significantly changing genes####################### ###########requires a normalized matrix of gene counts################################ driven<-function(name.of.disease,your.gene.vector) { x<-disease.names[[name.of.disease]] blur<-intersect(x,your.gene.vector) z<-SYMBOLS[blur] x<-ave.cts.ceg[blur,] b<-cbind(edge.genes[blur,23],edge.genes[blur,28]) fin<-cbind(z,x,b) colnames(fin)<-c("Gene Symbols",colnames(ave.cts.ceg),"Fold Difference","FDR") write.csv(fin,paste(name.of.disease,".csv",sep=""))} #######uses goseq package to find enrichment of GO cats in a list of gene vectors (entrez ids)######## #######genome must be from UCSC genome browser and as character i.e. "hg19"############ ######cats must be "GO:CC","GO:BP","GO:MF", or "KEGG"###################### #######needs an annotation object ######## go.cluster.enrich<-function(list.gene.vec,cats,genome,anno.obj){ y<-names(anno.obj) GO<-getgo(y,genome,"knownGene",cats) genlen<-genelengths[sort(y)] magic<-foreach(i=1:length(list.gene.vec)) %dopar%{ x<-intersect(list.gene.vec[[i]],y) z<-setdiff(y,x) zs<-rep(0,length(z)) xs<-rep(1,length(x)) names(zs)<-z names(xs)<-x zt<-c(zs,xs) zt<-zt[sort(y)] pwf<-nullp(zt,bias.data=genlen,plot.fit=FALSE) blue<-goseq(pwf,gene2cat=GO) FDR<-p.adjust(blue[,2],"fdr") green<-cbind(blue[which(FDR<=0.05),1:2],FDR[which(FDR<=0.05)]) } } ##############Use SPIA package to find enrichment of KEGG patways in a list of gene vectors (entrez ids)############ #########memory intensive######### ########requires a fold difference between max and min values over timecourse############# #######requires a fold change over threshold########## SPIAcluster<-function(list.gene.vec){ fin<-foreach (i=1:length(lis)) %dopar% { x<-FD[list.gene.vec[[i]]] z<-FC[list.gene.vec[[i]]] if (length(which(z==Inf))>=1){ b<-which(z==Inf) blue<-replace(z,b,x[names(b)]) spia(de=blue,all=IDS,organism="hsa",nB=2000,verbose=FALSE)} else { spia(de=z,all=IDS,organism="hsa",nB=2000,verbose=FALSE)} } } ################################################################# ######network analysis functions utilizing igraph package######### ################################################################# ########takes two gene vectors (entrez ids) and finds common neighbors in a graph object######## ##########requires the adjacency matrix connected to the graph object########################### #########deg=degrees away from each node to establish neighborhood############################## two.cluster.gr<-function(graph.object,v1,v2,deg,adj.matrix) { x1<-neighborhood(graph.object,deg,intersect(b,rownames(adj.matrix))) x2<-neighborhood(graph.object,deg,intersect(z,rownames(adj.matrix))) y1<-foreach(i=1:length(x1),.combine='c') %dopar% { xs<-x1[[i]] ys<-foreach(b=1:length(x1),.combine='c') %do%{ intersect(x1[[b]],xs)} } y2<-foreach(i=1:length(x2),.combine='c') %dopar% { xs<-x2[[i]] ys<-foreach(b=1:length(x2),.combine='c') %do%{ intersect(x2[[b]],xs)} } y<-intersect(y1,y2) z<-induced.subgraph(graph.object,y) } #########makes a list of lists containing the eigenvector and Betweenness centrality scores for a list of gene vectors (entrez ids)######## ########normalizes betweenness scoers to max betweenness, i.e. betweenness.score/max.betweenness.score for easy comparison############ ##########requires the adjacency matrix connected to the graph object########################### vital.nodes.cl<-function(graph.object,list.gene.vec,deg,adj.matrix){ x<-foreach(i=1:length(list.gene.vec)) %dopar% { z<-neighborhood(graph.object,deg,intersect(list.gene.vec[[i]],rownames(adj.matrix))) y1<-foreach(m=1:length(z),.combine='c') %do% { xs<-z[[m]] xsn<-z[-m] ys<-foreach(b=1:length(xsn),.combine='c') %do%{ intersect(xsn[[b]],xs)} } y1<-c(V(graph)$names[y1],intersect(list.gene.vec[[i]],rownames(adj.matrix))) y1<-unique(y1) y<-induced.subgraph(graph.object,y1) blue<-evcent(y)$vector blue<-sort(blue[which(blue>0)]) red<-sort(betweenness(y,weight=1/E(y)$weight),decreasing=TRUE) red<-red/max(red) z<-list(blue,red) names(z)<-c("Eigenvector Centralities", "Betweenness") z} } ################################################################################### ##########functions to write .csv files for the various objects generated########## ################################################################################### #########write a table from an object produced by dis.cluster.enrich################ write.dis.cl<-function(lis,name){ blue<-foreach(i=1:length(lis), .combine='rbind') %dopar% { x<-lis[[i]] y<-rep(names(lis[i]),dim(x)[1]) cbind(y,x)} colnames(blue)<-c("Cluster", "Disease","p-value","FDR") write.csv(blue,paste(name,".csv",sep=""))} #######write a table from object produced by go.cluster.enrich when using clusters derived from mfuzz######## write.go.cl<-function(lis,clusters,name){ blue<-foreach(i=1:length(lis), .combine='rbind') %dopar% { x<-lis[[i]] if (dim(x)[1]==0) {} else{ genevec<-clusters[[i]] y<-rep(names(lis[i]),dim(x)[1]) z<-GOnames[x[,1]] names(z)<-rownames(x) poo<-foreach(b=1:length(z),.combine='rbind') %do% { m<-length(intersect(GOids[[x[b,1]]],genevec)) j<-length(GOids[[x[b,1]]]) cbind(m,j,m/j*100)} cbind(y,z,x,poo)}} colnames(blue)<-c("Cluster", "GO name","GO ID","p-value","FDR","# Genes","Size of GO category","% of category") write.csv(blue,paste(name,".csv",sep=""))} #######write a table from object produced by go.cluster.enrich for stages######## write.go.stage<-function(lis,stages,name){ blue<-foreach(i=1:length(lis), .combine='rbind') %dopar% { x<-lis[[i]] if (dim(x)[1]==0) {} else{ genevec<-stages[[i]] y<-rep(names(lis[i]),dim(x)[1]) z<-GOnames[x[,1]] poo<-foreach(b=1:length(z),.combine='rbind') %do% { m<-length(intersect(GOids[[x[b,1]]],genevec)) j<-length(GOids[[x[b,1]]]) cbind(m,j,m/j*100)} cbind(y,z,x,poo)}} colnames(blue)<-c("Stage", "GO name","GO ID","p-value","FDR","# Genes","Size of GO category","% of category") write.csv(blue,paste(name,".csv",sep=""))} #########write a table from object produced by goseq function########## write.GO<-function(lis,genevec,name) { x<-GOnames[lis[,1]] y<-foreach(i=1:length(x),.combine='rbind') %dopar% { z<-length(intersect(GOids[[lis[i,1]]],genevec)) b<-length(GOids[[lis[i,1]]]) cbind(z,b,z/b*100)} blue<-cbind(x,lis,y) colnames(blue)<-c("GO name","GO ID","p-value","FDR","# Genes","Size of GO category","% of category") write.csv(blue,paste(name,".csv",sep=""))} ########write kegg with zscore for stages list.gene.vectors#### write.kegg.stag<-function(lis,name) { blue<-foreach(i=1:length(lis), .combine='rbind') %dopar% { x<-lis[[i]] if (dim(x)[1]==0) {} else{ y<-rep(names(lis[i]),dim(x)[1]) z<-qnorm(1-(x[,10]/2)) cbind(x,y,z)}} colnames(blue)<-c(colnames(lis[[1]]),"Stage", "Z-Score") write.csv(blue,paste(name,".csv",sep=""))} #######write gene table##### full.table<-function(mat,ave.mat,lis,name){ blue<-foreach(i=1:length(lis), .combine='rbind') %dopar% { x<-lis[[i]] y<-names(lis[i]) y<-rep(y,length(x)) cbind(SYMBOLS[x],y,FD[x],FC[x],ave.mat[x,],mat[x,])} colnames(blue)<-c("Gene Symbols","Stages","Fold Difference (x/threshold)","Fold Difference (max/min)",colnames(ave.mat),colnames(mat)) write.csv(blue,paste(name,".csv",sep=""))} ###########write pdf showing cluster centers and number of genes in each cluster##### #####needs an object from mfuzz function output####### write.cl.pdf<-function(cluster.obj,times,name){ x<-as.character(1:dim(cluster.obj[[1]])[1]) xn<-as.character(cluster.obj[[2]]) ysg<-cluster.obj[[1]] pdf(paste(name,".pdf",sep=""),onefile=T,width=11) for(i in 1:length(x)) { y<-paste("Cluster ",x[i],sep="") y<-paste(y,"\n",sep="") z<-paste("Composed of ",xn[i],sep="") z<-paste(z," Genes",sep="") z<-paste(y,z) plot(times,ysg[i,], type="line",col="darkblue",xlab="Days post Induction",ylab="Relative Change in Expression",main=z)} dev.off()} ############################## #####start analysis######### ############################# #########read in bam files########### ########shown below is an example all files were read in the same way################ ###########makes big files so it is best to get rid of excess files between samples or you will rum out of memory######### x<-readGappedAlignments("sample.ID.bam",format="BAM") exon.sample.ID<-summarizeOverlaps(h19e,x,mode="IntersectionNotEmpty") RNA.sample.ID<-summarizeOverlaps(RNAs,x,mode="IntersectionNotEmpty") rm(x) counts.exon.sample.ID<-assay(exon.sample.ID) rm(exon.sample.ID) counts.RNA.sample.ID<-assay(RNA.sample.ID) rm(RNA.sample.ID) ###############counts obtained were bound into matrices with columns being samples and row being genes######### ############so i now have two matrices named RNA.raw and ex.raw containing the raw counts for each gene or exon######## ########calculating gene thresholds using the following formula############# ####threshold=(sum("gene lengths")/"read length")/mean("effective library size")*"gene lengths"/"read length")###### #########uses DEseq package to calculate "effective library size"################ treat<-factor(c("Day 0","Day 0","Day 0","Day 0","Day 7","Day 7","Day 7","Day 7","Day 12","Day 12","Day 19","Day 19","Day 19","Day 19","Day 26","Day 26","Day 33","Day 33","Day 49","Day 49","Day 63"," Day 63","Day 77","Day 77")) y<-DGEList(counts=RNA.raw,group=treat) y<-calcNormFactors(y) els<-y$samples$lib.size*y$samples$norm.factors thresholds<-(sum(genelengths)/50)/mean(els)*(genelengths/50) ########calculating exon thresholds using the following formula############# ####threshold=(sum("exon lengths")/"read length")/mean("effective library size")*"exon lengths"/"read length")###### #########uses DEseq package to calculate "effective library size"################ y<-h19e@ranges exonlengths<-y@width names(exonlengths)<-IDS.ex treat<-factor(c("Day 0","Day 0","Day 0","Day 0","Day 7","Day 7","Day 7","Day 7","Day 12","Day 12","Day 19","Day 19","Day 19","Day 19","Day 26","Day 26","Day 33","Day 33","Day 49","Day 49","Day 63"," Day 63","Day 77","Day 77")) y<-DGEList(counts=ex.raw,group=treat) y<-calcNormFactors(y) els<-y$samples$lib.size*y$samples$norm.factors thresholds.ex<-(sum(exonlengths)/50)/mean(els)*(exonlengths/50) #############normalize RNA.raw for visulaization and comparison purposes using DEseq package####################### bob<-data.frame(rownames=colnames(RNA.raw),condition=c("Day 0","Day 0","Day 0","Day 0","Day 7","Day 7","Day 7","Day 7","Day 12","Day 12","Day 19","Day 19","Day 19","Day 19","Day 26","Day 26","Day 33","Day 33","Day 49","Day 49","Day 63"," Day 63","Day 77","Day 77")) Sue<-newCountDataSet(RNA.raw,bob$condition) x<-estimateSizeFactors(Sue) norm.RNA.cts<-t(t(RNA.raw)/sizeFactors(x)) #######Calculate fold differences from threshold and max/min########### x<-apply(norm.red,1,max) FD<-x/thresholds z<-apply(norm.red,1,max) b<-apply(norm.red,1,min) FC<-z/b #######Check sample replicates using Pearson correlations######### ########print heatmapp################ y<-cor(norm.RNA.cts,method="pearson") colnames(y)<-c("Day 0-A","Day 0-B","Day 0-C","Day 0-D","Day 7-A","Day 7-B","Day 7-C","Day 7-D","Day 12-A","Day 12-B","Day 19-A","Day 19-B","Day 19-C","Day 19-D","Day 26-A","Day 26-B","Day 33-C","Day 33-D","Day 49-C","Day 49-D","Day 63-C","Day 63-D","Day 77-C","Day 77-D") rownames(y)<-c("Day 0-A","Day 0-B","Day 0-C","Day 0-D","Day 7-A","Day 7-B","Day 7-C","Day 7-D","Day 12-A","Day 12-B","Day 19-A","Day 19-B","Day 19-C","Day 19-D","Day 26-A","Day 26-B","Day 33-C","Day 33-D","Day 49-C","Day 49-D","Day 63-C","Day 63-D","Day 77-C","Day 77-D") heatmap(y,,Colv=NA,Rowv=NA,col=brewer.pal(6,"Blues")) postscript("Corr heatmap.eps", horizontal = FALSE, onefile = TRUE, paper = "special", height=11,width=8.5) heatmap(y,Colv=NA,Rowv=NA,col=brewer.pal(6,"Blues")) dev.off() ###########Check sample replicates using Pearson correlations comparing the two runs########### #############normalize genes and correlations for comparison of runs####################### blue<-cbind(RNA.raw[,1:8],RNA.raw[,11:14]) bob<-data.frame(rownames=colnames(blue),condition=c("Day 0","Day 0","Day 0","Day 0","Day 7","Day 7","Day 7","Day 7","Day 19","Day 19","Day 19","Day 19")) Sue<-newCountDataSet(blue,bob$condition) x<-estimateSizeFactors(Sue) norm.blue<-t(t(blue)/sizeFactors(x)) y<-cor(norm.blue,method="pearson") colnames(y)<-c("Day 0-C","Day 0-D","Day 0-A","Day 0-B","Day 7-C","Day 7-D","Day 7-A","Day 7-B","Day 19-C","Day 19-D","Day 19-A","Day 19-B") rownames(y)<-c("Day 0-C","Day 0-D","Day 0-A","Day 0-B","Day 7-C","Day 7-D","Day 7-A","Day 7-B","Day 19-C","Day 19-D","Day 19-A","Day 19-B") heatmap(y,Colv=NA,Rowv=NA,col=brewer.pal(6,"Blues")) postscript("Corr Run comp heatmap.eps", horizontal = FALSE, onefile = TRUE, paper = "special", height=11,width=8.5) heatmap(y,Colv=NA,Rowv=NA,col=brewer.pal(5,"Blues")) dev.off() ##########find genes with significant changes using edgeR package############# ############apply a 2 fold change cutoff############## ###########apply threshold to max value########### treat<-factor(c("Day 0","Day 0","Day 0","Day 0","Day 7","Day 7","Day 7","Day 7","Day 12","Day 12","Day 19","Day 19","Day 19","Day 19","Day 26","Day 26","Day 33","Day 33","Day 49","Day 49","Day 63"," Day 63","Day 77","Day 77")) y<-DGEList(counts=RNA.raw,group=treat) y<-calcNormFactors(y) times<-factor(c(1,1,1,1,2,2,2,2,3,3,4,4,4,4,5,5,6,6,7,7,8,8,9,9)) design<-model.matrix(~times) y<-estimateGLMCommonDisp(y,design) y<-estimateGLMTagwiseDisp(y,design,prior.n=90) fit<-glmFit(y,design) lrt<-glmLRT(y, fit) topTags(lrt) summary(dt<-decideTestsDGE(lrt)) eR.DE.ID<-rownames(y)[as.logical(dt)] eR.DEgenes<-norm.RNA.cts[eR.DE.ID,] de.pval<-p.adjust(lrt$table[,4],"fdr") names(de.pval)<-rownames(lrt$table) x<-which(de.pval<=0.05) y<-as.matrix(lrt$table) z<-y[intersect(names(x),rownames(y)),] fdr<-de.pval[x] FoldDiff<-FC[names(x)] edgeR.genes<-cbind(eR.DEgenes,FoldDiff,z,fdr) x<-which(FoldDiff>=2) edgeR.genes<-edgeR.genes[names(x),] y<-apply(norm.RNA.cts,1,max) x<-which(y>thresholds) z<-intersect(names(x),rownames(edgeR.genes)) edgeR.genes<-edgeR.genes[z,] ##############find genes with significant changes using DEseq2 package############### times<-c(0,0,0,0,7,7,7,7,12,12,19,19,19,19,26,26,33,33,49,49,63,63,77,77) times<-data.frame(times) Sue<-DESeqDataSetFromMatrix(RNA.raw,colData=times,design=~times) colData(Sue)$times<-factor(colData(Sue)$times,levels=c(0,7,12,19,26,33,49,63,77)) colnames(Sue)<-colnames(RNA.raw) Sue<-estimateSizeFactors(Sue) Sue<-estimateDispersions(Sue) Sue<-nbinomWaldTest(Sue,pAdjustMethod="fdr") x<-as.matrix(results(Sue)) x<-x[which(x[,5]<=0.05),] DEseq.genes<-cbind(norm.RNA.cts[rownames(x),],x) colnames(DEseq.genes)<-c(colnames(norm.RNA.cts),colnames(x)) #########Find union of deseq and edge genes########## #########to create a significant gene matrix############# x<-intersect(rownames(edgeR.genes),rownames(DEseq.genes)) sig.genes<-cbind(edgeR.genes[x,],DEseq.genes[x,29]) colnames(sig.genes)<-c(colnames(sig.genes[,1:29]),"EdgeR FDR","DEseq FDR") #########clean up R session############### rm(x) rm(y) rm(z) rm(fit) rm(fdr) rm(dt) rm(design) rm(de.pval) rm(FoldDiff) rm(lrt) rm(treat) rm(times) rm(eR.DEgenes) rm(eR.DE.ID) rm(bob) rm(Sue) rm(blue) #######use Mfuzz to uncover clusters of gene with similar expression############### #######possible shapes =2^(9-1)=256################################################ ########divided by 4 to give more genes per cluster =64############################ ########first created average cts from norm.RNA.cts for each timepoint########## ave.RNA.cts<-cbind(apply(norm.RNA.cts[,1:4],1,mean),apply(norm.RNA.cts[,5:8],1,mean),apply(norm.RNA.cts[,9:10],1,mean),apply(norm.RNA.cts[,11:14],1,mean),apply(norm.RNA.cts[,15:16],1,mean),apply(norm.RNA.cts[,17:18],1,mean),apply(norm.RNA.cts[,19:20],1,mean),apply(norm.RNA.cts[,21:22],1,mean),apply(norm.RNA.cts[,23:24],1,mean)) colnames(ave.RNA.cts)<-c("DO","D7","D12","D19","D26","D33","D49","D63","D77") x<-new("ExpressionSet",exprs=ave.RNA.cts[rownames(sig.genes),]) z<-standardise(x) cluster.obj<-mfuzz(z,64,1.25) yc<-cluster.obj[[1]] b<-cluster.obj[[3]] b<-sort(b) clusters<-foreach(i=1:64) %do% { x<-names(b[which(b==i)])} names(clusters)<-foreach(i=1:64,.combine='c') %dopar% {paste("Cluster ",i,sep="")} write.cl.pdf(cluster.obj,c(0,7,12,19,26,33,49,63,77),"First Clusters") ############################################################################### ##########Check cluster enrichments using above functions and datbases######### ############################################################################### #######Find diseases enriched in clusters########## ##########then write table################## disease.cluster<-dis.cluster.enrich(clusters,disease) x<-foreach(i=1:64,.combine='c') %dopar% {paste("Cluster ",i,sep="")} names(disease.cluster)<-x write.dis.cl(disease.cluster,"Disease First table") ######Find Diseases enriched in all significant genes#### #########then write table################### genlen<-genelengths[sort(names(h19eg))] z<-setdiff(names(h19eg),rownames(sig.genes)) zs<-rep(0,length(z)) xs<-rep(1,length(sig.genes[,1])) names(zs)<-z names(xs)<-rownames(sig.genes) zt<-c(zs,xs) zt<-zt[sort(names(h19eg))] pwf<-nullp(zt,bias.data=genlen) blue<-goseq(pwf,gene2cat=disease) FDR<-p.adjust(blue[,2],"fdr") green<-cbind(blue[which(FDR<=0.05),1:3],FDR[which(FDR<=0.05)]) all.disease<-green write.csv(all.disease,"all disease.csv") #######Find all GO categories and "GO:Biological Processes" categories enriched in clusters########## ##########then write tables################## GO.cluster<-go.cluster.enrich(clusters,c("GO:CC","GO:BP","GO:MF")) GO.BP.cluster<-go.cluster.enrich(clusters,"GO:BP") x<-foreach(i=1:64,.combine='c') %dopar% {paste("Cluster ",i,sep="")} names(GO.cluster)<-x names(GO.BP.cluster)<-x write.go.cl(GO.cluster,clusters,"GO clusters") write.go.cl(GO.BP.first.cluster,clusters,"BP GO clusters") #######Find all GO categories and "GO:Biological Processes" categories enriched in all significant genes########## ##########then write tables################## y<-names(h19eg) GO<-getgo(y,"hg19","knownGene") genlen<-genelengths[sort(y)] x<-intersect(y,rownames(sig.genes)) z<-setdiff(y,x) zs<-rep(0,length(z)) xs<-rep(1,length(x)) names(zs)<-z names(xs)<-x zt<-c(zs,xs) zt<-zt[sort(y)] pwf<-nullp(zt,bias.data=genlen) blue<-goseq(pwf,gene2cat=GO) FDR<-p.adjust(blue[,2],"fdr") green<-cbind(blue[which(FDR<=0.05),1:2],FDR[which(FDR<=0.05)]) all.GO.first<-green write.GO(all.GO.first,rownames(sig.genes),"ALL GO") y<-names(h19eg) GO<-getgo(y,"hg19","knownGene","GO:BP") genlen<-genelengths[sort(y)] x<-intersect(y,rownames(sig.genes)) z<-setdiff(y,x) zs<-rep(0,length(z)) xs<-rep(1,length(x)) names(zs)<-z names(xs)<-x zt<-c(zs,xs) zt<-zt[sort(y)] pwf<-nullp(zt,bias.data=genlen) blue<-goseq(pwf,gene2cat=GO) FDR<-p.adjust(blue[,2],"fdr") green<-cbind(blue[which(FDR<=0.05),1:2],FDR[which(FDR<=0.05)]) all.GO.BP<-green write.GO(all.GO.BP,rownames(sig.genes),"ALL GO BP") #######Find KEGG pathways enriched in clusters########## ##########then write table################## ########memory intensive so need to cut down number of cores being used####### #######return to 20 cores after analysis################# registerDoMC(cores=16) Kegg.cluster<-SPIAcluster(clusters) x<-foreach(i=1:64,.combine='c') %dopar% {paste("Cluster ",i,sep="")} names(Kegg.cluster)<-x registerDoMC(cores=20) ######Find Kegg pathways enriched in all significant genes#### #########then write table################### x<-FD[rownames(sig.genes)] z<-FC[rownames(sig.genes)] b<-which(z==Inf) blue<-replace(z,b,x[names(b)]) all.Kegg.first<-spia(de=blue,all=IDS,organism="hsa",nB=2000,verbose=FALSE) write.csv(all.Kegg.first,"All Kegg.csv") ########Look for enrichments in allen brain markers in each cluster############# mark.cluster<-dis.cluster.enrich(clusters,markers) x<-foreach(i=1:64,.combine='c') %dopar% {paste("Cluster ",i,sep="")} names(mark.cluster)<-x ######Analyzing clusters utilizing co-citation network for central genes######### net.cluster<-vital.nodes.cl(connect.gr,cluster.first,1) names(net.cluster)<-foreach(i=1:64,.combine='c') %dopar% {paste("Cluster ",i,sep="")} #########clean up R session############### rm(b) rm(w) rm(x) rm(y) rm(xs) rm(yc) rm(z) rm(zs) rm(zt) rm(green) rm(blue) rm(genlen) rm(i) rm(pwf) rm(GO) rm(FDR) ################################################################### ##############Analyze data for splicing changes#################### ################################################################### #############normalize ex.raw for visulaization and comparison purposes using DEseq package####################### IDS.ex<-h19e@elementMetadata$exon_id rownames(ex.raw)<-IDS.ex bob<-data.frame(rownames=colnames(ex.raw),condition=c("Day 0","Day 0","Day 0","Day 0","Day 7","Day 7","Day 7","Day 7","Day 12","Day 12","Day 19","Day 19","Day 19","Day 19","Day 26","Day 26","Day 33","Day 33","Day 49","Day 49","Day 63"," Day 63","Day 77","Day 77")) Sue<-newCountDataSet(ex.raw,bob$condition) x<-estimateSizeFactors(Sue) norm.ex.cts<-t(t(ex.raw)/sizeFactors(x)) ###############Use DEXSeq to uncover differential exon splicing############### y<-unique(names(exon.to.gene.ids)) x<-norm.ex.first[y,] y<-apply(x,1,max) z<-which(y>thresholds.ex[rownames(x)]) x<-ex.raw[z,] des<-c("Day 0","Day 0","Day 0","Day 0","Day 7","Day 7","Day 7","Day 7","Day 12","Day 12","Day 19","Day 19","Day 19","Day 19","Day 26","Day 26","Day 33","Day 33","Day 49","Day 49","Day 63"," Day 63","Day 77","Day 77") exonCounts<-newExonCountSet(x,des,exon.to.gene.ids[rownames(x)],rownames(x)) exonCounts<-estimateSizeFactors(exonCounts) exonCounts<-estimateDispersions(exonCounts,nCores=16,minCount=5,maxExon=100) exonCounts<-fitDispersionFunction(exonCounts) b<-rowMeans(counts(exonCounts)) plot(b, fData(exonCounts)$dispBeforeSharing, log = "xy", main="mean vs CR dispersion") x<-0.01:max(b) y<-exonCounts@dispFitCoefs[1] + exonCounts@dispFitCoefs[2]/x lines(x, y, col = "red") exonCounts<-testForDEU(exonCounts) exon.res<-DEUresultTable(exonCounts) table(exon.res$padjust<0.05) z<-norm.ex.first[as.character(exon.res[,2]),] x<-apply(z,1,max) y<-apply(z,1,min) blue<-x/y x<-which(poo>=2) y<-exon.res[x,] x<-which(y$padjust<0.05) z<-y[x,] exon.res<-z exon.res<-cbind(exon.res,blue[exon.res[,2]]) colnames(exon.res)<-c(colnames(exon.res[,1:6]),"MAX/MIN") x<-unique(exon.res$geneID) y<-rownames(sig.genes) sig.diff.exon<-norm.RNA.cts[intersect(x,y),] #########clean up R session############### rm(x) rm(y) rm(z) rm(blue) rm(bob) rm(Sue) ###########enrichment analysis of spliced genes########## exon.cluster<-foreach(i=1:64) %dopar% { x<-intersect(rownames(sig.diff.exon),clusters[[i]]) } names(exon.cluster)<-names(clusters) exonperclus<-foreach(i=1:64, .combine='c') %dopar% { x<-length(exon.cluster[[i]])/length(cluster[[i]]) x*100 } go.cluster.exon<-go.cluster.enrich(exon.cluster,c("GO:CC","GO:BP","GO:MF")) BP.go.cluster.exon<-go.cluster.enrich(exon.cluster,"GO:BP") write.go.cl(BP.go.cluster.exon,"BP go exon") disease.cluster.exon<-dis.cluster.enrich(exon.cluster,disease) names(disease.cluster.exon)<-names(exon.cluster) write.dis.cl(disease.cluster.exon,"Exon Diseases") kegg.exon<-SPIAcluster(exon.cluster) ########################################################### #############establishing stages############################## ###########run each cluster through a set of rules for each stage############ ##########1 = have a positive rise (for pluri a negative rise) or be a continuation of earlier peak#################### ##########2 = check check percentage of rise out of total range##################### ###########3 = have three people review the determined stages and rule if it fits or not######## ############################################################################################ #########make a matrix of diffeerences in rise between timepoints############## b<-cluster.obj[[1]] x<-cbind(b[,2]-b[,1],b[,3]-b[,2],b[,4]-b[,3],b[,5]-b[,4],b[,6]-b[,5],b[,7]-b[,6],b[,8]-b[,7],b[,9]-b[,8]) ######find pluirpotency stage genes########### y<-which(x[,1]<(-0.5)) ###gives 5 6 7 9 10 14 15 20 22 24 26 27 31 35 42 45 46 47 50 51 53 57 58 59 60 61 63##### m<-which(x[names(y),1]>-1) ########bottom four 6 10 45 46 #### #####kept 6 and 10####### ######ND stage########## y<-which(x[,1]>0.0) #### gives 1 2 3 4 8 11 12 13 16 17 18 19 21 23 25 28 29 30 32 33 34 36 37 38 39 40 41 43 44 48 49 52 54 55 56 62 64 #### z<-b[y,] z<-(apply(z,1,max)-apply(z,1,min)) which((x[y,1]/z)>0.10) ######## 2 3 4 12 13 16 18 19 21 23 25 28 29 32 33 34 36 37 38 40 41 43 44 48 49 52 55 56 64######### ##########bottom two left 28 and 49, kept 28###### ######CS stage########## m<-apply(b[,3:4],1,max) m<-m-b[,2] y<-m[which(m>0.1)] y1<-x[which(x[,3]>0.25),3] y2<-x[which(x[,2]>0.25),2] blue<-unique(c(names(y),names(y1),names(y2))) #### gives 1 2 3 4 8 9 10 11 12 14 16 17 18 21 23 24 26 28 30 32 33 34 36 38 40 42 43 44 45 46 49 51 53 54 55 56 57 58 59 60 61 62 63 #### m[blue] x[blue,2] x[blue,3] z<-b[names(y),] z<-(apply(z,1,max)-apply(z,1,min)) (y-b[names(y),2])/z z<-b[names(y1),] z<-(apply(z,1,max)-apply(z,1,min)) (y1-b[names(y1),3])/z z<-b[names(y2),] z<-(apply(z,1,max)-apply(z,1,min)) (y2-b[names(y2),2])/z #####reviewed and kept 1 2 3 4 8 9 10 12 14 17 18 21 24 26 28 30 32 33 34 36 38 40 42 43 44 45 46 49 51 53 54 55 56 57 61 62 63######### ####################Deep Layer stage################################### m<-apply(b[,5:7],1,max) m<-m-b[,2] y<-m[which(m>0.1)] y1<-x[which(x[,4]>0.25),4] y2<-x[which(x[,5]>0.25),5] y3<-x[which(x[,6]>0.25),6] blue<-unique(c(names(y),names(y1),names(y2),names(y3))) ######1 2 4 6 8 9 10 11 12 13 16 17 19 20 21 23 24 25 26 28 $29 30 32 33 34 ######### ######35 36 38 39 40 43 44 45 46 47 49 51 53 54 55 56 57 $58 59 60 $61 $62 63############## m[blue] x[blue,4] x[blue,5] x[blue,6] z<-b[names(y),] z<-(apply(z,1,max)-apply(z,1,min)) (y-b[names(y),4])/z z<-b[names(y1),] z<-(apply(z,1,max)-apply(z,1,min)) (y1-b[names(y1),4])/z z<-b[names(y2),] z<-(apply(z,1,max)-apply(z,1,min)) (y2-b[names(y2),5])/z z<-b[names(y3),] z<-(apply(z,1,max)-apply(z,1,min)) (y3-b[names(y3),6])/z ######kept 2 4 6 11 16 17 19 20 21 23 24 25 26 30 32 33 36 38 43 44 45 46 47 49 51 53 54 55 57 59 60 added 3 ############## ####################Upper Layer stage################################### m<-apply(b[,8:9],1,max) m<-m-b[,2] y<-m[which(m>0.1)] y1<-x[which(x[,7]>0.1),7] y2<-x[which(x[,8]>0.1),8] blue<-unique(c(names(y),names(y1),names(y2))) #######1 2 $3 4 $5 7 8 $11 $12 13 16 17 $18 19 23 24 28 29 30 $32 $33 $34 35 $36 38 39 40 43 $44 45 $49 $52######### #######53 $54 $55 56 57 58 $59 $60 61 62###################### m[blue] x[blue,7] x[blue,8] z<-b[names(y),] z<-(apply(z,1,max)-apply(z,1,min)) (y-b[names(y),7])/z z<-b[names(y1),] z<-(apply(z,1,max)-apply(z,1,min)) (y1-b[names(y1),7])/z z<-b[names(y2),] z<-(apply(z,1,max)-apply(z,1,min)) (y2-b[names(y2),8])/z ####### kept 1 2 4 7 8 13 16 17 19 23 24 28 29 30 35 38 39 40 43 45 53 56 57 58 61 62###################### #####create weieghted averages of stages to check them############### #######group clusters from each stage into how many stages each cluster belongs to############# #######weighted ave. = sum("correction factor"*"sizes of clusters"* "cluster center at each timepoint")/sum("sizes of clusters").../4 ###### ########correction factor =1-(membership-1)############# x<-cluster.obj[[1]] y<-cluster.obj[[2]] pl4<-c(53,57) pl2<-c(6,7,9,10,20,35,42,47,59,60,63) pl3<-c(26,51) pl1<-c(5,15,22,27,31,50) pluri<-foreach(i=1:9,.combine='c') %dopar% { xp1<-x[pl1,] xp2<-x[pl2,] xp3<-x[pl3,] xp4<-x[pl4,] yp1<-y[pl1] yp2<-y[pl2] yp3<-y[pl3] yp4<-y[pl4] (sum(yp1*xp1[,i])/sum(yp1)+sum(0.75*yp2*xp2[,i])/sum(yp2)+sum(0.5*yp3*xp3[,i])/sum(yp3)+sum(0.25*yp4*xp4[,i])/sum(yp4))/4} plot(c(0,7,12,19,26,33,49,63,77),pluri,"line") pl4<-c(2,4,38,43) pl3<-c(3,16,19,21,23,32,33,36,40,44,49,55,56) pl2<-c(12,13,18,25,29,34) pl1<-c(37,41,48,52,64) nd<-foreach(i=1:9,.combine='c') %dopar% { xp1<-x[pl1,] xp2<-x[pl2,] xp3<-x[pl3,] xp4<-x[pl4,] yp1<-y[pl1] yp2<-y[pl2] yp3<-y[pl3] yp4<-y[pl4] (sum(yp1*xp1[,i])/sum(yp1)+sum(0.75*yp2*xp2[,i])/sum(yp2)+sum(0.5*yp3*xp3[,i])/sum(yp3)+sum(0.25*yp4*xp4[,i])/sum(yp4))/4} plot(c(0,7,12,19,26,33,49,63,77),nd,"line") pl4<-c(2,4,38,43,53,57) pl3<-c(3,17,21,24,26,30,33,36,40,44,45,49,51,55,56) pl2<-c(1,8,9,10,12,18,28,34,42,46,54,61,62,63) pl1<-c(14) cs<-foreach(i=1:9,.combine='c') %dopar% { xp1<-x[pl1,] xp2<-x[pl2,] xp3<-x[pl3,] xp4<-x[pl4,] yp1<-y[pl1] yp2<-y[pl2] yp3<-y[pl3] yp4<-y[pl4] (sum(yp1*xp1[i])/sum(yp1)+sum(0.75*yp2*xp2[,i])/sum(yp2)+sum(0.5*yp3*xp3[,i])/sum(yp3)+sum(0.25*yp4*xp4[,i])/sum(yp4))/4} plot(c(0,7,12,19,26,33,49,63,77),cs,"line") pl4<-c(2,4,38,43,53,57) pl3<-c(3,16,17,19,21,23,24,26,30,32,33,36,44,45,49,51,55) pl2<-c(6,20,25,46,47,54,59,60) pl1<-c(11) dl<-foreach(i=1:9,.combine='c') %dopar% { xp1<-x[pl1,] xp2<-x[pl2,] xp3<-x[pl3,] xp4<-x[pl4,] yp1<-y[pl1] yp2<-y[pl2] yp3<-y[pl3] yp4<-y[pl4] (sum(yp1*xp1[i])/sum(yp1)+sum(0.75*yp2*xp2[,i])/sum(yp2)+sum(0.5*yp3*xp3[,i])/sum(yp3)+sum(0.25*yp4*xp4[,i])/sum(yp4))/4} plot(c(0,7,12,19,26,33,49,63,77),dl,"line") pl4<-c(2,4,38,43,53,57) pl3<-c(16,17,19,23,24,30,40,45,56) pl2<-c(1,7,8,13,18,28,29,35,42,46,61,62) pl1<-c(39,58) ul<-foreach(i=1:9,.combine='c') %dopar% { xp1<-x[pl1,] xp2<-x[pl2,] xp3<-x[pl3,] xp4<-x[pl4,] yp1<-y[pl1] yp2<-y[pl2] yp3<-y[pl3] yp4<-y[pl4] (sum(yp1*xp1[,i])/sum(yp1)+sum(0.75*yp2*xp2[,i])/sum(yp2)+sum(0.5*yp3*xp3[,i])/sum(yp3)+sum(0.25*yp4*xp4[,i])/sum(yp4))/4} plot(c(0,7,12,19,26,33,49,63,77),ul,"line") w.centers<-rbind(pluri,nd,cs,dl,ul) write.csv(w.centers,"weighted centers.csv") Stage.names<-c("Pluirpotency","Neural Diff", "Cortical Specification","Deep layers","Upper Layers") pdf("Ave Centers.pdf",onefile=T,width=11) for(i in 1:5) { z<-Stage.names[i] plot(c(0,7,12,19,26,33,49,63,77),w.centers[i,], type="line",col="darkblue",xlab="Days post Induction",ylab="Relative Change in Expression",main=z)} dev.off() ################set up stages list################ Stages<-list(unique(c(cluster.first[[5]],cluster.first[[6]],cluster.first[[7]],cluster.first[[9]],cluster.first[[10]],cluster.first[[15]],cluster.first[[20]],cluster.first[[22]],cluster.first[[26]],cluster.first[[27]],cluster.first[[31]],cluster.first[[35]],cluster.first[[42]],cluster.first[[47]],cluster.first[[50]],cluster.first[[51]],cluster.first[[53]],cluster.first[[57]],cluster.first[[59]],cluster.first[[60]],cluster.first[[63]])), unique(c(cluster.first[[2]],cluster.first[[3]],cluster.first[[4]],cluster.first[[12]],cluster.first[[13]],cluster.first[[16]],cluster.first[[18]],cluster.first[[19]],cluster.first[[21]],cluster.first[[23]],cluster.first[[25]],cluster.first[[29]],cluster.first[[32]],cluster.first[[33]],cluster.first[[34]],cluster.first[[36]],cluster.first[[37]],cluster.first[[38]],cluster.first[[40]],cluster.first[[41]],cluster.first[[43]],cluster.first[[44]],cluster.first[[48]],cluster.first[[49]],cluster.first[[52]],cluster.first[[55]],cluster.first[[56]],cluster.first[[64]])), unique(c(cluster.first[[1]],cluster.first[[2]],cluster.first[[3]],cluster.first[[4]],cluster.first[[8]],cluster.first[[9]],cluster.first[[10]],cluster.first[[12]],cluster.first[[14]],cluster.first[[17]],cluster.first[[18]],cluster.first[[21]],cluster.first[[24]],cluster.first[[26]],cluster.first[[28]],cluster.first[[30]],cluster.first[[32]],cluster.first[[33]],cluster.first[[34]],cluster.first[[36]],cluster.first[[38]],cluster.first[[40]],cluster.first[[42]],cluster.first[[43]],cluster.first[[44]],cluster.first[[45]],cluster.first[[46]],cluster.first[[49]],cluster.first[[51]],cluster.first[[53]],cluster.first[[54]],cluster.first[[55]],cluster.first[[56]],cluster.first[[57]],cluster.first[[61]],cluster.first[[62]],cluster.first[[63]])), unique(c(cluster.first[[2]],cluster.first[[3]],cluster.first[[4]],cluster.first[[6]],cluster.first[[11]],cluster.first[[16]],cluster.first[[17]],cluster.first[[19]],cluster.first[[20]],cluster.first[[21]],cluster.first[[23]],cluster.first[[24]],cluster.first[[25]],cluster.first[[26]],cluster.first[[30]],cluster.first[[32]],cluster.first[[33]],cluster.first[[36]],cluster.first[[38]],cluster.first[[43]],cluster.first[[44]],cluster.first[[45]],cluster.first[[46]],cluster.first[[47]],cluster.first[[49]],cluster.first[[51]],cluster.first[[53]],cluster.first[[54]],cluster.first[[55]],cluster.first[[57]],cluster.first[[59]],cluster.first[[60]])), unique(c(cluster.first[[1]],cluster.first[[2]],cluster.first[[4]],cluster.first[[7]],cluster.first[[8]],cluster.first[[13]],cluster.first[[16]],cluster.first[[17]],cluster.first[[19]],cluster.first[[23]],cluster.first[[24]],cluster.first[[28]],cluster.first[[29]],cluster.first[[30]],cluster.first[[35]],cluster.first[[38]],cluster.first[[39]],cluster.first[[40]],cluster.first[[43]],cluster.first[[45]],cluster.first[[53]],cluster.first[[56]],cluster.first[[57]],cluster.first[[58]],cluster.first[[61]],cluster.first[[62]]))) names(Stages)<-Stage.names #############stages with exon changing genes########### Stages.ex<-foreach(i=1:5) %dopar% { intersect(rownames(sig.diff.exon),Stages[[i]])} ######clean up R Session#### rm(b) rm(x) rm(z) rm(y) rm(y1) rm(y2) rm(y3) rm(blue) rm(xp1) rm(xp2) rm(xp3) rm(xp4) rm(yp1) rm(yp2) rm(yp3) rm(yp4) rm(ul) rm(dl) rm(cs) rm(nd) rm(pluri) rm(pl1) rm(pl2) rm(pl3) rm(pl4) #################Enrichment analysis with stages for Disease, GO, and Kegg databases########### stage.disease<-dis.cluster.enrich(Stages,disease) names(stage.disease)<-Stage.names stage.disease.ex<-dis.cluster.enrich(Stages.ex,disease) names(stage.disease.ex)<-Stage.names stage.go<-go.cluster.enrich(Stages,c("GO:CC","GO:BP","GO:MF")) names(stage.go)<-Stage.names stage.go.BP<-go.cluster.enrich(Stages,"GO:BP") names(stage.go.BP)<-Stage.names stage.go.ex<-go.cluster.enrich(Stages.ex,c("GO:CC","GO:BP","GO:MF")) names(stage.go)<-Stage.names stage.go.BP.ex<-go.cluster.enrich(Stages.ex,"GO:BP") names(stage.go.BP)<-Stage.names kegg.stages<-SPIAcluster(Stages) names(kegg.stages)<-Stage.names ############calculating z score from p values for KEGG.stages################### write.csv(cbind(all.Kegg.first,qnorm(1-(all.Kegg.first[,10]/2))),"Kegg table Zscore.csv") ###########write tables for stage enrichments############## write.dis.cl(stage.disease,"Stage Diseases") write.dis.cl(stage.disease.ex,"Stage Diseases exon") write.go.stage(stage.go.BP,Stages,"GO BP stages") write.go.stage(stage.go.BP.ex,Stages,"GO BP stages exon") write.kegg.stag(kegg.stages,"Kegg stages") ########Analyzing stages utilizing co-citation network for central genes########### net.stages<-vital.nodes.cl(connect.gr,Stages,1) names(net.stages)<-Stage.names ################################################### ###############making figures##################### ################################################## ###########Kegg cluster panel############ blue<-read.delim("Kegg clusters for fig.csv",row.names=1) blue<-data.frame(blue) p<-ggplot(blue,aes(Cluster,Name,colour=Color,shape=Status)) p<-p + scale_colour_manual(name="Color",values=c("Other"="black","Immune and Infectious Disease"="red3","Cancer"="purple","Canonical Signaling"="green2","Cell Processes"="orange","Nervous System"="blue3")) p<-p +layer(geom="point",size=3) p<-p + ylab("") p<-p +scale_x_continuous(breaks=c(0,5,10,15,20,25,30,35,40,45,50,55,60,64),limits=c(0,64)) p<-p + theme(panel.grid.minor=element_line(color="white",size=1)) p<-p + theme(axis.text=element_text(size="10", color="black")) p<-p + theme(panel.grid.minor=element_line(color="grey90",size=0.5)) p<-p + theme(panel.grid.major=element_line(color="grey90",size=1)) p<-p + theme(axis.title.x=element_text(size="12", color="black")) p<-p + theme(legend.text=element_text(size="12", color="black")) p<-p + theme(legend.title=element_text(size="12", color="black")) p<-p + theme(panel.background=element_rect(fill="ivory")) p<-p + theme(panel.background=element_rect(size=0.5)) p<-p + theme(panel.background=element_rect(colour="grey36")) postscript("KEGG clusters.eps", horizontal = FALSE, onefile = TRUE, paper = "special", height=11,width=8.5) plot(p) dev.off() ########Disease cluster panel########## blue<-read.delim("Disease cluster for fig.csv",row.names=1) blue<-data.frame(blue) p<-ggplot(blue,aes(Cluster,Disease,colour=Color)) p<-p + scale_colour_manual(name="Color",values=c("Other "="black","Cancer"="red3","Eye Diseases"="purple","Nervous System Diseases"="blue")) p<-p +layer(geom="point",size=3) p<-p + ylab("") p<-p +scale_x_continuous(breaks=c(0,5,10,15,20,25,30,35,40,45,50,55,60,64),limits=c(0,64)) p<-p + theme(panel.grid.minor=element_line(color="white",size=1)) p<-p + theme(axis.text=element_text(size="10", color="black")) p<-p + theme(panel.grid.minor=element_line(color="grey90",size=0.5)) p<-p + theme(panel.grid.major=element_line(color="grey90",size=1)) p<-p + theme(axis.title.x=element_text(size="12", color="black")) p<-p + theme(legend.text=element_text(size="12", color="black")) p<-p + theme(legend.title=element_text(size="12", color="black")) p<-p + theme(panel.background=element_rect(fill="ivory")) p<-p + theme(panel.background=element_rect(size=0.5)) p<-p + theme(panel.background=element_rect(colour="grey36")) postscript("Disease clusters.eps", horizontal = FALSE, onefile = TRUE, paper = "special", height=11,width=8.5) plot(p) dev.off() #####GO:BP cluster panel####### x<-as.matrix(read.csv("Goslim for fig.csv")) blue<-foreach(i=1:length(GO.BP.cluster), .combine='rbind') %dopar% { x<-GO.BP.cluster[[i]] if (dim(x)[1]==0){}else{ y<-GOnames[x[,1]] names(y)<-rownames(x) cluster<-rep(i,dim(x)[1]) cbind(y,x,cluster)}} y<-intersect(x[,1],blue[,2]) z<-x[,2] names(z)<-x[,1] blue<-foreach(i=1:length(y),.combine='rbind') %dopar% { x<-blue[which(blue[,2]==y[i]),] Group<-z[y[i]] cbind(x,Group) } blue<-data.frame(blue) p<-ggplot(blue,aes(cluster,category,colour=Group)) p<-p + scale_colour_manual(name="Color",values=c("Other Developmental Processes"="black","Immune system development"="purple","Circulatory system development"="red3","Embryo development"="green2","Urogenital system development"="gold","Nervous system development"="blue3", "CNS development"="lightblue1")) p<-p +layer(geom="point",size=3) p<-p + ylab("") p<-p +scale_x_continuous(breaks=c(0,5,10,15,20,25,30,35,40,45,50,55,60,64),limits=c(0,64)) p<-p + theme(panel.grid.minor=element_line(color="white",size=1)) p<-p + theme(axis.text=element_text(size="10", color="black")) p<-p + theme(panel.grid.minor=element_line(color="grey90",size=0.5)) p<-p + theme(panel.grid.major=element_line(color="grey90",size=1)) p<-p + theme(axis.title.x=element_text(size="12", color="black")) p<-p + theme(legend.text=element_text(size="12", color="black")) p<-p + theme(legend.title=element_text(size="12", color="black")) p<-p + theme(panel.background=element_rect(fill="ivory")) p<-p + theme(panel.background=element_rect(size=0.5)) p<-p + theme(panel.background=element_rect(colour="grey36")) postscript("GO clusters.eps", horizontal = FALSE, onefile = TRUE, paper = "special", height=11,width=8.5) plot(p) dev.off() #######Exon Disease clusters panel########### blue<-read.delim("Exon Diseases.csv",row.names=1) blue<-data.frame(blue) p<-ggplot(blue,aes(Cluster,Disease,colour=Color)) p<-p + scale_colour_manual(name="Color",values=c("Other"="black","Cancer"="red3","Eye Diseases"="purple","Nervous System Diseases"="blue")) p<-p +layer(geom="point",size=3) p<-p + ylab("") p<-p +scale_x_continuous(breaks=c(0,5,10,15,20,25,30,35,40,45,50,55,60,64),limits=c(0,64)) p<-p + theme(panel.grid.minor=element_line(color="white",size=1)) p<-p + theme(axis.text=element_text(size="10", color="black")) p<-p + theme(panel.grid.minor=element_line(color="grey90",size=0.5)) p<-p + theme(panel.grid.major=element_line(color="grey90",size=1)) p<-p + theme(axis.title.x=element_text(size="12", color="black")) p<-p + theme(legend.text=element_text(size="12", color="black")) p<-p + theme(legend.title=element_text(size="12", color="black")) p<-p + theme(panel.background=element_rect(fill="ivory")) p<-p + theme(panel.background=element_rect(size=0.5)) p<-p + theme(panel.background=element_rect(colour="grey36")) postscript("Disease clusters exon.eps", horizontal = FALSE, onefile = TRUE, paper = "special", height=11,width=8.5) plot(p) dev.off() ############GO:BP exon cluster panel################ x<-foreach(i=1:64,.combine='rbind') %dopar% { x<-BP.go.cluster.exon[[i]] if(dim(x)[1]>=1) { z<-(rep(i,dim(x)[1])) cbind(x,z)} else{} } names(x)<-c(names(BP.go.cluster.exon),"cluster") blue<-data.frame(x) blue<-cbind(blue,GOnames[blue[,1]]) colnames(blue)<-c("GO","p value","fdr","Cluster","Name") blue<-data.frame(blue) p<-ggplot(blue,aes(Cluster,Name)) p<-p +layer(geom="point",size=3) p<-p + ylab("") p<-p +scale_x_continuous(breaks=c(0,5,10,15,20,25,30,35,40,45,50,55,60,64),limits=c(0,64)) p<-p + theme(panel.grid.minor=element_line(color="white",size=1)) p<-p + theme(axis.text=element_text(size="10", color="black")) p<-p + theme(panel.grid.minor=element_line(color="grey90",size=0.5)) p<-p + theme(panel.grid.major=element_line(color="grey90",size=1)) p<-p + theme(axis.title.x=element_text(size="12", color="black")) p<-p + theme(legend.text=element_text(size="12", color="black")) p<-p + theme(legend.title=element_text(size="12", color="black")) p<-p + theme(panel.background=element_rect(fill="ivory")) p<-p + theme(panel.background=element_rect(size=0.5)) p<-p + theme(panel.background=element_rect(colour="grey36")) postscript("GO clusters exon.eps", horizontal = FALSE, onefile = TRUE, paper = "special", height=11,width=8.5) plot(p) dev.off() #############take autism genes and make a graph file (.gml)###### b<-driven("Autism",rownames(sig.genes)) b<-intersect(rownames(connect),b) x<-induced.subgraph(connect.gr,b) V(x)$label<-SYMBOLS[V(x)$name] m<-V(x)$name z<-intersect(m,Diffstages.st[[5]]) b<-foreach(i=1:length(z), .combine=c) %dopar% {which(m==z[i])} m<-replace(m,b,"UL") z<-intersect(m,Diffstages.st[[4]]) b<-foreach(i=1:length(z), .combine=c) %dopar% {which(m==z[i])} m<-replace(m,b,"DL") z<-intersect(m,Diffstages.st[[3]]) b<-foreach(i=1:length(z), .combine=c) %dopar% {which(m==z[i])} m<-replace(m,b,"CS") z<-intersect(m,Diffstages.st[[2]]) b<-foreach(i=1:length(z), .combine=c) %dopar% {which(m==z[i])} m<-replace(m,b,"ND") z<-intersect(m,Diffstages.st[[1]]) b<-foreach(i=1:length(z), .combine=c) %dopar% {which(m==z[i])} m<-replace(m,b,"Pl") z<-V(x)$name z<-intersect(m,z) b<-foreach(i=1:length(z), .combine=c) %dopar% {which(m==z[i])} m<-replace(m,b,"Multi") V(x)$color<-m write.graph(x,"autism.GML","gml")