# NOTE(review): machine-specific working directory; every relative path below
# is resolved against it.
setwd("~/1TB/R-package/")
options(stringsAsFactors = FALSE)
library("stringr")
library(XML)
library(igraph)

####################
## Get dependencies
####################

# Scrape the "Depends:" field from a package's CRAN index page.
#
# pkg_name: package name (character scalar), used to build the page URL.
#
# Returns a character vector of dependency names (possibly empty), or NULL
# when the page contains no "Depends:" field. A page that cannot be downloaded
# still raises the readLines() error.
#
# NOTE(review): the previous tag-matching patterns had lost their HTML
# literals (e.g. the "Clean the td & tr tags" step was gsub('.*', "", ...),
# which erases everything). The steps below avoid depending on the exact
# markup; the remaining assumptions are marked inline.
getDependencies <- function(pkg_name) {
  url_st <- "http://cran.r-project.org/web/packages"
  url_end <- "index.html"
  url <- paste(url_st, pkg_name, url_end, sep = "/")
  cran_web <- paste(readLines(url), collapse = "")
  if (regexpr("Depends:", cran_web) == -1) {
    return(NULL)
  }

  ## Keep only what follows the "Depends:" label
  cell <- gsub('(.*Depends:)', "", cran_web)
  ## Cut at the end of that table row
  ## (assumes each field is its own <tr> row -- TODO confirm)
  cell <- sub("</tr>.*", "", cell, ignore.case = TRUE)
  ## Remove Bioconductor / Omegahat entries
  ## (assumes non-CRAN packages are the ones linked with absolute URLs,
  ## while CRAN packages use relative links -- TODO confirm)
  cell <- gsub("<a[^>]*href=\"[A-Za-z]+://[^\"]*\"[^>]*>.*?</a>", "", cell,
               perl = TRUE)
  ## Drop the remaining tags, keeping the link text
  cell <- gsub("<[^>]*>", "", cell)
  ## Remove versions, e.g. "(&ge; 2.10)"
  cell <- gsub("\\([^)]*\\)", "", cell)

  ## Split on commas, trim white space, drop empty entries and R itself
  depends_on <- str_trim(strsplit(cell, ",")[[1]])
  depends_on[nchar(depends_on) > 0 & depends_on != "R"]
}

# Estimate how often a package is updated, in versions per day.
#
# pkg_name: package name (character scalar).
#
# The CRAN Archive listing supplies the number of listed versions and the
# date in its first remaining row. When no listing table is available, the
# "Published:" date on the package index page is used with a version count
# of 1.
#
# Returns versions / days-since-start, or NA when the start date is unknown
# or less than 32 days ago.
getupdates <- function(pkg_name) {
  url_st <- "http://cran.r-project.org/src/contrib/Archive"
  url <- paste(url_st, pkg_name, "", sep = "/")

  # Fetch the listing once with base R (getURL() is not provided by any
  # package attached in this file). A failed download is treated as
  # "no archive" instead of aborting the whole crawl.
  archive_html <- tryCatch(
    suppressWarnings(paste(readLines(url), collapse = "")),
    error = function(e) NULL
  )
  tab <- list()
  if (!is.null(archive_html) && nzchar(archive_html)) {
    tab <- readHTMLTable(htmlParse(archive_html, asText = TRUE),
                         stringsAsFactors = FALSE)
  }

  if (length(tab) == 0) {
    url_st <- "http://cran.r-project.org/web/packages"
    url_end <- "index.html"
    url <- paste(url_st, pkg_name, url_end, sep = "/")
    cran_web <- paste(readLines(url), collapse = "")
    if (regexpr("Published:", cran_web) == -1) {
      # NA (not NULL) so the caller can store the result in a data frame cell.
      return(NA_real_)
    }
    # First ISO date after the label; as.Date() below needs that format.
    hit <- regmatches(
      cran_web,
      regexpr("Published:.*?[0-9]{4}-[0-9]{2}-[0-9]{2}", cran_web, perl = TRUE)
    )
    if (length(hit) == 0) {
      return(NA_real_)
    }
    start <- as.Date(substring(hit, nchar(hit) - 9))
    ver_num <- 1
  } else {
    archive <- as.data.frame(na.omit(tab[[1]]))
    archive <- archive[-1, ]  # first remaining row is not a version entry
    ver_num <- nrow(archive)
    # "Last modified" is expected as "dd-Mon-yyyy hh:mm"; keep the date part.
    start <- as.Date(strsplit(archive[1, 'Last modified'], ' ')[[1]][1],
                     "%d-%b-%Y")
    #as.numeric(unlist(strsplit(strsplit(tab[1,'Last modified'],' ')[[1]][1],'-'))[3])
    #last=as.Date(strsplit(tab[ver_num,'Last modified'],' ')[[1]][1],"%d-%b-%Y")
  }

  days <- as.numeric(Sys.Date() - start) + 1
  if (is.na(days) || days < 32) {
    return(NA_real_)
  }
  ver_num / days
}

#Introduction to Functional Dependency Network Analysis (FDNA), https://esd.mit.edu/symp09/presentations/day3.session4c.garvey.pdf
tab <- readHTMLTable('http://cran.r-project.org/web/packages/available_packages_by_name.html')
x <- tab[[1]]
cran_packages <- read.table('CRAN_R_package_updated_20140909.txt')
cran_packages <- cran_packages[cran_packages != ""]  #unlist(strsplit(depends_on, " "))

# Update rate of every package (one or two web requests per package).
packages_ur <- data.frame(package = cran_packages, ur = 0)
for (i in seq_len(nrow(packages_ur))) {
  packages_ur[i, 'ur'] <- getupdates(packages_ur[i, 'package'])
  cat(i, ":", packages_ur[i, 'ur'], "\n")
}
write.table(packages_ur, file = 'data/CRAN_package_update_rate.txt',
            row.names = FALSE, sep = '\t', quote = FALSE)
packages_ur <- read.table('data/CRAN_package_update_rate.txt', header = TRUE)

# Dependency edges: one row per (package, dependency) pair. Collecting the
# pieces in a list avoids index bookkeeping and the blank trailing row that
# the former dep_df[1:j, ] kept.
edge_parts <- vector("list", length(cran_packages))
for (i in seq_along(cran_packages)) {
  dependencies <- getDependencies(cran_packages[i])
  cat(i, ":", dependencies, "\n")
  if (length(dependencies) > 0) {
    edge_parts[[i]] <- data.frame(from = cran_packages[i], to = dependencies,
                                  stringsAsFactors = FALSE)
  }
}
dep_df <- do.call(rbind, edge_parts)
if (is.null(dep_df)) {
  dep_df <- data.frame(from = character(0), to = character(0))
}

# Write to the same path that is read back below (the write used to go to the
# working directory while the read looked in data/).
dep_file <- 'data/CRAN_R_package_dependency_network_20140909.txt'
write.table(dep_df, file = dep_file, row.names = FALSE, quote = FALSE,
            sep = '\t')

dep_df <- read.table(dep_file, header = TRUE)
packages_ur <- read.table('data/CRAN_package_update_rate.txt', header = TRUE)
# which() drops packages whose rate is NA instead of creating all-NA rows.
packages_ur <- packages_ur[which(packages_ur$ur < 1 / 30), ]
# x %in% y is TRUE for each element of x that occurs in y
dep_df <- dep_df[!dep_df$to %in% c('R', 'methods'), ]
# Edges as row indices into packages_ur; pairs with an unknown end are dropped.
el <- na.omit(cbind(match(dep_df[, 1], packages_ur[, 1]),
                    match(dep_df[, 2], packages_ur[, 1])))
g <- graph.edgelist(as.matrix(el))
#V(g)$size=1+2*log(graph.strength(g))
#rbPal=colorRampPalette(c('yellow','red'));
#V(g)$color=ifelse(packages_ur$ur>0.002,'red','yellow')#
#V(g)$color=rbPal(3)[as.numeric(cut(rank(packages_ur$ur),breaks =3))]

# Style vertices by update rate: red above 0.01 per day, yellow below 0.002,
# white otherwise. Vertices are indexed by packages_ur row, so the rates are
# first cut to the number of vertices actually present in g; which() keeps NA
# rates out of the subscripted assignments.
vertex_ur <- packages_ur$ur[seq_len(length(V(g)))]
fast <- which(vertex_ur > 0.01)
slow <- which(vertex_ur < 0.002)
V(g)$color <- 'white'
V(g)$color[fast] <- 'red'
V(g)$color[slow] <- 'yellow'
V(g)$size <- 1
V(g)$size[fast] <- 2
V(g)$size[slow] <- 2.5

# Rank correlation between in-degree and update rate, restricted to vertices
# with at least one incoming edge.
degin <- degree(g, mode = 'in')  #degin=betweenness(g)
has_in <- degin > 0
cor.test(degin[has_in], vertex_ur[has_in], method = 'spearman')

# Keep only vertices that take part in at least one edge.
degtot <- degree(g, mode = 'total')
new_g <- induced.subgraph(g, vids = which(degtot > 0))
table(V(new_g)$color)

#layout_spring=layout.spring(new_g)
pdf('R_package_indegree_vs_updaterate2.pdf')
#plot(x=degin[degin>0],y=packages_ur$ur[degin>0],log='xy',lwd=2,pch=1,col='blue2',main='spearman cor=0.34, p<2.2e-16',xlab='in-degree',ylab='update rate per day')
plot(x = degin[has_in], y = vertex_ur[has_in], log = 'xy', lwd = 2, pch = 1,
     col = 'blue2', las = 1, xlab = '', ylab = '')
dev.off()

#layout_spring=layout.spring(new_g)
pdf('R_package_dependency_network_spring_lightgrey4.pdf')
plot.igraph(new_g,
            vertex.color = V(new_g)$color,
            vertex.label = NA,
            vertex.size = V(new_g)$size,
            vertex.frame.color = 240,  #vertex.frame.width=0.02,
            layout = layout.spring,    #layout.fruchterman.reingold
            edge.arrow.size = 0.01,
            edge.color = 240,
            edge.width = 0.03)
#legend('bottomright',c('Frequently updated packages','Slowly updated packages','Others'),pch=21,pt.bg=c('red','yellow','white'),bty='n',cex=1,horiz=F)
dev.off()

# ###########
# #Main Page, http://csgillespie.wordpress.com/2011/03/23/graphical-display-of-r-package-dependencies/
# NOTE(review): the disabled patterns below appear to have lost their HTML
# tag literals; restore them from the page above before re-enabling.
# url = "http://cran.r-project.org/web/packages/"
# cran_web_page = paste(readLines(url), collapse="")
#
# main_table = gsub('.*(.*).*', "\\1", cran_web_page)
# main_table = gsub('', "", main_table)
#
# depends_on =
#   gsub('[0-9A-Za-z\\.]*.*?',
#        "\\1 ", main_table)