setwd("~/1TB/R-package/")
options(stringsAsFactors=FALSE)
library("stringr");library(XML);library(igraph)
####################
## Get dependencies
####################
## Scrape the "Depends:" row of a package's CRAN index page.
##
## pkg_name: name of a CRAN package (single string).
##
## Returns a character vector with the names listed in the Depends field
## (possibly of length zero), or NULL when the page has no Depends row.
## Version constraints, R itself, and packages linked to Bioconductor or
## Omegahat are removed; names that are not hyperlinked on the page
## (e.g. "methods") are kept.
getDependencies <- function(pkg_name) {
  url_st <- "http://cran.r-project.org/web/packages"
  url_end <- "index.html"
  url <- paste(url_st, pkg_name, url_end, sep = "/")
  cran_web <- paste(readLines(url), collapse = "")

  ## Label cell followed by the value cell. Attributes on <td> and white
  ## space between the tags are tolerated so that layout changes of the
  ## page do not break the match.
  cell_pattern <- paste0(
    "<td[^>]*>[[:space:]]*Depends:[[:space:]]*</td>",
    "[[:space:]]*<td[^>]*>(.*?)</td>"
  )
  hit <- regexpr(cell_pattern, cran_web, perl = TRUE)
  if (hit == -1) {
    return(NULL)
  }
  ## Keep only the content of the value cell
  hrefs <- sub(cell_pattern, "\\1", regmatches(cran_web, hit), perl = TRUE)

  ## Remove Bioconductor and Omegahat packages (link and link text)
  hrefs <- gsub("<a [^>]*(bioconductor|omegahat)[^>]*>.*?</a>", "", hrefs,
                perl = TRUE, ignore.case = TRUE)
  ## Drop the remaining markup but keep the link text (the package names)
  hrefs <- gsub("<[^>]+>", "", hrefs)
  ## Remove versions, e.g. "(&ge; 2.10)"
  hrefs <- gsub("\\([^)]*\\)", "", hrefs)

  ## Split the comma separated list and remove white space
  depends_on <- strsplit(hrefs, ",")[[1]]
  depends_on <- gsub("^[[:space:]]+|[[:space:]]+$", "", depends_on)
  ## Remove empty entries and R itself from the dependencies
  depends_on[nchar(depends_on) > 0 & depends_on != "R"]
}
## Estimate the update rate of a CRAN package in releases per day.
##
## pkg_name: name of a CRAN package (single string).
##
## The archive listing of the package is read first. When it contains a
## table, the number of archived files is used as the release count and the
## earliest "Last modified" date as the start date. Otherwise the package is
## treated as having one release, dated by the "Published:" field of its
## index page.
##
## Returns a single numeric value: releases / days since the start date, or
## NA when the rate cannot be computed (no usable date, or the package is
## younger than 32 days).
##
## NOTE(review): the release currently on CRAN is not part of the archive
## listing, so archived packages are presumably undercounted by one
## relative to the single-release branch -- confirm the intended count.
getupdates <- function(pkg_name) {
  ## Date part (text before the first blank) of listing time stamps. Both
  ## "27-Feb-2004 12:46" and "2004-02-27 12:46" are understood; "%b" is
  ## locale dependent.
  parse_listing_date <- function(stamp) {
    day <- sub(" .*", "", as.character(stamp))
    parsed <- as.Date(day, format = "%d-%b-%Y")
    unparsed <- is.na(parsed)
    parsed[unparsed] <- as.Date(day[unparsed], format = "%Y-%m-%d")
    parsed
  }

  archive_url <- paste("http://cran.r-project.org/src/contrib/Archive",
                       pkg_name, "", sep = "/")
  ## A package without archived releases has no archive page. readLines()
  ## fails for it, which is handled like a page without any table.
  archive_page <- tryCatch(
    paste(suppressWarnings(readLines(archive_url, warn = FALSE)),
          collapse = ""),
    error = function(e) ""
  )
  tab <- list()
  if (nzchar(archive_page)) {
    tab <- readHTMLTable(htmlParse(archive_page, asText = TRUE),
                         stringsAsFactors = FALSE)
  }

  if (length(tab) == 0) {
    url_st <- "http://cran.r-project.org/web/packages"
    url_end <- "index.html"
    url <- paste(url_st, pkg_name, url_end, sep = "/")
    cran_web <- paste(readLines(url), collapse = "")
    ## Label cell followed by the value cell of the "Published:" row
    pub_pattern <- paste0(
      "<td[^>]*>[[:space:]]*Published:[[:space:]]*</td>",
      "[[:space:]]*<td[^>]*>(.*?)</td>"
    )
    hit <- regexpr(pub_pattern, cran_web, perl = TRUE)
    if (hit == -1) {
      return(NA_real_)
    }
    published <- sub(pub_pattern, "\\1", regmatches(cran_web, hit),
                     perl = TRUE)
    published <- gsub("^[[:space:]]+|[[:space:]]+$", "", published)
    start <- as.Date(published, format = "%Y-%m-%d")
    ver_num <- 1
  } else {
    ## Reuse the table parsed above instead of downloading the page again.
    listing <- as.data.frame(na.omit(tab[[1]]))
    ## The first remaining row is not counted as a release.
    listing <- listing[-1, , drop = FALSE]
    ver_num <- nrow(listing)
    dates <- parse_listing_date(listing[, "Last modified"])
    dates <- dates[!is.na(dates)]
    if (length(dates) == 0) {
      return(NA_real_)
    }
    ## The listing is not necessarily in chronological order, so take the
    ## earliest date rather than the one in the first row.
    start <- min(dates)
  }

  days <- as.numeric(Sys.Date() - start) + 1
  if (is.na(days) || days < 32) {
    return(NA_real_)
  }
  ver_num / days
}
#Introduction to Functional Dependency Network Analysis (FDNA), https://esd.mit.edu/symp09/presentations/day3.session4c.garvey.pdf
# First table of CRAN's by-name package index. Neither `tab` nor `x` is read
# again by the statements below.
tab <- readHTMLTable("http://cran.r-project.org/web/packages/available_packages_by_name.html")
x <- tab[[1]]

# Package names come from a local snapshot file; empty fields are dropped.
cran_packages <- read.table("CRAN_R_package_updated_20140909.txt")
cran_packages <- cran_packages[cran_packages != ""] # unlist(strsplit(depends_on, " "))

# One update rate per package, filled in by querying CRAN package by package.
packages_ur <- data.frame(package = cran_packages, ur = 0,
                          stringsAsFactors = FALSE)
for (i in seq_len(nrow(packages_ur))) {
  rate <- getupdates(packages_ur[i, "package"])
  # A zero-length result (such as NULL) cannot be assigned into a single
  # cell, so it is stored as NA instead of aborting the whole crawl.
  packages_ur[i, "ur"] <- if (length(rate) == 1) rate else NA_real_
  cat(i, ":", packages_ur[i, "ur"], "\n")
}

# Cache the rates on disk and read them back from the cache.
write.table(packages_ur, file = "data/CRAN_package_update_rate.txt",
            row.names = FALSE, sep = "\t", quote = FALSE)
packages_ur <- read.table("data/CRAN_package_update_rate.txt", header = TRUE)
# Build the dependency edge list: one row per (package, dependency) pair.
# Dependencies are gathered per package first, so no fixed-size buffer is
# needed and no unused trailing row ends up in the table.
dep_list <- vector("list", length(cran_packages))
for (i in seq_along(cran_packages)) {
  dependencies <- getDependencies(cran_packages[i])
  cat(i, ":", dependencies, "\n")
  # Assigning NULL would delete the list slot, so only store real results.
  if (length(dependencies) > 0) {
    dep_list[[i]] <- dependencies
  }
}
n_deps <- vapply(dep_list, length, integer(1))
from <- rep(cran_packages, times = n_deps)
to <- as.character(unlist(dep_list, use.names = FALSE))
dep_df <- data.frame(from = from, to = to, stringsAsFactors = FALSE)

# Written below data/, which is where the table is read back from later in
# this script.
write.table(dep_df,
            file = "data/CRAN_R_package_dependency_network_20140909.txt",
            row.names = FALSE, quote = FALSE, sep = "\t")
# Reload the cached edge list and update rates.
dep_df <- read.table("data/CRAN_R_package_dependency_network_20140909.txt",
                     header = TRUE)
packages_ur <- read.table("data/CRAN_package_update_rate.txt", header = TRUE)

# Keep packages with fewer than one release per 30 days. which() drops the
# packages whose rate is NA; plain logical indexing would turn each of them
# into a row consisting only of NAs.
packages_ur <- packages_ur[which(packages_ur$ur < 1 / 30), ]
dep_df <- dep_df[!dep_df$to %in% c("R", "methods"), ] # %in% find patterns listed on the left in the right

# Translate package names into row indices of packages_ur; edges with an
# endpoint outside packages_ur are discarded.
el <- na.omit(cbind(match(dep_df[, 1], packages_ur[, 1]),
                    match(dep_df[, 2], packages_ur[, 1])))
g <- graph.edgelist(as.matrix(el))
# graph.edgelist() only creates vertices up to the largest index used by an
# edge. Pad the graph so that vertex k always corresponds to row k of
# packages_ur, which the attribute assignments and the correlation below
# rely on.
n_missing <- nrow(packages_ur) - vcount(g)
if (n_missing > 0) {
  g <- add.vertices(g, n_missing)
}

#V(g)$size=1+2*log(graph.strength(g))
#rbPal=colorRampPalette(c('yellow','red'));
#V(g)$color=ifelse(packages_ur$ur>0.002,'red','yellow')#
#V(g)$color=rbPal(3)[as.numeric(cut(rank(packages_ur$ur),breaks =3))]
# Colour and size by update rate: above 0.01 per day red, below 0.002 per
# day yellow, everything in between white.
V(g)$color <- "white"
V(g)$color[packages_ur$ur > 0.01] <- "red"
V(g)$color[packages_ur$ur < 0.002] <- "yellow"
V(g)$size <- 1
V(g)$size[packages_ur$ur > 0.01] <- 2
V(g)$size[packages_ur$ur < 0.002] <- 2.5

# Rank correlation between in-degree and update rate, restricted to
# vertices with at least one incoming edge.
degin <- degree(g, mode = "in") #degin=betweenness(g)
cor.test(degin[degin > 0], packages_ur$ur[degin > 0], method = "spearman")

# Subgraph of all vertices that take part in at least one edge.
degtot <- degree(g, mode = "total")
new_g <- induced.subgraph(g, vids = which(degtot > 0))
table(V(new_g)$color)
#layout_spring=layout.spring(new_g)

# Figure 1: update rate against in-degree on log-log axes, for the vertices
# that have at least one incoming edge.
pdf("R_package_indegree_vs_updaterate2.pdf")
#plot(x=degin[degin>0],y=packages_ur$ur[degin>0],log='xy',lwd=2,pch=1,col='blue2',main='spearman cor=0.34, p<2.2e-16',xlab='in-degree',ylab='update rate per day')
plot(
  x = degin[degin > 0],
  y = packages_ur$ur[degin > 0],
  log = "xy",
  lwd = 2,
  pch = 1,
  col = "blue2",
  las = 1,
  xlab = "",
  ylab = ""
)
dev.off()

# Figure 2: the dependency network restricted to connected vertices, drawn
# with the colours and sizes stored on the vertices and without labels.
pdf("R_package_dependency_network_spring_lightgrey4.pdf")
plot.igraph(
  new_g,
  vertex.color = V(new_g)$color,
  vertex.label = NA,
  vertex.size = V(new_g)$size,
  vertex.frame.color = 240, #vertex.frame.width=0.02,
  layout = layout.spring, #layout.fruchterman.reingold
  edge.arrow.size = 0.01,
  edge.color = 240,
  edge.width = 0.03
)
#legend('bottomright',c('Frequently updated packages','Slowly updated packages','Others'),pch=21,pt.bg=c('red','yellow','white'),bty='n',cex=1,horiz=F)
dev.off()
# ###########
# #Main Page, http://csgillespie.wordpress.com/2011/03/23/graphical-display-of-r-package-dependencies/
# # NOTE: the HTML tags inside the patterns below appear to have been lost;
# # restore them from the page above before re-enabling this code.
# url = "http://cran.r-project.org/web/packages/"
# cran_web_page = paste(readLines(url), collapse="")
#
# main_table = gsub('.*.*', "\\1", cran_web_page)
# main_table = gsub('
# ', "", main_table)
#
# depends_on =
# gsub('[0-9A-Za-z\\.]* | .*? |
# ',
# "\\1 ", main_table)