pGLS in R
Phylogenetic generalized least squares analysis in R¶
We start by importing packages:
library(ape)
library(phytools)
library(geiger)
library(nlme)
library(caper)
library(dplyr)
library(IRdisplay)
We now read in the data to analyze. For pGLS, we will work with species means, so we prepare that dataframe here.
df<-read.csv("20190322_supp_table_2.csv", header=TRUE)
df_ave = data.frame(aggregate(select(df, 'mass..g.', 'area..mm.2.', 'depth..mm.'), list(df$species, df$spiracle), mean))
colnames(df_ave) = c('species', 'spiracle', 'mass.g', 'area.mm.2', 'depth.mm')
print('Raw dataframe')
head(df)
print('Species averaged dataframe')
head(df_ave, 15)
To do pGLS, we need a phylogenetic tree of our study species. We will manually define that here and visualize it to makes sure it matches the tree we spliced together and show here.
tree4 <- read.tree(text="(((Cyclocephala_borealis:1.0,(Dynastes_hercules:1.0,Trypoxylus_dichotomus:1.0):1.0):1.0,Popilia_japonica:1.0):1.0,(Protaetia_orientalis:1.0,((((Coelorrhina_hornimani:1.0,Dicronorrhina_derbyana:1):1,Mecynorrhina_torquata:1.0):1,Eudicella_euthalia:1.0):1,Goliathus_goliathus:1.0):1.0):1.0):1.0;")
plot.phylo(tree4)
Looks good! We will now sort the data so as to match the tree for analysis.
spir <- 'T'
tree <- tree4
digits <- 5
tips <- gsub("_", " ", tree$tip.label)
tree$tip.label <- tips
df_ave_sort = df_ave[df_ave['spiracle'] == spir,][match(tips, df_ave[df_ave['spiracle'] == spir,]$species),]
df_ave_sort$log_area_dist <- round(log10(df_ave_sort$area.mm.2/df_ave_sort$depth.mm), digits = digits)
df_ave_sort$log_area <- round(log10(df_ave_sort$area.mm.2), digits = digits)
df_ave_sort$log_mass <- round(log10(df_ave_sort$mass.g), digits = digits)
df_ave_sort$log_dist <- round(log10(df_ave_sort$depth.mm), digits = digits)
df_ave_sort
This also looks good, though is reversed as compared to the branch tips on the tree. We hence reverse the dataframe an then load it into an object for use with the package that does pGLS.
comparative_data <- comparative.data(tree, df_ave_sort[nrow(df_ave_sort):1,], species)
With this data structure in hand, we can run the model. We will also run a linear regression with ordinary least squares to compare to the phylogenetic model. We use the maximum likelihood to select the $\lambda$ parameter that describes the strength of the phylogenetic covariance. We will look more into this parameter later.
model1<-pgls(log_area~log_mass, comparative_data, lambda="ML")
model2<-lm(log_area~log_mass, df_ave_sort)
plot(log_area~log_mass, data=df_ave_sort)
abline(model1, col='blue')
abline(model2, col='red')
legend ("bottomright",legend=c("lm", "pgls"),cex =0.75,lty=c(1,1,1),inset=0.03, seg.len = 2, col=c('red', 'blue'),bg="white")
This looks good! The two models give similar outputs. We now define a function to generate such a figure for models, which will be useful when we wish to make a bunch of models for each spiracle.
panel_fig <- function(model1, model2, data, o, reg, slope, x, y){
#pdf(paste("./plots/", paste(paste(o, reg, sep=""), "_regressions.pdf", sep = ""), sep = ""), width=10, height=4)
png(paste("./plots/", paste(paste(o, reg, sep=""), "_regressions.png", sep = ""), sep = ""), width=1000, height=400)
par(mfrow=c(1,2),mar = c(2,3,2,0))
par(lwd=1)
par(mar = c(3,3,2,1) + 0.1)
plot(x, y,main=paste("Spiracle ", o, " Slopes"))
for(ent in seq(-6,4,(max(y)-min(y))/20 )) {
abline(coef =c(ent,slope), col=rgb(red = 0.2,green = 0.2,blue = 0.2,alpha=0.1))
}
abline(model1,col='blue')
abline(model2,col='red')
title(ylab=paste("log(", reg, ")"), line=2, cex.lab=1)
title(xlab="log(mass)", line=2, cex.lab=1)
par(font=2)
#legend ("topleft", legend="d)",bty = "n")
par(font=1)
legend ("bottomright",legend=c("isometry","lm", "pgls"),cex =0.75,lty=c(1,1,1),inset=0.03, seg.len = 2,
col=c(rgb(red = 0.2,green = 0.2,blue = 0.2,alpha=0.1),
'red',
'blue'),
bg="white")
profile_lambda=pgls.profile(model1, which="lambda") # vary lambda
par(mar=c(7,4,2,2))
plot(profile_lambda)
dev.off()
}
prep_data <- function(spir, tree, digits, df_ave) {
tips <- gsub("_", " ", tree$tip.label)
tree$tip.label <- tips
df_ave_sort = df_ave[df_ave['spiracle'] == spir,][match(tips, df_ave[df_ave['spiracle'] == spir,]$species),]
df_ave_sort$log_area_dist <- round(log10(df_ave_sort$area.mm.2/df_ave_sort$depth.mm), digits = digits)
df_ave_sort$log_area <- round(log10(df_ave_sort$area.mm.2), digits = digits)
df_ave_sort$log_mass <- round(log10(df_ave_sort$mass.g), digits = digits)
df_ave_sort$log_area_2_dist <- round(log10((df_ave_sort$area.mm.2^2)/df_ave_sort$depth.mm), digits = digits)
df_ave_sort$log_dist <- round(log10(df_ave_sort$depth.mm), digits = digits)
return(df_ave_sort)
}
We can now loop through all the spiracles and generate figures for each of their morphological values.
spir <- '5'
tree <- tree4
tips <- gsub("_", " ", tree$tip.label)
tree$tip.label <- tips
digits <- 4
df_con <- df_ave
for (spir in c('T', 'S', '1', '2', '3', '4', '5', '6')) {
df_ave_sort <- prep_data(spir, tree, digits, df_ave)
comparative_data <- comparative.data(tree, df_ave_sort[nrow(df_ave_sort):1,], species)
model1<-pgls(log_area~log_mass, comparative_data, lambda="ML")
model2<-lm(log_area~log_mass, df_ave_sort)
model3<-pgls(log_area_dist~log_mass, comparative_data, lambda="ML")
model4<-lm(log_area_dist~log_mass, df_ave_sort)
model5<-pgls(log_area_2_dist~log_mass, comparative_data, lambda="ML")
model6<-lm(log_area_2_dist~log_mass, df_ave_sort)
model7<-pgls(log_dist~log_mass, comparative_data, lambda="ML")
model8<-lm(log_dist~log_mass, df_ave_sort)
panel_fig(model1, model2, df_ave_sort, spir, "_area", 0.67, df_ave_sort$log_mass, df_ave_sort$log_area)
panel_fig(model3, model4, df_ave_sort, spir, "_area_over_dist", 0.33, df_ave_sort$log_mass, df_ave_sort$log_area_dist)
panel_fig(model5, model6, df_ave_sort, spir, "_area_2_over_dist", 1.00, df_ave_sort$log_mass, df_ave_sort$log_area_2_dist)
panel_fig(model7, model8, df_ave_sort, spir, "_dist", 0.33, df_ave_sort$log_mass, df_ave_sort$log_dist)
}
With these plots generated, we can visualize them all.
Plot for species averaged mass vs species averaged spiracle area (log transformed)¶
Plot for species averaged mass vs species averaged spiracle depth (log transformed)¶
Species averaged mass vs $\frac{\mathrm{species\,averaged\,area}}{\mathrm{species\,averaged\,depth}}$ (log transformed)¶
Species averaged mass vs $\frac{(\mathrm{species\,averaged\,area})^2}{\mathrm{species\,averaged\,depth}}$ (log transformed)¶
We can see that the phylogenetic and non-phylogenetic models tend to return very similar results to each other. Also note the weakly peaked distribution in log-likelihood space for the $\lambda$ parameter. No model gave a value for $\lambda$ that was significantly different than zero (which represents no phylogenetic signal). The large areas of parameter space that give similar likelihood for the parameter suggests that the data is not much informing this parameter value, that is, it is non-identifiable. We Further show that the parameter is non-identifiable with a Bayesian analyis, but already can see it here as well. Selecting a single maximum likelihood value for the $\lambda$ based on these likelihood functions seems dubious, and hence we opt to not use the phylogenetic covariance model for our analyses.