Lineage Metrics Analysis

This document contains all of the R code used to analyze data for this paper. The input csv files were generated from the raw data via Python scripts that are also included in this directory.

First let’s read in the data:

# Pull in data over all evolutionary time
real_value_data  <- read.csv("real_value_data.csv")

# Convert problem and selection type codes to informative names. Convert things that should be factors to factors
real_value_data$problem <- sapply(as.character(real_value_data$p),switch,"0"="Himmelblau","1"="Six-hump camel back","2"="Shubert", "6"="Composition function 2")
real_value_data$problem <- as.factor(real_value_data$problem)
real_value_data$selection <- sapply(as.character(real_value_data$selection),switch,"0"="Tournament","1"="Lexicase","2"="Eco-EA", "3"="MAP-Elites", "4"="Roulette", "5"="Drift")
real_value_data$ts <- with(real_value_data, ifelse(selection=="Drift", 1, ts)) # Drift is equivalent to tournament size 1
real_value_data$selection <- as.factor(real_value_data$selection)
real_value_data$seed <- as.factor(real_value_data$seed)
real_value_data$elite <- as.factor(real_value_data$elite)

# Grab the portion of data we're working with
tournament_data <- subset(real_value_data, (real_value_data$selection == "Tournament" | real_value_data$selection=="Drift") & real_value_data$elite == 0)
tournament_endpoints <- filter(tournament_data, update==5000)
tournament_endpoints$disp_pairwise_distance <- tournament_endpoints$variance_pairwise_distance/tournament_endpoints$mean_pairwise_distance
tournament_endpoints$disp_evo_distinct <- tournament_endpoints$variance_evolutionary_distinctiveness/tournament_endpoints$mean_evolutionary_distinctiveness

# Pull in data that was calculated post-hoc just for the end points
all_dom_data <- read_csv("all_dom_data.csv")
all_dom_data$update <- 5000
all_dom_data$seed <- all_dom_data$RANDOM_SEED
all_dom_data$RANDOM_SEED <- NULL
all_dom_data$PROBLEM <- NULL
all_dom_data$DATA_DIRECTORY <- NULL
all_dom_data$RUN_MODE <- NULL
all_dom_data$path <- NULL

# Combine data
tournament_endpoints <- merge(tournament_endpoints, all_dom_data)

# Melt the data so that we can easily make graphs comparing different variables
melted_tournament_data <- melt(tournament_endpoints, id.vars = c("update", "selection", "seed", "elite", "mutation_rate", "le", "p", "problem", "g", "ts", "TOURNAMENT_SIZE", "FITNESS_INTERVAL", "RESOURCE_SELECT__OUTFLOW", "MUTATION_STD", "SELECTION_METHOD", "RESOURCE_SELECT__RES_AMOUNT", "HINT_GRID_RES", "RESOURCE_SELECT__RES_INFLOW", "LEXICASE_EPSILON", "SYSTEMATICS_INTERVAL", "RESOURCE_SELECT__FRAC", "RESOURCE_SELECT__COST", "RESOURCE_SELECT__MAX_BONUS", "POP_SNAPSHOT_INTERVAL", "POP_SIZE", "ELITE_SELECT__ELITE_CNT", "GENERATIONS"))

# Pull out just the variables we care about
end_points_vars_of_interest <- melted_tournament_data %>%  filter(variable %in% c("mrca_depth", "dominant_deleterious_steps", "current_phylogenetic_diversity", "mean_pairwise_distance", "phenotypic_volatility", "total_magnitude"))

# Pull out vairables for supplementary info
extended_end_points_vars_of_interest <- melted_tournament_data %>%  filter(variable %in% c("mrca_depth", "dominant_deleterious_steps", "current_phylogenetic_diversity", "mean_pairwise_distance", "phenotypic_volatility", "total_magnitude", "variance_pairwise_distance", "mean_evolutionary_distinctiveness", "variance_evolutionary_distinctiveness"))

# Convert variable names to useful axis labels
labels <- c(current_phylogenetic_diversity = "Phylogenetic diversity", dominant_deleterious_steps = "Deleterious steps", mean_evolutionary_distinctiveness="Mean evo. distinctiveness", mean_pairwise_distance="Mean pairwise distance", mrca_depth="MRCA depth", variance_evolutionary_distinctiveness="Variance evo. distinctiveness", variance_pairwise_distance = "Variance pairwise distance", phenotypic_volatility="Phenotypic volatility", total_magnitude="Mutation accumulation")

Tournament size results

Okay, now that we’ve got out data, let’s plot it. First, let’s look at how tournament size effects each of our variables (holding mutation rate constant at .001).

ggplot(subset(end_points_vars_of_interest, end_points_vars_of_interest$mutation_rate==.001), aes(x=ts, y=value, fill=problem, color=problem, group=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Tournament size", breaks=c(1,2,4,8,16)) + scale_y_log10("", labels=scientific, breaks = scales::trans_breaks("log10", function(x) 10^x)) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels),ncol=2) + scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")

ggsave("../figs/all_ts.png", width=3.5, height=4.5)

We don’t have space for this in the paper, but let’s also take a look at the next mutation rate up, just to make sure this isn’t being driven by the mutation rate. While we’re at it, lets also graph some additional metrics:

ggplot(subset(extended_end_points_vars_of_interest, end_points_vars_of_interest$mutation_rate==.01), aes(x=ts, y=value, fill=problem, color=problem, group=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Tournament size", breaks=c(1,2,4,8,16)) + scale_y_log10("", labels=scientific, breaks = scales::trans_breaks("log10", function(x) 10^x)) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels),ncol=2) + scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")

And lets look at the lowest mutation rate, too:

ggplot(subset(extended_end_points_vars_of_interest, end_points_vars_of_interest$mutation_rate==.00000001), aes(x=ts, y=value, fill=problem, color=problem, group=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Tournament size", breaks=c(1,2,4,8,16)) + scale_y_log10("", labels=scientific, breaks = scales::trans_breaks("log10", function(x) 10^x)) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels),ncol=2) + scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")

Mutation rate results

Okay, now holding tournament size constant at 4, what happens when we vary mutation rate?

ggplot(subset(end_points_vars_of_interest, end_points_vars_of_interest$ts==4), aes(x=mutation_rate, y=value, fill=problem, color=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Mutation rate", labels=scientific) + scale_y_log10("", breaks = scales::trans_breaks("log10", function(x) 10^x), labels=scientific) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels), ncol=2) +scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")

ggsave("../figs/all_mutation_rate.png", width=3.5, height=4.5)

And lets try a couple moure tournament sizes, just to be sure:

ggplot(subset(extended_end_points_vars_of_interest, end_points_vars_of_interest$selection=="Drift"), aes(x=mutation_rate, y=value, fill=problem, color=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Mutation rate", labels=scientific) + scale_y_log10("", breaks = scales::trans_breaks("log10", function(x) 10^x), labels=scientific) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels), ncol=2) +scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")

ggplot(subset(extended_end_points_vars_of_interest, end_points_vars_of_interest$ts==2), aes(x=mutation_rate, y=value, fill=problem, color=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Mutation rate", labels=scientific) + scale_y_log10("", breaks = scales::trans_breaks("log10", function(x) 10^x), labels=scientific) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels), ncol=2) +scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")

ggplot(subset(extended_end_points_vars_of_interest, end_points_vars_of_interest$ts==8), aes(x=mutation_rate, y=value, fill=problem, color=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Mutation rate", labels=scientific) + scale_y_log10("", breaks = scales::trans_breaks("log10", function(x) 10^x), labels=scientific) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels), ncol=2) +scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")

Coexistance results

A lot of these metrics will be substantially changed by having coexistence. Let’s try using Eco-EA to generate coexistence and see how it affects our metrics.

#Coexistence figure

# Grab the portion of data we're working with
eco_ea_data <- subset(real_value_data, real_value_data$selection == "Eco-EA" & real_value_data$elite == 1)
eco_ea_endpoints <- filter(eco_ea_data, update==5000)
eco_ea_endpoints$disp_pairwise_distance <- eco_ea_endpoints$variance_pairwise_distance/eco_ea_endpoints$mean_pairwise_distance
eco_ea_endpoints$disp_evo_distinct <- eco_ea_endpoints$variance_evolutionary_distinctiveness/eco_ea_endpoints$mean_evolutionary_distinctiveness

eco_ea_endpoints <- merge(eco_ea_endpoints, all_dom_data)
melted_eco_ea_data <- melt(eco_ea_endpoints, id.vars = c("update", "selection", "seed", "elite", "mutation_rate", "le", "p", "problem", "g", "ts", "TOURNAMENT_SIZE", "FITNESS_INTERVAL", "RESOURCE_SELECT__OUTFLOW", "MUTATION_STD", "SELECTION_METHOD", "RESOURCE_SELECT__RES_AMOUNT", "HINT_GRID_RES", "RESOURCE_SELECT__RES_INFLOW", "LEXICASE_EPSILON", "SYSTEMATICS_INTERVAL", "RESOURCE_SELECT__FRAC", "RESOURCE_SELECT__COST", "RESOURCE_SELECT__MAX_BONUS", "POP_SNAPSHOT_INTERVAL", "POP_SIZE", "ELITE_SELECT__ELITE_CNT", "GENERATIONS"))
eco_ea_vars_of_interest <- melted_eco_ea_data %>%  filter(variable %in% c("mrca_depth", "dominant_deleterious_steps", "current_phylogenetic_diversity", "mean_pairwise_distance", "phenotypic_volatility", "total_magnitude"))

extended_eco_ea_vars_of_interest <- melted_eco_ea_data %>%  filter(variable %in% c("mrca_depth", "dominant_deleterious_steps", "current_phylogenetic_diversity", "mean_pairwise_distance", "phenotypic_volatility", "total_magnitude", "mean_evolutionary_distinctiveness", "variance_evolutionary_distinctiveness", "variance_pairwise_distance"))


ggplot(eco_ea_vars_of_interest, aes(x=mutation_rate, y=value, fill=problem, color=problem, group=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Mutation rate") + scale_y_log10("", labels=scientific, breaks = scales::trans_breaks("log10", function(x) 10^x)) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels),ncol=2) + scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")

ggsave("../figs/eco_mutation_rate.png", width=3.5, height=4.5)

Looks like theres some interesting stuff going on with some of the other variables, but we definitely don’t have space in the paper to explore it:

ggplot(extended_eco_ea_vars_of_interest, aes(x=mutation_rate, y=value, fill=problem, color=problem, group=problem)) + stat_summary(fun.y = median, fun.ymin = function(x){quantile(x)[2]}, fun.ymax=function(x){quantile(x)[4]}) +stat_summary(geom="smooth", fun.data="mean_cl_boot")+ theme_classic() + scale_x_log10("Mutation rate") + scale_y_log10("", labels=scientific, breaks = scales::trans_breaks("log10", function(x) 10^x)) + facet_wrap(~variable, scales="free_y", labeller = labeller(variable=labels),ncol=2) + scale_color_discrete("Problem") + theme(legend.position = "bottom", legend.title = element_text(size=10), legend.text = element_text(size=6), strip.text = element_text(size=6), axis.text = element_text(size=6)) + guides(color=guide_legend(ncol=2), fill="none")