## Written by Josefin Stiller 2023
library(tidyverse)
library(magrittr)
library(plotrix)

## Set working directory to where this R script sits
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

## Read the master table with metrics for all gene trees
t <- read.table("master_table_gene_trees.txt", header=T, sep="\t")

## Keep only intergenic regions and those that are assigned to chromosome types and those that are part of the 80k locus set (excluding exons)
t %>% filter(datatype=="random_region" & !is.na(chr_type) & part_of_80K == "Y") -> t

## For each chromosome, calculate the mean % GC standard deviation and SE
t %>% group_by(chromosome, chr_type) %>% summarize(mean=mean(GC_sd), se=std.error(GC_sd)) -> t2

## Some reformatting of chromosome names for plotting
t2$chr_num <- factor(gsub("chr", "", t2$chromosome), levels=c(1:28, "Z"))
t2$chr_num[which(is.na(t2$chr_num))] <- "Z"

## Set up chromosome type colors  
colors <- c("#AACC8E","#5FBFA2","#0091AD", "#4D4D4D")

## Plot EDF10b
t2 %>% ggplot(aes(x=chr_num, y=mean)) + 
  geom_pointrange(aes(ymin=mean-se, ymax=mean+se, color=chr_type), size=0.3) +
  theme_classic(base_size = 6) + scale_color_manual(values=colors) + 
  theme(legend.position = "none") +
  xlab("Chromosome") +
  geom_hline(yintercept = mean(t$GC_sd), linetype=2) +
  scale_y_continuous(name="% GC standard deviation", labels = percent, 
                     ## Add a bit more space at bottom for n gene trees 
                     limits=c(0.01,0.038)) +
  scale_fill_manual(values=colors)

## Save plot to PDF
ggsave("EDF10b.pdf", width=9, height=6.1, units="cm")

## Sample size
t %>% group_by(chromosome) %>% count()
t %>% group_by(chr_type) %>% count()