## ----setup, message=FALSE, warning=FALSE--------------------------------------
library(ggpubr)
library(gtools)
library(purrr)
library(scales)
library(pheatmap)
library(data.table)
library(gridExtra)
library(png)
library(knitr)
library(grid)
library(styler)
library(FactoMineR)
library(factoextra)
library(magick)
library(rlang)
library(GGally)
library(ggplotify)
library(remotes)
library(dplyr)
library(tidyr)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning = FALSE,
fig.align = 'center',
dev = "png",
tidy='styler', tidy.opts=list(strict=TRUE))
## Color Format
colFmt <- function(x,color) {
outputFormat <- knitr::opts_knit$get("rmarkdown.pandoc.to")
if(outputFormat == 'latex') {
ret <- paste("\\textcolor{",color,"}{",x,"}",sep="")
} else if(outputFormat == 'html') {
ret <- paste("<font color='",color,"'>",x,"</font>",sep="")
} else {
ret <- x
}
return(ret)
}
## ----install_package, eval=FALSE----------------------------------------------
# ## install from BioConductor
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install('protGear')
## ----call_protGear------------------------------------------------------------
## load the package
suppressWarnings(library(protGear))
## ----array_params-------------------------------------------------------------
## specify the the parameters to process the data
genepix_vars <- array_vars(channel="635" ,
chip_path = system.file("extdata/array_data/", package="protGear"),
totsamples = 21,
blockspersample = 2,
sampleID_path = system.file("extdata/array_sampleID/", package="protGear"),
mig_prefix = "_first",
machine =1,
## optional
date_process = "0520")
## ----gpr_header---------------------------------------------------------------
header_gpr <- readLines(system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"),
n=40)
header_gpr <- gsub("\"", "", header_gpr[1:32])
header_gpr[1:32]
## ----slide_struct, fig.align='left'-------------------------------------------
visualize_slide(infile=system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"),
MFI_var ='F635 Median' )
## ----slide_struct_2d, fig.align='left'----------------------------------------
visualize_slide_2d(infile =system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"),
MFI_var ='B635 Median' )
## ----array_files--------------------------------------------------------------
#### read in all the datasets
### list all the file names under data folder
filenames <- list.files(file.path(genepix_vars$paths[[1]]),
pattern="*.txt$|*.gpr$", full.names=FALSE)
#' @___________________read_in_the_files_with_the_text_data_from_the_chip_____________________________
### read all the data files and save them in a list
data_path <- paste0(genepix_vars$paths[[1]],"/")
data_files <- purrr::map(.x = filenames,
.f = read_array_files,
data_path=data_path ,
genepix_vars=genepix_vars)
data_files <- set_names(data_files, purrr::map(filenames, name_of_files))
## ----bg_data------------------------------------------------------------------
## utilising the map package we process a number of files under data_files list
dfs <- names(data_files)
allData_bg <- purrr::map(.x=dfs, .f=extract_bg,data_files=data_files,genepix_vars)
allData_bg <- set_names(allData_bg, purrr::map(filenames, name_of_files))
allData_bg <- plyr::ldply(allData_bg)
## ----bg_vs_fg-----------------------------------------------------------------
p1 <- plot_FB(allData_bg,antigen_name="antigen",
bg_MFI="BG_Median",FG_MFI="FBG_Median",log=FALSE)
p1
## ----bg_plot------------------------------------------------------------------
p2 <- plot_bg(df=allData_bg, x_axis="Block",bg_MFI="BG_Median",
log_mfi=TRUE)
p2
## ----bg_correct---------------------------------------------------------------
sample_ID_merged_dfs <- purrr::map(.x=dfs, .f=merge_sampleID ,data_files=data_files ,
genepix_vars, method="subtract_local")
sample_ID_merged_dfs <- set_names(sample_ID_merged_dfs, purrr::map(filenames, name_of_files))
## ----buffer_spots_data--------------------------------------------------------
buffer_transp <- purrr::map(.x=sample_ID_merged_dfs, .f=buffer_spots , buffer_spot="buffer")
buffer_transp <- set_names(buffer_transp, purrr::map(filenames, name_of_files))
buffers <- plyr::ldply(buffer_transp)
plot_buffer(buffers,buffer_names="antigen",buffer_mfi="FMedianBG_correct",slide_id=".id")
## ----replicates_plot----------------------------------------------------------
#' @________________________________calculated_cv_for_each_data_file_______________________________________
#' data without the selected mean for the best 2 CVs
dataCV <- purrr::map(.x=sample_ID_merged_dfs, .f=cv_estimation ,lab_replicates=3 ,
cv_cut_off=20,
sampleID_var='sampleID', antigen_var='antigen' ,replicate_var='replicate' ,
mfi_var='FMedianBG_correct')
lab_replicates=3
dataCV <- set_names(dataCV, purrr::map(filenames, name_of_files))
aa <- plyr::ldply(dataCV)
GGally::ggpairs(aa,aes(color=cvCat_all) ,
columns = paste(seq_len(lab_replicates)), title = "", axisLabels = "show") +
theme_light()
## ----cv_summary---------------------------------------------------------------
#' @________________________________summary_of_cv_for_each_sample________________________________________
#' creates summaries by cv's greater than 20 and less than 20
cv_cut_off <- 20
dataCV_sample <- purrr::map(.x=dataCV, .f=protGear::cv_by_sample_estimation , cv_variable="cvCat_all",
lab_replicates=3)
dataCV_sample <- set_names(dataCV_sample, purrr::map(filenames, name_of_files))
all_cv_sample <- plyr::ldply(dataCV_sample)
## ----cv_by_sample-------------------------------------------------------------
less_20 <- rlang::sym(paste0("CV <= ",cv_cut_off, "_perc"))
gt_20 <- rlang::sym(paste0("CV > ",cv_cut_off, "_perc"))
less_20_per <- rlang::sym(paste0("% CV <=",cv_cut_off))
gt_20_per <- rlang::sym(paste0("% CV >",cv_cut_off))
ggplot(all_cv_sample)+
geom_violin(aes(.id,`CV <= 20_perc`, color="% CV =<20")) +
geom_violin(aes(.id,`CV > 20_perc`, color="% CV >20")) +
geom_violin(aes(.id,Others_perc,color="Others")) +
ylab("% of CV") +
theme_minimal() +
ggtitle("% of CV >20 or <=20 for each slide all repeats considered")
## ----best_reps----------------------------------------------------------------
#' @________________________________data_with_selected_best_2_CV_______________________________________
#' data with the selected mean for the best 2 CVs
dataCV_best2 <- purrr::map(.x=dataCV, .f=best_CV_estimation , slide_id="iden",lab_replicates=3,
cv_cut_off=20)
## give the names to the returned list
dataCV_best2 <- set_names(dataCV_best2, purrr::map(filenames, name_of_files))
dataCV_sample_best2 <- purrr::map(.x=dataCV_best2, .f=cv_by_sample_estimation ,
cv_variable="best_CV_cat",lab_replicates=3)
dataCV_sample_best2 <- set_names(dataCV_sample_best2, purrr::map(filenames, name_of_files))
all_cv_sample_best2 <- plyr::ldply(dataCV_sample_best2)
## ----best_cv_plot-------------------------------------------------------------
## plot only the CV perccentages
ggplot(all_cv_sample_best2)+
geom_violin(aes(.id,`CV <= 20_perc`, color="% CV =<20")) +
geom_violin(aes(.id,`CV > 20_perc`, color="% CV >20")) +
geom_violin(aes(.id,Others_perc,color="Others")) +
ylab("% of CV") +
theme_minimal() +
ggtitle("% of CV >20 or <=20 for each slide")
## ----tag_file-----------------------------------------------------------------
tag_file <- read.csv(system.file("extdata/TAG_antigens.csv", package="protGear"))
tag_antigens <- c("CD4TAG" , "GST", "MBP")
batch_vars <- list(machine="m1", day="0520")
## ----tag_glimpse--------------------------------------------------------------
tb1 <- data.frame(head(tag_file, n=10))
tb1 %>%
knitr::kable()
## ----tag_subtract-------------------------------------------------------------
#' @________________________________subtract_the_tag_values_______________________________________
#'
## tag subtract
## read in the KILCHip TAG file to substract GST-1, MBP -2 and CD4TAG - 0 file
dataCV_tag <- purrr::map(.x=dataCV_best2, .f=tag_subtract ,
tag_antigens=tag_antigens, mean_best_CV_var="mean_best_CV",tag_file=tag_file,
antigen_var='antigen',
batch_vars=batch_vars)
dataCV_tag <- set_names(dataCV_tag, purrr::map(filenames, name_of_files))
dataCV_tag <- plyr::ldply(dataCV_tag)
## ----before_after_tag---------------------------------------------------------
aaa <- dataCV_tag %>%
filter(TAG_name=="GST")
aaa <- aaa %>%
dplyr::select(.id,sampleID,antigen,mean_best_CV,mean_best_CV_tag)
aaa <- aaa %>%
gather(measure,mfi,-c(.id:antigen))
ggplot(aaa,aes(as.factor(antigen),mfi,color=measure)) +
geom_boxplot(aes(fill=measure),alpha=0.5)+
theme_light() +
xlab("antigen name")+
ggtitle("Before and after TAG subtraction") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## ----normalisation------------------------------------------------------------
df_to_normalise <- dataCV_tag %>% ungroup() %>%
dplyr::select(slide=.id,sampleID,sample_array_ID,antigen,mean_best_CV) %>%
group_by(sampleID, slide)
df_to_normalise$sample_index <- group_indices(.data =df_to_normalise )
###
to_normalise <- df_to_normalise %>%
ungroup() %>% dplyr::select(-slide,-sampleID,-sample_array_ID) %>%
dplyr::select(antigen, sample_index, everything()) %>%
gather(variable, value, -(antigen:sample_index)) %>%
unite(temp, antigen ) %>% dplyr::select(-variable) %>%
spread(temp, value) %>%
as.data.frame(.)
### get the row names of the machine data
row.names(to_normalise) <- to_normalise$sample_index
#batch_all <- as.factor(paste0(to_normalise$machine,"/",to_normalise$day))
#machines <- as.factor(to_normalise$machine)
#day_batches <- as.factor(to_normalise$day)
## create the matrix to normalise
matrix_antigen <- to_normalise %>%
dplyr::select(-sample_index) %>%
as.matrix(.)
## create the matrix to hold the important parameters
## in place of AMA1 you use one of your features or antigen
array_matrix <- df_to_normalise %>%
filter(antigen=="AMA1") %>%
ungroup() %>%
dplyr::select(sample_array_ID,sample_index,slide)
control_antigens <- c("CommercialHumanIgG","CD4TAG")
## ----normalised_df------------------------------------------------------------
normlise_df <- matrix_normalise(matrix_antigen, method = "vsn", array_matrix=array_matrix,
return_plot = TRUE,control_antigens=control_antigens)
normlise_df$plot_normalisation
## ----compare_methods----------------------------------------------------------
control_antigens <- c("CommercialHumanIgG","CD4TAG")
## no normalisation
normalise_list_none <- matrix_normalise(matrix_antigen=matrix_antigen,
method = "none",
array_matrix=array_matrix,
return_plot = TRUE,
control_antigens=control_antigens)
names(normalise_list_none) <- c("matrix_antigen_none" ,"plot_none")
## log2 normalisation
normalise_list_log <- matrix_normalise(matrix_antigen=matrix_antigen,
method = "log2",
array_matrix=array_matrix,
return_plot = TRUE,
control_antigens=control_antigens)
names(normalise_list_log) <- c("matrix_antigen_log" ,"plot_log")
## vsn normalisation
normalise_list_vsn <- matrix_normalise(matrix_antigen=matrix_antigen,
method = "vsn",
array_matrix=array_matrix,
return_plot = TRUE,
control_antigens=control_antigens)
names(normalise_list_vsn) <- c("matrix_antigen_vsn" ,"plot_vsn")
## cyclic loess with log
normalise_list_cyclic_loess_log <- matrix_normalise(matrix_antigen=matrix_antigen,
method = "cyclic_loess_log",
array_matrix=array_matrix,
return_plot = TRUE,
control_antigens=control_antigens)
names(normalise_list_cyclic_loess_log) <- c("matrix_antigen_cyclic_loess_log" ,
"plot_cyclic_loess_log")
normalise_list_rlm <- matrix_normalise(matrix_antigen=matrix_antigen,
method = "rlm",
array_matrix=array_matrix,
return_plot = TRUE,
control_antigens=control_antigens)
names(normalise_list_rlm) <- c("matrix_antigen_rlm" ,"plot_rlm")
## create a list after normalisation
normalised_list <- c(normalise_list_none ,
normalise_list_log,
normalise_list_vsn,
normalise_list_cyclic_loess_log,
normalise_list_rlm)
##
normalised_list_plot <- normalised_list[grepl("plot",names(normalised_list))]
## ----plot_comparison, fig.align='center', fig.width=12, fig.height=15---------
p <- do.call("grid.arrange", c(normalised_list_plot, ncol=2))
## ----heat_norm----------------------------------------------------------------
norm_df <- normlise_df$matrix_antigen_normalised
norm_df <- norm_df %>%
dplyr::select(-control_antigens)
p3 <- pheatmap::pheatmap(norm_df ,scale = "none", cluster_rows = FALSE,
main=paste('VSN',"Normalised Data"),
silent = TRUE)
#-------
## if you want to save the file
# p3 <- ggplotify::as.ggplot(p3)
# p <- p3 + theme_void()
# ggsave(p ,
# filename ="heatmap.PNG" ,
# width = 16 , height = 12 ,
# limitsize = FALSE,
# dpi=200 )
#-------
p3
## ----pca, fig.align='center',fig.width=16,fig.height=12-----------------------
norm_df <- normlise_df$matrix_antigen_normalised
res_pca <- prcomp( norm_df, scale = TRUE)
var <- get_pca_var(res_pca)
vars_visualise=20
#Visualize the PCA
## individuals contributing to the PCA
p1 <- fviz_pca_ind(res_pca,
col.var = "contrib", # Color by contributions to the PC
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)+ theme_minimal()
# Select the top vars_visualise contributing variables
p2 <-fviz_pca_biplot(res_pca, label="var",
select.var = list(contrib = vars_visualise)) +
theme_minimal()
# Total cos2 of variables on Dim.1 and Dim.2
p3 <- fviz_cos2(res_pca, choice = "var", axes = 1:2 , top = vars_visualise)
# Color by cos2 values: quality on the factor map
p4 <- fviz_pca_var(res_pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
select.var = list(contrib = vars_visualise),
repel = TRUE # Avoid text overlapping
)
## combine the plots into one grid
## combine the plots into one grid
## combine the plots into one grid
p_pca <- gridExtra::grid.arrange(p1,p2,p3,p4, ncol=2 )
## If you want to save the file
# ggsave(p_pca ,
# filename ="p_pca.PNG" ,
# width = 16 , height = 12 ,
# units = "in",
# limitsize = FALSE,
# dpi=300)
p_pca
## ----shiny_launch, eval=FALSE-------------------------------------------------
# protGear::launch_protGear_interactive()
## -----------------------------------------------------------------------------
sessionInfo()