---
title: "Accurate gene consensus at low nanopore coverage"
author:
- name: Rocio Espada
  affiliation: ESPCI Paris
package: single
output:
  BiocStyle::html_document
abstract: |
  This file contains the full code for replicating the analysis on our manuscript.
vignette: |
  %\VignetteIndexEntry{single_fullcode}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
df_print: kable
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, eval=FALSE)
knitr::opts_chunk$set(engine.opts = list(bash = "-l"))
# knitr::opts_knit$set(root.dir ="~/SINGLe/")

```

This code reproduces the analysis made on our manuscript "Accurate gene consensus at low nanopore coverage", from the raw fast5 files returned by the Nanopore Sequencer to final figures.

Here, we applied SINGLe to the gene of KlenTaq, a truncated variant of the well-known Taq polymerase, of approximately 1.7 kb in length. We first tested it on a small set of seven known mutants containing 2 to 9 point mutations (Small Set, or KT7), and later a larger library of approximately 1200 variants (KTLib).

Make sure that in the path were you are running this code there is:

* Folder 1_ReferenceFiles that has:
    - KT7_sequences_aligned.fasta  
    - KTrefamplicon.fasta
    - KTB_digested.fasta           
    - KTrefORF_1662.fasta
* Folder 2_NanoporeRawReads that has:
    - SmallSet_fast5 (a folder. If you have the tar, un tar it)
    - LargeSet_fast5 (a folder. If you have the tar, un tar it)
* Replace the path in the previous chunk by the correct root.dir


The chunks on the general inputs section load packages, functions and variables that are needed for different chunks below. They do not use a lot of memory, so I'd recommend running them all before running the particular section you are interested in. 

Also, sections are not independent between them. The clearest example is the single_train variable that it is obtained in the Single correction for the small library and then it is used for the correction of the large library long after. 

You'll need to have installed the following software:

* minimap2 (https://github.com/lh3/minimap2)
* ont_fast5_api (https://github.com/nanoporetech/ont_fast5_api)
* nanopolish (https://github.com/jts/nanopolish)
* racon (https://github.com/isovic/racon)
* medaka (https://github.com/nanoporetech/medaka)
* samtools (http://www.htslib.org/)

# General inputs

This includes loading needed packages, defining functions and loading data that will be used in some parts of the code. It's fast and light, so just run all chunks before going on.


Packages
```{r, eval=FALSE}
require(single)
suppressPackageStartupMessages(require(plyr))
suppressPackageStartupMessages(require(dplyr))
options(dplyr.summarise.inform=F)
suppressPackageStartupMessages(require(reshape2))
suppressPackageStartupMessages(require(foreach))
suppressPackageStartupMessages(require(ggplot2))
suppressPackageStartupMessages(require(ggbreak))
suppressPackageStartupMessages(require(grid))
suppressPackageStartupMessages(require(gridExtra))
suppressPackageStartupMessages(require(stringr))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(require(cowplot))
suppressPackageStartupMessages(library(e1071))
```

Functions
```{r, eval=FALSE}
detect_homopolymer_positions <- function(sequence){
  l <- length(sequence)
  homopolymer_positions <- list()
  aux_positions <- 1
  counter <- 0
  for(i in seq_along(sequence)[-l]){
    if (sequence[i+1]==sequence[i]){
      aux_positions <- c(aux_positions,i+1)
    }else{
      counter <- counter+1
      homopolymer_positions[[counter]] <- aux_positions
      aux_positions <- i+1
    }
  }
  if(sequence[i+1]==sequence[i]){
    counter <- counter+1
    homopolymer_positions[[counter]] <- aux_positions
  }
  return(homopolymer_positions)
}
biostr_to_char <- function(x){
    y <- strsplit(as.character(x), split="")[[1]]
    return(y)
}
detect_mutations <- function(sequence,reference){
    ind <- which(sequence != reference)
    muts <- paste0(reference[ind],ind,sequence[ind])
    return(muts)
}
replace_str_in_pos   <- function(string, pos, replacement){
  if(pos>nchar(string)){stop('Replacement intended outsife string')}
  string_vec   <- strsplit(string, split="")[[1]]
  string_vec[pos] <- replacement
  string_out <- paste(string_vec, collapse = "")
  return(string_out)
}
bc_one_mutation    <- function(x){
  bases <- c("A","C","G","T")
  y <- c()
  n <- 0
  for(i in 1:nchar(x)){
    n <- n+1
    non_wt <- setdiff(bases, substr(x, i,i))
    non_wt <- paste0(non_wt, collapse = "")
    non_wt <- paste0("[",non_wt, "]", collapse = "")
    y[n] <- replace_str_in_pos(x, i, non_wt)
  }
  for(i in 2:(nchar(x)-1)){
    n <- n+1
    y[n] <- paste0(substr(x,1,(i-1)),substr(x,(i+1), nchar(x)))
  }
  y <- unique(y)
  return(y)
}
dna_reverse_complement_vector <- function(x){
  y <- rev(dna_complement_vector(x))
  return(y)
}
dna_complement <- function(x){
  if(length(x)>1){stop("dna_complement: x has length > 1")}
  if(x=="A" | x=="a"){ y <- "T"}
  if(x=="C" | x=="c"){ y <- "G"}
  if(x=="G" | x=="g"){ y <- "C"}
  if(x=="T" | x=="t"){ y <- "A"}

  return(y)
}
dna_complement_vector <- function(x){
  y <- vapply(x, dna_complement, USE.NAMES = F)
  return(y)
}
dna_complement_string <- function(x){
  x <- strsplit(x, split="")[[1]]
  y <- dna_complement_vector(x)
  y <- paste(y, collapse = "")
  return(y)
}
dna_reverse_complement_string <- function(x){
  x <- strsplit(x, split="")[[1]]
  y <- dna_reverse_complement_vector(x)
  y <- paste(y, collapse = "")
  return(y)
}
```

Paths
```{r, eval=FALSE}
path_references  <- "1_ReferenceFiles/"
path_raw_data <- "2_NanoporeRawReads/"
path_save_figures <- "6_Figures/"
path_analysis_lib7 <- "3_Analysis/KT7_1700_2100/"
path_analysis_lib <- "3_Analysis/KTLib/"

if(!dir.exists(path_save_figures)){dir.create(path_save_figures)}
if(!dir.exists(path_analysis_lib7)){dir.create(path_analysis_lib7)}
if(!dir.exists(path_analysis_lib)){dir.create(path_analysis_lib)}
```

Colors
```{r, eval=FALSE}
red      <- rgb(218,0,0,maxColorValue = 255)
green    <- rgb(0,122,0,maxColorValue = 255)
red_tr   <- rgb(230,0,0,alpha = 200, maxColorValue = 255)
blue_tr  <- rgb(0,0,218,alpha = 250, maxColorValue = 255)
green_tr <- rgb(0,122,0,alpha = 200, maxColorValue = 255)
blue     <- rgb(0,0,250, maxColorValue = 255)

single_color     <- "#2727fa"  
medaka_color     <- "#26a026"  
guppy_color      <- "#fa2929"
nanopolish_color <- "#fa8c28"  
noweights_color  <- "#8c26fa"
racon_color      <- "#dcdc26"
nextpolish_color <- grey(.6)
colors_vector <- c(single_color,medaka_color,guppy_color,nanopolish_color,noweights_color, racon_color,nextpolish_color)
names(colors_vector) <- c("single","medaka","nanopore","nanopolish","no_weights", "racon","nextpolish")

colors_vector_capital <- colors_vector
names(colors_vector_capital) <- c("SINGLe","Medaka","Guppy","Nanopolish","No weights","Racon","NextPolish")

methods_capital <- setNames(c("Nanopolish","Medaka","Racon","SINGLe","Guppy","No weights","NextPolish"),
                                              c("nanopolish","medaka","racon","single","nanopore","no_weights","nextpolish"))
```

Plotting themes
```{r, eval=FALSE}
theme_plot <- theme_bw()+
    theme(line=element_line(colour = "black", 
                            size = 0.5, linetype = "dashed",lineend = "square"),
          rect= element_rect(fill = NULL, colour = NULL, 
                             size = 1,linetype = "solid",inherit.blank = FALSE),
          text=element_text(size = 10),title = element_text(size=10),
          axis.title = element_text(size=rel(1)),
          axis.text.x=element_text(angle=0, size=rel(.8)),
          axis.text.y=element_text(angle=0, size=rel(.8)),
          plot.tag = element_text(size=rel(1), face = "bold", vjust=-1),
          panel.grid = element_blank(), panel.border = element_rect(size=1),
          legend.background = element_blank(), 
          plot.margin=margin(t=0,r=3,l=0,b=0, unit="pt"),
          strip.background = element_rect(fill="white"),
          legend.text = element_text(size=rel(0.8))
    )

theme_set(theme_plot)

```

Reference sequence
```{r, eval=FALSE}
reference_file <-  "1_ReferenceFiles/KTrefORF_1662.fasta"
wildtypye_bstr <- readDNAStringSet(reference_file)
wildtype <- toupper(biostr_to_char(wildtypye_bstr))

wt_homopolymers <- detect_homopolymer_positions(wildtype)
wt_homopolymers <- wt_homopolymers[vapply(wt_homopolymers,length)>1]
pos_wt_homopolymers <- unlist(wt_homopolymers)


```

Experimental data for set of KlenTaq's 7 variants
```{r, eval=FALSE}
kt7_barcodes <- paste0("barcode",c("05","06","07","08","09","10","11"))
kt7_true_mutants <- data.frame(
                        Barcode = kt7_barcodes,
                        true_consensus = c("C867G G1120A C1214T G1316A",
                            "G23A C245T", 
                            "G94C A378G A905G G1023T A1381T",
                            "T425A G846A C851T G1403A", 
                            "G307A G490- T659A T675A G993A A1554G",
                            "A41T G331T G349A C772T C879T C1474A C1475T G1530-",
                            "G303A G376A C517G T857A T1056A G1550A"))

kt7_bc2mut <- c("#1 - 2sub","#2 - 4sub","#3 - 4sub","#4 - 5sub","#5 - 6sub","#6 - 7sub 1del","#7 - 5sub 1del")
names(kt7_bc2mut) <- paste0("barcode",c("06","05","08","07","11","10","09"))


mutant_sequences   <- readDNAStringSet("1_ReferenceFiles/KT7_sequences_aligned.fasta") #known sequences of mutants (different from reference_sequence)
kt_true_mutations <- list()
for(i in 1:length(mutant_sequences)){
    mutant_seq <- toupper(stringr::str_split(as.character(mutant_sequences[[i]]), pattern = "")[[1]])
    index <- which (mutant_seq!= wildtype)
    kt_true_mutations[[i]] <- data.frame(sequence_id=rep(names(mutant_sequences)[i], length(index)),
                                      position = index,  nucleotide =mutant_seq[index])
}
kt_true_mutations <- do.call(rbind,kt_true_mutations)

```


Others
```{r, eval=FALSE}
subsets       <- c(seq(3,9,by=1),seq(10,50,by=5))
n_repetitions <- 50
```


# Wild type and small set of 7 variants of KlenTaq

## Experiment

In a unique sequencing run we used a barcoding kit to measure several samples at the same time. The table below indicates the number of the barcode (that will be returned by guppy_barcoder) and it's content

| BarcodeID | VariantID | Mutations                                              |
|-----------|-----------|--------------------------------------------------------|
| BC01      |           | wildtype                                               |
| BC05      | Variant 2 | C867G G1120A C1214T C1316A                             |
| BC06      | Variant 1 | G23A C245T                                             |
| BC07      | Variant 4 | G94C A378G A905G G1023T A1381T                         |
| BC08      | Variant 3 | T425A G846A C851T G1403A                               |
| BC09      | Variant 7 | G307A T659A T675A G993A A1554G G490                    |
| BC10      | Variant 6 | A41T G331T G349A C772T A785G C879T C1474A C1475T G1530 |
| BC11      | Variant 5 | G303A G376A C517G T857A T1056A G1550A                  |



## Basecalling, demultiplexing and aligning to reference

Basecalling was done with the following command lines in bash (replace paths if needed)

```{bash, basecalling, eval= F}
## BASECALL USING GUPPY
guppy_basecaller -i 2_NanoporeRawReads/SmallSet_fast5/ -s 3_Analysis/SmallSet_SUP/ -c dna_r9.4.1_450bps_sup.cfg -x 'cuda:0' -r 

#Merge all files in a unique file
cat 3_Analysis/SmallSet_SUP/pass/*fastq > 3_Analysis/KT7.fastq
# for example: cat SmallSet_SUP/pass/*fastq > SmallSet.fastq

#Filter sequences by length (1700 to 2100 bp)
mkdir 3_Analysis/KT7_1700_2100
awk 'NR % 4 ==2 && length($0) > 1700 && length($0) < 2100 {print f  "\n" $0;getline;print;getline;print}  {f=$0}' 3_Analysis/KT7.fastq > 3_Analysis/KT7_1700_2100/KT7_1700_2100.fastq

#Demultiplexing by Guppy
guppy_barcoder -i 3_Analysis/KT7_1700_2100/ -s 3_Analysis/KT7_1700_2100

#Merge files to one file per barcode
cat 3_Analysis/KT7_1700_2100/barcode01/*fastq >> 3_Analysis/KT7_1700_2100/barcode01.fastq
cat 3_Analysis/KT7_1700_2100/barcode05/*fastq >> 3_Analysis/KT7_1700_2100/barcode05.fastq
cat 3_Analysis/KT7_1700_2100/barcode06/*fastq >> 3_Analysis/KT7_1700_2100/barcode06.fastq
cat 3_Analysis/KT7_1700_2100/barcode07/*fastq >> 3_Analysis/KT7_1700_2100/barcode07.fastq
cat 3_Analysis/KT7_1700_2100/barcode08/*fastq >> 3_Analysis/KT7_1700_2100/barcode08.fastq
cat 3_Analysis/KT7_1700_2100/barcode09/*fastq >> 3_Analysis/KT7_1700_2100/barcode09.fastq
cat 3_Analysis/KT7_1700_2100/barcode10/*fastq >> 3_Analysis/KT7_1700_2100/barcode10.fastq
cat 3_Analysis/KT7_1700_2100/barcode11/*fastq >> 3_Analysis/KT7_1700_2100/barcode11.fastq

```

Align using minimap2 and create count files.

```{bash, minimap2, eval= F}
#!/bin/bash
minimap2 -ax map-ont --sam-hit-only  1_ReferenceFiles/KTrefamplicon.fasta 3_Analysis/KT7_1700_2100/barcode01.fastq > 3_Analysis/KT7_1700_2100/barcode01.sam
samtools view -S -b 3_Analysis/KT7_1700_2100/barcode01.sam > 3_Analysis/KT7_1700_2100/barcode01.bam #TRANSFORM TO BAM
samtools sort 3_Analysis/KT7_1700_2100/barcode01.bam -o 3_Analysis/KT7_1700_2100/barcode01.sorted.bam #SORT BAM FILE

for (( counter=5; counter<12; counter++ ))
    do
if [ $counter -lt 10 ];
then
bc=0$counter
else
    bc=$counter
fi
name=barcode$bc
minimap2 -ax map-ont --sam-hit-only  1_ReferenceFiles/KTrefamplicon.fasta 3_Analysis/KT7_1700_2100/$name.fastq >3_Analysis/KT7_1700_2100/$name.sam
samtools view -S -b 3_Analysis/KT7_1700_2100/$name.sam > 3_Analysis/KT7_1700_2100/$name.bam # TRANSFORM TO BAM
samtools sort 3_Analysis/KT7_1700_2100/$name.bam -o 3_Analysis/KT7_1700_2100/$name.sorted.bam # SORT BAM FILE

echo -n "$bc "
done
printf "\n"
```

## SINGLe correction

```{r, eval=FALSE}
pos_start <- 60
pos_end <- 1721
```

```{r, warning=F, single, eval= F}
tic <- proc.time()

## TRAIN
#Forward
single_train <- single_train(bamfile= "3_Analysis/KT7_1700_2100/barcode01.sorted.bam",
               output = "3_Analysis/KT7_1700_2100/barcode01_5d4" ,
               mean.n.mutations = 5.4,verbose = T,
               rates.matrix = mutation_rate,
               refseq_fasta = reference_file,
               pos_start = pos_start, pos_end = pos_end, 
               save_partial=T, save_final=T)

## EVALUATE
cl <- parallel::makeForkCluster(7)
doParallel::registerDoParallel(cl)
foreach(n = 5:11)%dopar%{
    if(n<10){x=paste0("0",n)}else{x=n}
    single_evaluate(bamfile = paste0("3_Analysis/KT7_1700_2100/barcode",x,".sorted.bam"),
                     single_fits   = single_train,
                     output_file = paste0("3_Analysis/KT7_1700_2100/barcode",x,"_SINGLe.fastq"),
                     refseq_fasta= reference_file,
                     pos_start=pos_start,pos_end=pos_end,
                     verbose=T,gaps_weights="minimum",
                     save_original_scores = T, 
                     save=T)
}

toc <- proc.time()

cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
```

------------------------------------------------------------------------

## Analysis of error rate
Compute error rate on wild type reads
```{r, eval= F}
tic <- proc.time()
reads_wt <- read.table(paste0(path_analysis_lib7,"barcode01_5d4_SINGLe_countsPNQ.txt"), header=T)
reads_wt <- reads_wt %>%
    mutate(isWT= nucleotide==wildtype[pos]) %>%
    dplyr::rename(Position=pos, Nucleotide=nucleotide, counts=count) 

total_error_rate <- reads_wt %>% group_by(isWT) %>% dplyr::summarise(counts=sum(counts))
total_error_perc <- total_error_rate[1,2]/sum(total_error_rate$counts)*100
cat("Error rate:", as.numeric(round(total_error_perc,2)),"%")

## Qscore distribution
reads_wt_qscore_per_position <- reads_wt %>%
    group_by(QUAL, isWT) %>%
    dplyr::summarise(Counts=sum(counts))

## Errors per position
reads_wt_errors_by_position <- reads_wt %>%
    group_by(Position, Nucleotide,isWT) %>%
    dplyr::summarise(Counts=sum(counts)) %>%
    mutate(n_plot =  floor(Position/300))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
```

### Figure 1A
```{r, fig 1A, fig.width=3, fig.height=3, eval= F}
Figure_1A <- ggplot(reads_wt_qscore_per_position,aes(x=QUAL,y=Counts/(10^5),col=isWT))+
    geom_line(lwd=0.5)+
    xlab(expression(Q[score]))+ylab(bquote('Counts'~(x10^5)))+
    scale_color_manual(breaks=c(TRUE,FALSE), labels=c("Correct","Error"),values=c(green_tr,red_tr))+
    theme_plot+
    theme(legend.direction="vertical", 
          legend.position = c(.7,.8),
          legend.spacing.y = unit(0.05,"line"),
          legend.key.height = unit(0.5,"line"),
          legend.key.width = unit(0.6,"line"),
          legend.text=element_text(size=rel(.8)), 
          legend.title=element_blank(), 
          legend.background = element_blank(), 
          plot.margin = margin(t=0,r = 0,b = 0,l=0))+
    labs(tag = "A")

Figure_1A 
```

### Figure S1 (equivalent to Fig1A normalized)
```{r, fig S1, fig.width=3, fig.height=3 , eval= F}
Figure_1A_norm <- ggplot(reads_wt_qscore_per_position %>% dplyr::group_by(isWT) %>% dplyr::mutate(NormCounts=Counts/sum(Counts)),aes(x=QUAL,y=NormCounts,col=isWT))+
    geom_line(lwd=0.5)+
    xlab(expression(Q[score]))+ylab('Normalized counts')+
    scale_color_manual(breaks=c(TRUE,FALSE), labels=c("Correct","Error"),values=c(green_tr,red_tr))+
    theme_plot+
    theme(legend.direction="vertical", 
          legend.position = c(.7,.7),
          legend.spacing.y = unit(0.05,"line"),
          legend.key.height = unit(1,"line"),
          legend.text=element_text(size=rel(1)), 
          legend.title=element_blank(), 
          legend.background = element_blank())

Figure_1A_norm
ggsave(Figure_1A_norm, filename = paste0(path_save_figures,"Figure1Asuppl.png"),width =8 ,height = 8,dpi='print',units = "cm")
```

### Figure 1B
```{r, fig 1B, fig.width=3, fig.height=3, eval= F}
Figure_1B <- ggplot(reads_wt_errors_by_position %>% 
                        filter(!isWT & Nucleotide !="-" & Position >100 & Position < 150),
                    aes(x=Position,y=Counts,fill=Nucleotide))+
    guides(fill=guide_legend(ncol=2))+
    geom_col()+
    scale_x_continuous(breaks=seq(100,150,by=25))+
    theme_plot+
    theme(legend.direction="horizontal",
          legend.position = c(.3,.85),
          legend.key.size =  unit(.5,units = "line"),
          legend.box = "vertical" ,
          legend.background = element_blank(),
          legend.text=element_text(size=rel(.8)),
          legend.title=element_blank(),
          plot.margin = margin(t=0,r = 0,b = 0,l=0)
    )+
    labs(tag = "B")
Figure_1B
```

### Figure S2 (extended Fig 1B)
```{r, figS2, fig.width=7, fig.height=10, eval= F}
Figure_S2 <- ggplot(reads_wt_errors_by_position %>% filter(!isWT & Nucleotide !="-"),
                                     aes(x=Position, y=Counts,fill=Nucleotide))+
    geom_col()+facet_wrap(~n_plot, ncol=1, scales="free_x")+
    theme_plot+
    theme(strip.background = element_blank(), strip.text.x = element_blank(),
          legend.direction="horizontal", legend.position = "top",
          rect= element_rect(fill = NULL, colour = NULL, size = 1,linetype = "solid",inherit.blank = FALSE)
    )

Figure_S2
ggsave(Figure_S2, file=paste0(path_save_figures,"FigureS2.png"), dpi = 'print', width = 16, height=22,units="cm")
```

Choose example of fits  
```{r, fits examples, eval= F}
tic <- proc.time()
## Example of fit
data_fits <- read.table("3_Analysis/KT7_1700_2100/barcode01_5d4_SINGLe_fits.txt", header = T)  
data_mut <- read.table("3_Analysis/KT7_1700_2100/barcode01_5d4_SINGLe_data.txt", header = T)   

position <-  547             #   Choose position to plot, 547 in manuscript
mut.base <- "A"         #   Choose nucleotide to plot, A in manuscript
str <- "+"

#Filter all data by position and base
data_chosen_pos_base <- data_mut %>%
    filter(pos==position & nucleotide==mut.base & strand==str) %>%
    filter(count>0 | counts.wt>0) %>%
    mutate(ratio=counts.wt/(count+counts.wt),
           ratio_scaled =counts.wt.scaled/(counts.wt.scaled+counts.scaled))
wt  <- as.character(data_chosen_pos_base$wt.base[1])

#Filter all fits' data by position and base
data_fits_chosen_pos_base <- data_fits %>% 
    filter(pos==position & (nucleotide==mut.base | nucleotide==wt) & strand==str)
class_fit_prior <- vapply(data_fits_chosen_pos_base$priors, class)
if(any(class_fit_prior=="numeric")){
    ind_out <- which(class_fit_prior=="numeric");
    data_fits_chosen_pos_base <- data_fits_chosen_pos_base[-ind_out,]
}

#Construct fitting curves
quality_range <- 0:35   #* You'll draw the fitted curves for this Qscores (x-values)

curve_fitted_corrected <- data.frame(quality=quality_range,
                                     value=glm.predict.(quality_range, slope = data_fits_chosen_pos_base$prior_slope,
                                                        intercept =data_fits_chosen_pos_base$prior_intercept))

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

### Figure 1D
```{r,Fig1D, fig.width=3, fig.height=3, eval= F}
Figure_1D <- ggplot(data_chosen_pos_base)+
    geom_point(aes(x=QUAL, y=ratio_scaled), size=.5)+
    geom_line(data=curve_fitted_corrected, aes(x=quality, y=value), linetype="dashed", color=single_color)+
    scale_y_continuous(breaks=c(0,0.5,1), limits = c(0,1))+scale_x_continuous(limits=c(0,35))+
    xlab(expression(Q[score]))+ylab(expression(N[correct]))+ggtitle("SINGLe")+
    theme_plot+
    theme(plot.title = element_text(size=8))+
    labs(tag = "D")


Figure_1D
```

### Figure 1C 
```{r, Fig1C, fig.width=3, fig.height=3, eval= F}
fit_nopriors <- stats::glm(data_chosen_pos_base$ratio~ data_chosen_pos_base$QUAL,family = "quasibinomial")
fit_nopriors_coeff       <- stats::coefficients(fit_nopriors)
curve_fitted_corrected <- data.frame(quality=quality_range,
                                     value=glm.predict.(quality_range, slope = fit_nopriors_coeff[2],
                                                        intercept =fit_nopriors_coeff[1]))

Figure_1C <-  ggplot(data_chosen_pos_base, aes(x=QUAL, y=ratio))+geom_point()+
    geom_line(data=curve_fitted_corrected, aes(x=quality, y=value), linetype="dashed", color=guppy_color)+
    scale_y_continuous(breaks=c(0,0.5,1),limits = c(0,1))+scale_x_continuous(limits=c(0,35))+
    xlab(expression(Q[score]))+ylab(expression(N[correct]))+ggtitle("Guppy")+
    theme_plot+
    theme(plot.title = element_text(size=8), plot.margin = margin(t=0.5, l=0.5))+
    labs(tag = "C")
Figure_1C
```
    

### Figure 1 composition
```{r, Figure 1, eval= F}
Figure1 <- ( Figure_1A | Figure_1B ) / (Figure_1C | Figure_1D) 
Figure1
ggsave(Figure1, file=paste0(path_save_figures,"Figure1.png"),
       dpi = 'print', width = 8.5, height=8.5,units="cm")

```



## Strand bias analysis
```{r, strand bias, eval= F}
tic <- proc.time()
reads_wt <- read.table(paste0(path_analysis_lib7,"barcode01_5d4_SINGLe_countsPNQ.txt"), header=T) %>%
    dplyr::rename(Strand=strand) %>%
    mutate(isWT = nucleotide==wildtype[pos])

# mean Qscore is similar
tot_counts_for <- sum(reads_wt %>% filter(Strand=="+") %>% select(count))
mean_qscore_forward <- reads_wt %>% filter(Strand=="+") %>% summarise( sum(count*QUAL)/tot_counts_for)

tot_counts_rev <-sum(reads_wt %>% filter(Strand=="-") %>% select(count))
mean_qscore_reverse <- reads_wt %>% filter(Strand=="-") %>% summarise( sum(count*QUAL)/tot_counts_rev)

# Qscore distributions are similar:
counts_hist <- reads_wt %>%
    dplyr::group_by(QUAL,Strand) %>%
    dplyr::summarise(counts=sum(count)) %>%
    ungroup()%>%
    mutate(Proportion = counts/sum(counts)) %>%
    dplyr::rename(Qscore=QUAL) %>%
    mutate(Strand =ifelse(Strand=="+", "Forward","Reverse"))

# Percentage of errors are similar
perc_errors <- reads_wt %>%
    group_by(Strand, isWT) %>%
    summarise(counts = sum(count)) %>%
    mutate(isWT = c("yes","no")[2-as.numeric(isWT)])%>%
    reshape2::dcast(formula = Strand ~  isWT) %>%
    mutate(perc_error = no/(yes+no)*100)

# Percentages of errors per position are different
perc_errors_perpos <- reads_wt %>%
    group_by(Strand, isWT,pos) %>%
    summarise(counts = sum(count)) %>%
    mutate(isWT = c("yes","no")[2-as.numeric(isWT)])%>%
    reshape2::dcast(formula = Strand +pos ~  isWT) %>%
    mutate(perc_error = no/(yes+no)*100)

perc_errors_perpos_cast <- reshape2::dcast(perc_errors_perpos,formula=pos~Strand)%>%
    dplyr::rename(Forward="+", Reverse="-")

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
```

### Figure S11
```{r, figS11, eval= F}
FigS11A <- ggplot(counts_hist, aes(x=Qscore, y=Proportion,linetype=Strand))+
    geom_line()+theme(legend.position = c(.7,.7))
FigS11B <- ggplot(perc_errors_perpos_cast, aes(x=Forward,y=Reverse))+
    geom_point(col=rgb(0,0,0,0.3))+
    scale_x_log10(limits=c(00.1,100))+
    scale_y_log10(limits=c(00.1,100))+
    xlab("Error (%) - forward strand")+
    ylab("Error (%) - reverse strand")


# Thus fits are different
all_fits <- read.table(paste0(path_analysis_lib7,"/barcode01_5d4_SINGLe_fits.txt"), header = T)
forward_fits <- all_fits %>% filter(strand=="+")
reverse_fits <- all_fits %>% filter(strand=="-")
qval <- seq(0,40,by=0.1)
par(mfrow=c(1,4), mar=c(4,4,1,1),mgp=c(2,.7,0), las=1)
first <- T
for(n in c(1278:1281)+4*7-1){
  y_for <- glm.predict.(x=qval, slope = forward_fits$prior_slope[n], intercept = forward_fits$prior_intercept[n])
  y_rev <- glm.predict.(x=qval, slope = reverse_fits$prior_slope[n], intercept = reverse_fits$prior_intercept[n])
  name <- paste(forward_fits$pos[n], forward_fits$nucleotide[n])
  if(first){
    df <- rbind(data.frame(Qscore=qval,Sense="Forward",Fit=y_for,n=name),
                data.frame(Qscore=qval,Sense="Reverse",Fit=y_rev,n=name))
    first <- F
  }else{
      df <- rbind(df,
                  rbind(data.frame(Qscore=qval,Sense="Forward",Fit=y_for,n=name),
                        data.frame(Qscore=qval,Sense="Reverse",Fit=y_rev,n=name))
                  )}
}
plot_fits_examples <- ggplot(df, aes(x=Qscore,y=Fit,lty=Sense))+geom_line()+facet_wrap(~n,nrow=1)

FigS11up <- plot_grid(FigS11A,FigS11B+theme(plot.margin=margin(l=10,r=5.5,t=7,b=5.5)),labels=c("A","B"),hjust = 0.1,vjust=1)
FigS11C <- plot_grid(plot_fits_examples,labels="C",hjust = 0.1)
FigS11 <-
    plot_grid(FigS11up, FigS11C, nrow=2,hjust = -1)
ggsave(paste0(path_save_figures,"FigS11_strandsense.png"),
       width = 7.62,height = 5.68,units = "in",dpi = 300)
FigS11
```

------------------------------------------------------------------------

## Compare signal to noise with p_right cut off

For Guppy and SINGLe, compare how does the signal and noise change if we accept as truth only the nucleotides with a p_right higher than a cut off.


Computation (takes \~ 1min). 
```{r inputs I, eval= F}
tic <- proc.time()
# Identify errors and correct reads (signal and noise)
n <- 0
for(bc_i in kt7_barcodes){
    message('Proccessing ', bc_i, '\n')
    n<- n+1
    #Barcode info
    actual_mutations <- kt_true_mutations%>%                   #mutations present in this barcode
        filter(sequence_id==bc_i)%>% select(nucleotide, position)
    actual_mutations <- actual_mutations[which(actual_mutations$nucleotide!="-"),]

    #Load corrected data
    seqs_single <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, bc_i,"_SINGLe.fastq"))
    aux_seqs <- vapply(as.character(seqs_single),strsplit, split="")
    aux_quals <- 1-as(quality(seqs_single),"NumericList")
    aux_pos <- lapply(aux_seqs, seq_along)
    aux_names <- c(vapply(names(seqs_single),rep, each=1662))
    data_single <- data.frame(nucleotide = unlist(aux_seqs),
                                p_SINGLe=unlist(aux_quals),
                                position=unlist(aux_pos) , 
                            seq_id=aux_names)
    rownames(data_single)<-NULL

    #Load original data
    bf <- Rsamtools::BamFile(paste0(path_analysis_lib7, bc_i,".bam"))
    reads <- Rsamtools::scanBam(bf)
    #keep sequences that start at least at pos_start
    index <- which(reads[[1]]$pos<=pos_start)
    reads[[1]] <- vapply(reads[[1]], function(x){x[index]})

    seqs_guppy <- GenomicAlignments::sequenceLayer(reads[[1]]$seq,
                                reads[[1]]$cigar,to = "reference")
    names(seqs_guppy) <- reads[[1]]$qname
    scores_guppy <- GenomicAlignments::sequenceLayer(reads[[1]]$qual,
                                reads[[1]]$cigar,to = "reference")
    names(scores_guppy) <- reads[[1]]$qname

    #Fill with gaps at the end of sequences shorter than pos_end
    index_short_sequences <- which(width(seqs_guppy)<pos_end)
    for(i in index_short_sequences){
        seqs_guppy[i] <- paste0(as.character(seqs_guppy[i]), 
                                   paste0(rep("-", pos_end-width(seqs_guppy[i])),
                                          collapse = ""),
                                   collapse = "")
        scores_guppy[i] <- paste0(as.character(scores_guppy[i]),
                                    paste0(rep("-", pos_end-width(scores_guppy[i])),
                                           collapse = ""),
                                    collapse = "")
    }
    seqs_guppy <- subseq(seqs_guppy, start=pos_start+1-reads[[1]]$pos,end=pos_end+1-reads[[1]]$pos)
    scores_guppy <- subseq(scores_guppy, start=pos_start+1-reads[[1]]$pos,end=pos_end+1-reads[[1]]$pos)
   
    aux_seqs <- vapply(as.character(seqs_guppy),strsplit, split="")
    aux_quals <- 1-as(scores_guppy,"NumericList")
    aux_pos <- lapply(aux_seqs, seq_along)
    aux_names <- c(vapply(names(seqs_guppy),rep, each=1662))
    data_guppy <- data.frame(seq_id=aux_names,
                            position=unlist(aux_pos), 
                            nucleotide = unlist(aux_seqs),
                            p_Guppy=unlist(aux_quals)
                            )
    rownames(data_guppy)<-NULL

    data <- full_join(data_single,data_guppy, by=c("seq_id","nucleotide","position")) %>%
        mutate(isWT = nucleotide==wildtype[position]) %>%
        filter(isWT==0 & nucleotide!="-") %>%
        group_by(position, nucleotide, p_Guppy,p_SINGLe) %>%
        dplyr::summarise(counts=n())
    
    signal_data_aux <- left_join(actual_mutations, data,by = c("nucleotide", "position"))
    noise_data_aux  <- anti_join(data, actual_mutations,by = c("nucleotide", "position"))

    if(nrow(signal_data_aux)+nrow(noise_data_aux) != nrow(data)){stop('Check joins of data')}

    #Store data
    if(n==1){
        signal_data <- signal_data_aux
        noise_data  <- noise_data_aux
    }else{
        signal_data <- rbind(signal_data,signal_data_aux)
        noise_data  <- rbind(noise_data, noise_data_aux)
    }
    rm(signal_data_aux, noise_data_aux, seqs_single, seqs_guppy, data, aux_seqs, aux_quals, aux_pos)
}


#Compute signal and noise for different cut-offs
p_cutoff_vec <- sort(c(0,10^seq(-10,-1, by=.5),seq(0.1,.99, by=0.01)))

signal_tp <- data.frame(matrix(NA, ncol=5, nrow=length(p_cutoff_vec)))
colnames(signal_tp) <- c("pcutoff", "Guppy", "SINGLe", "Guppy - weighted", "SINGLe - weighted")
signal_fn <- noise_tn <- noise_fp <- signal_tp

noise_data <- noise_data%>% ungroup()
signal_data <- signal_data %>% ungroup()
for(i in seq_along(p_cutoff_vec)){
    p_cutoff <- p_cutoff_vec[i]

    signal_tp_aux <-  signal_data %>%
        dplyr::summarise(counts_guppy = sum(counts[p_Guppy       >= p_cutoff]),
                         counts_single = sum(counts[p_SINGLe >= p_cutoff]),
                         counts_guppy_w = sum( (counts*p_Guppy)       [p_Guppy       >= p_cutoff]),
                         counts_single_w = sum( (counts*p_SINGLe) [p_SINGLe >= p_cutoff])
        )

    signal_fn_aux <-  signal_data %>%
        dplyr::summarise(counts_guppy = sum(counts[p_Guppy       < p_cutoff]),
                         counts_single = sum(counts[p_SINGLe < p_cutoff]),
                         counts_guppy_w = sum( (counts*p_Guppy)       [p_Guppy       < p_cutoff]),
                         counts_single_w = sum( (counts*p_SINGLe) [p_SINGLe < p_cutoff])
        )

    noise_fp_aux <-  noise_data %>%
        dplyr::summarise(counts_guppy = sum(counts[p_Guppy       >= p_cutoff]),
                         counts_single = sum(counts[p_SINGLe >= p_cutoff]),
                         counts_guppy_w = sum( (counts*p_Guppy)       [p_Guppy       >= p_cutoff]),
                         counts_single_w = sum( (counts*p_SINGLe) [p_SINGLe >= p_cutoff])
        )

    noise_tn_aux <-  noise_data %>%
        dplyr::summarise(counts_guppy = sum(counts[p_Guppy       < p_cutoff]),
                         counts_single = sum(counts[p_SINGLe < p_cutoff]),
                         counts_gupy_w = sum( (counts*p_Guppy)       [p_Guppy       < p_cutoff]),
                         counts_single_w = sum( (counts*p_SINGLe) [p_SINGLe < p_cutoff])
        )

    signal_tp[i,] <- c(p_cutoff, unlist(signal_tp_aux))
    noise_fp [i,] <- c(p_cutoff, unlist(noise_fp_aux))
    signal_fn[i,] <- c(p_cutoff, unlist(signal_fn_aux))
    noise_tn [i,] <- c(p_cutoff, unlist(noise_tn_aux))
}


aux_tp <- melt(signal_tp, id.vars = "pcutoff", value.name = "signal_tp")
aux_fp <- melt(noise_fp, id.vars = "pcutoff", value.name = "noise_fp")
aux_fn <- melt(signal_fn, id.vars = "pcutoff", value.name = "signal_fn")
aux_tn <- melt(noise_tn, id.vars = "pcutoff", value.name = "noise_tn")
rates <- aux_tp %>% full_join(aux_fp,by = c("pcutoff", "variable"))%>% full_join(aux_fn,by = c("pcutoff", "variable"))%>%full_join(aux_tn,by = c("pcutoff", "variable"))
rates <- rates %>%
    mutate(ratio = signal_tp / noise_fp)%>%
    mutate(tpr   = signal_tp / (signal_tp + signal_fn))%>%
    mutate(fpr   = noise_fp /  (noise_fp + noise_tn))

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```


### Figure 2A and 2B

```{r Fig2A 2B, fig.width=3, fig.height=3, eval= F}
Figure_2A <- ggplot(rates %>% filter(variable=="Guppy" | variable=="SINGLe"), aes(y=tpr,x=fpr, col=variable))+
    geom_line()+
    scale_color_manual(values=c(guppy_color,single_color), breaks = c("Guppy","SINGLe"))+
    scale_x_continuous(breaks=c(0,0.5,1), limits = c(0,1))+xlab("FPR")+
    scale_y_continuous(breaks=c(0,0.5,1), limits = c(0,1))+ylab("TPR")+
    theme_plot+
    theme(legend.title = element_blank(), legend.position = c(0.6,0.2),legend.spacing.y = unit(0,"cm"),
          legend.key.size = unit(0.5,"lines"), plot.margin = unit(c(0,5.5,5.5,5.5),"points"))+
    labs(tag = "A")

Figure_2B <-  ggplot(rates %>% filter(variable=="Guppy - weighted" | variable=="SINGLe - weighted") %>% mutate(variable = ifelse(variable == "Guppy - weighted", "Guppy", "SINGLe")), aes(pcutoff, y=ratio, col=variable))+
    geom_line()+
    scale_y_continuous(breaks=seq(1,4))+ylab("Signal / Noise")+
    scale_x_continuous(breaks=c(0,0.5,1), limits = c(0,1))+xlab(expression("Cut off on p"["right"]))+
    scale_color_manual(values=c(guppy_color,single_color), breaks=c("Guppy", "SINGLe"), name="")+
    theme_plot+theme(legend.position = c(0.3,0.85),legend.spacing.y = unit(0,"cm"),
                                legend.key.size = unit(0.5,"lines"), plot.margin = unit(c(0,5.5,5.5,5.5),"points"))+
    labs(tag = "B")

Figure_2A
Figure_2B
```

### Figure S5 (similar to Fig 2B)

```{r, FigS5, fig.width=3, fig.height=3, eval= F}
Figure_Sup3 <- ggplot(rates %>% filter(variable=="Guppy" | variable=="SINGLe"), aes(pcutoff, y=ratio, col=variable))+
    geom_line()+
    scale_y_continuous(breaks=seq(1,4))+ylab("Signal / Noise")+
    scale_x_continuous(breaks=c(0,0.5,1), limits = c(0,1))+xlab(expression("Cut off on p"["right"]))+
    scale_color_manual(values=c(guppy_color,single_color), breaks=c("Guppy", "SINGLe"), name="")+
    theme_plot+theme(legend.position = c(0.3,0.85),legend.spacing.y = unit(0,"cm"),
                                legend.key.size = unit(0.5,"lines"), plot.margin = unit(c(0,5.5,5.5,5.5),"points"))
Figure_Sup3
```

------------------------------------------------------------------------

## Convergence of consensus
Compute convergence of consensus. It takes long. 

General inputs
```{r, eval= F}
wildtype_homopolymers <-
    c(F, wildtype[2:length(wildtype)] == wildtype[1:(length(wildtype)-1)]) |
    c(wildtype[1:(length(wildtype)-1)] == wildtype[2:length(wildtype)] ,F)
pos_wt_homopolymers <- which(wildtype_homopolymers)
```



* Consensus by SINGLe, Guppy and No Weights, not modifying homopolymers
```{r, cons subsets single, eval= F}
tic <- proc.time()
for(i in 1:7){

   #Load corrected data
    seqs_single <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, kt7_barcodes[i],"_SINGLe.fastq"))
    aux_seqs <- vapply(as.character(seqs_single),strsplit, split="")
    aux_quals <- 1-as(quality(seqs_single),"NumericList")
    aux_pos <- lapply(aux_seqs, seq_along)
    aux_names <- c(vapply(names(seqs_single),rep, each=1662))
    data_single <- data.frame(nucleotide = unlist(aux_seqs),
                                p_SINGLe=unlist(aux_quals),
                                position=unlist(aux_pos) , 
                            SeqID=aux_names)
    rownames(data_single)<-NULL
    
    seqs_original <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, kt7_barcodes[i],"_SINGLe_original.fastq"))
    aux_seqs <- vapply(as.character(seqs_original),strsplit, split="")
    aux_quals <- 1-as(quality(seqs_original),"NumericList")
    aux_pos <- lapply(aux_seqs, seq_along)
    aux_names <- c(vapply(names(seqs_original),rep, each=1662))
    data_guppy <- data.frame(SeqID=aux_names,
                            position=unlist(aux_pos), 
                            nucleotide = unlist(aux_seqs),
                            p_Guppy=unlist(aux_quals)
                            )
    rownames(data_guppy) <- NULL

    data <- full_join(data_single,data_guppy, by=c("SeqID","nucleotide","position")) %>%
        mutate(isWT = nucleotide==wildtype[position]) %>%
        mutate(p_SINGLe=ifelse(is.na(p_SINGLe), 1, p_SINGLe))%>%
        mutate(p_noweights=1)

    #Compute consensus for each barcode using all sequences available and using the three available methods
    file_out_consensus<- paste0(path_analysis_lib7,kt7_barcodes[i],"_ConsensusBySINGLE.txt")
    if(file.exists(file_out_consensus)){file.remove(file_out_consensus)}
    cat("Barcode\tnseqs\trepetition\tmutation\n", file=file_out_consensus,append=F)
    pb <- txtProgressBar(0,length(subsets)*n_repetitions,style=3)
    counter <- 0
    bc_seqsID <- unique(data$SeqID)
    for( s in subsets){
        for (r in 1:n_repetitions) {
            subset_bc_reads <- data %>% 
                filter(SeqID %in% sample(bc_seqsID, s))

            cons_single <- weighted_consensus(df = subset_bc_reads %>%
                                                  select(nucleotide,p_SINGLe,position), 
                                           cutoff_prob = 0)
            cons_guppy  <- weighted_consensus(df = subset_bc_reads %>% 
                                                  select(nucleotide,p_Guppy,position),
                                           cutoff_prob = 0)
            cons_noweight <- weighted_consensus(df = subset_bc_reads %>%
                                                    select(nucleotide,p_noweights,position),
                                            cutoff_prob = 0)
            muts_single   <- detect_mutations(biostr_to_char(cons_single), wildtype)
            muts_guppy    <- detect_mutations(biostr_to_char(cons_guppy), wildtype)
            muts_noweight <- detect_mutations(biostr_to_char(cons_noweight), wildtype)

            n_single <- length(muts_single)
            n_guppy <- length(muts_guppy)
            n_noweight <- length(muts_noweight)
            n_tot <- n_single+n_guppy+n_noweight
            df <- data.frame(barcode = rep(kt7_barcodes[i],n_tot),
                            nseqs=s,repetition=r,
                            mutation=c(muts_single,muts_guppy,muts_noweight),
                            method = c(rep("SINGLe", n_single),
                                       rep("Guppy",n_guppy), 
                                       rep("No weights",n_noweight)))
            
            write.table(df, append = T, col.names = F,row.names = F, file = file_out_consensus)
            counter <- counter+1
            setTxtProgressBar(pb,counter)
        }
    }

}
rm(data)

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

* Consensus by SINGLe, Guppy and No Weights, sorting homopolymers
```{r,cons subsets single homopol, eval= F}
tic <- proc.time()
for(i in 1:7){

  #Load corrected data
    seqs_single <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, kt7_barcodes[i],"_SINGLe.fastq"))
    aux_seqs <- vapply(as.character(seqs_single),strsplit, split="")
    aux_quals <- 1-as(quality(seqs_single),"NumericList")
    aux_pos <- lapply(aux_seqs, seq_along)
    aux_names <- c(vapply(names(seqs_single),rep, each=1662))
    data_single <- data.frame(nucleotide = unlist(aux_seqs),
                                p_SINGLe=unlist(aux_quals),
                                position=unlist(aux_pos) , 
                            SeqID=aux_names)
    rownames(data_single)<-NULL
    
    seqs_original <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, kt7_barcodes[i],"_SINGLe_original.fastq"))
    aux_seqs <- vapply(as.character(seqs_original),strsplit, split="")
    aux_quals <- 1-as(quality(seqs_original),"NumericList")
    aux_pos <- lapply(aux_seqs, seq_along)
    aux_names <- c(vapply(names(seqs_original),rep, each=1662))
    data_guppy <- data.frame(SeqID=aux_names,
                            position=unlist(aux_pos), 
                            nucleotide = unlist(aux_seqs),
                            p_Guppy=unlist(aux_quals)
                            )
    rownames(data_guppy) <- NULL

    data <- full_join(data_single,data_guppy, by=c("SeqID","nucleotide","position")) %>%
        mutate(isWT = nucleotide==wildtype[position]) %>%
        mutate(p_SINGLe=ifelse(is.na(p_SINGLe), 1, p_SINGLe))%>%
        mutate(p_noweights=1)
    


    #Compute consensus for each barcode using all sequences available and using the three available methods
    file_out_consensus<- paste0(path_analysis_lib7,kt7_barcodes[i],"_ConsensusBySINGLE_rmHomopolforVCS.txt")
    if(file.exists(file_out_consensus)){file.remove(file_out_consensus)}
    cat("Barcode\tnseqs\trepetition\tmutation\n", file=file_out_consensus,append=F)
    pb <- txtProgressBar(0,length(subsets)*n_repetitions,style=3)

    counter <- 0
    bc_seqsID <- unique(data$SeqID)
    for( s in subsets){
        for (r in 1:n_repetitions) {
            subset_bc_reads <- data %>% 
                filter(SeqID %in% sample(bc_seqsID, s))

            index_homopolymers <- which(subset_bc_reads$position %in% pos_wt_homopolymers & subset_bc_reads$nucleotide=="-")
            n_memory <- c()
            for(n in index_homopolymers){
                if(n %in% n_memory){next()}
                hp_pos <- subset_bc_reads$position[n]
                hp_pos_ref <- wt_homopolymers[vapply(wt_homopolymers, function(x){hp_pos %in% x})][[1]]
                hp_vec_logical <- hp_pos_ref==hp_pos
                hp_vec_length <- length(hp_vec_logical)
                hp_vec <- rep(NA,hp_vec_length)
                ind_aux <- which(hp_vec_logical)
        
                hp_vec[1:ind_aux] <- rev(n+1-1:ind_aux)
                if(ind_aux<hp_vec_length){
                    hp_vec[(ind_aux+1):hp_vec_length] <- n+c(1:(hp_vec_length-ind_aux))
                }
                n_memory <- hp_vec
        
                data_aux <- subset_bc_reads[hp_vec,]
                if(all(data_aux$nucleotide=="-")){next()}
        
                pos <- data_aux$position
                nucl <- data_aux$nucleotide
                ind_order <- c(which(nucl!="-"),which(nucl=="-"))
        
                subset_bc_reads$position[hp_vec] <- pos[order(ind_order)]
        
                if(length( which(subset_bc_reads$position %in% pos_wt_homopolymers &
                                 subset_bc_reads$nucleotide=="-"))<length(index_homopolymers)){
                    cat("n=",n,"\n")
                    stop()
                }
            }
            subset_bc_reads <- subset_bc_reads %>% arrange(SeqID,position)
    
            cons_single <- weighted_consensus(df = subset_bc_reads %>%
                                                  select(nucleotide,p_SINGLe,position), 
                                           cutoff_prob = 0)
            cons_guppy  <- weighted_consensus(df = subset_bc_reads %>% 
                                                  select(nucleotide,p_Guppy,position),
                                           cutoff_prob = 0)
            cons_noweight <- weighted_consensus(df = subset_bc_reads %>%
                                                    select(nucleotide,p_noweights,position),
                                            cutoff_prob = 0)
            muts_single   <- detect_mutations(biostr_to_char(cons_single), wildtype)
            muts_guppy    <- detect_mutations(biostr_to_char(cons_guppy), wildtype)
            muts_noweight <- detect_mutations(biostr_to_char(cons_noweight), wildtype)

            n_single <- length(muts_single)
            n_guppy <- length(muts_guppy)
            n_noweight <- length(muts_noweight)
            n_tot <- n_single+n_guppy+n_noweight
          if(n_tot==0){
               df <- data.frame(barcode = rep(kt7_barcodes[i],3),
                            nseqs=s,
                            repetition=r,
                            mutation="wildtype",
                            method = c("SINGLe", "Guppy","No weights"))
       
          }else{
            df <- data.frame(barcode = rep(kt7_barcodes[i],n_tot),
                            nseqs=s,
                            repetition=r,
                            mutation=c(muts_single,muts_guppy,muts_noweight),
                            method = c(rep("SINGLe", n_single),rep("Guppy",n_guppy), rep("No weights",n_noweight)))
          }
            write.table(df, append = T, col.names = F,row.names = F, file = file_out_consensus)
            counter<- counter+1
            setTxtProgressBar(pb,counter)
        }
    }

}

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

* Consensus by Nanopolish and Racon+Medaka:

Prepare fast5 files for nanopolish. Please indicate proper paths indicated in capital letters
```{r, fast5 for nanopolish, eval= F}
tic <- proc.time()
PATH_TO_FAST5_FILES<- "2_NanoporeRawReads/SmallSet_fast5/" #PATH_TO_FAST5_FILES location of nanopore reads
dir.create(paste0(path_analysis_lib7,"fast5files/")) # individual fast5 files will be stored there temporally 

# Split to individual files
split_fast5 <- paste0("multi_to_single_fast5 -i ",PATH_TO_FAST5_FILES, " -s ",path_analysis_lib7, "fast5files/")  
system2(split_fast5)

input_table_barcodes <- read.table(paste0(path_analysis_lib7,"barcoding_summary.txt"),header=T)
input_table_barcodes <- input_table_barcodes %>%
    filter(barcode_arrangement %in% kt7_barcodes) %>%
    select(read_id,barcode_arrangement)


# Split individual files into folders according to barcode
individual_files <- dir(pattern="*.fast5",path=paste0(path_analysis_lib7,"fast5files/"),recursive=T)
individual_files <- data.frame(file =individual_files)
individual_files$read_id <- vapply(individual_files$file, function(x) { 
    sub(pattern = ".fast5",replacement = "",x=strsplit(x,split="/", fixed=T)[[1]][2])
    })
individual_files <- left_join(individual_files,input_table_barcodes)

for(bc in kt7_barcodes){
    dir.create(paste0(path_analysis_lib7,"fast5files/", bc))
    barcode_files <- individual_files %>% filter(barcode_arrangement==bc)
    
    commands <- paste0("mv ",path_analysis_lib7,"fast5files/", barcode_files$file," ", path_analysis_lib7,"fast5files/",bc,"/")
    vapply(commands,system2)
}

# Create one file per barcode
dir.create(paste0(path_analysis_lib7, "fast5files_grouped"))
for(bc in kt7_barcodes){
    dir.create(paste0(path_analysis_lib7, "fast5files_grouped/",bc))
    command <- paste0("single_to_multi_fast5 -i ",path_analysis_lib7,"fast5files/", bc, " -s ", path_analysis_lib7,"fast5files_grouped/",bc, " -f ", bc, " -n ", 1000)
    system2(command)
}

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

Create bash file to compute consensus by nanopolish and medaka
```{r, eval= F}
tic <- proc.time()
dir.create(paste0(path_analysis_lib7,"subsets/"))
wd <- getwd()
for(bc in kt7_barcodes){
    cat(bc,"\n")

    store_path <- paste0(path_analysis_lib7,"subsets/")
    fastq_file <- readLines(paste0(path_analysis_lib7, bc,".fastq"))
    name_lines <- grep(pattern="^@",x=fastq_file)
    bash_file <- paste0(path_analysis_lib7,bc, ".sh")
    cat("#!/bin/bash \n",file=bash_file, append=F)
    cat(paste0("cd ", wd,"\n"), file=bash_file,append=T)

    results_racon      <- paste0(path_analysis_lib7,bc,"_ConsensusbyRacon.txt")
    results_medaka     <- paste0(path_analysis_lib7,bc,"_ConsensusbyMedaka.txt")
    nanopolish_results <- paste0(path_analysis_lib7,bc,"_ConsensusbyNanopolish.txt")
    file.create(results_racon,overwrite=TRUE)
    file.create(results_medaka,overwrite=TRUE)
    file.create(nanopolish_results,overwrite=TRUE)
    
    for(ns in subsets){
        cat("\t subset size",ns,"\n")
        if(ns> length(name_lines)){next()}
        for(r in 1:n_repetitions){
            # cat("----------------",ns, r, "\n")
            basename <- paste0(bc,"_",ns,"_",r)
            filename <- paste0(store_path,basename)
            fastq <- paste0(filename,".fastq")

            ## Make file of subset of sequences
            choose_lines <- sample(name_lines, size= ns)
            choose_lines_all <- sort(c(choose_lines,choose_lines+1,choose_lines+2,choose_lines+3))
            writeLines(text = fastq_file[choose_lines_all], con = file(fastq))

            sam        <- paste0(filename,".sam")
            sorted     <- paste0(filename,".sorted.fasta")
            racon      <- paste0(filename,".racon.fasta")
            medaka     <- paste0(filename,"_medaka.fasta")
            nanopolish <- paste0(filename,"_nanopolish.txt")

            ## Run minimap2
            line_minimap <- paste0("minimap2 -ax map-ont ", reference_file, " ", fastq," > ", sam)

            ## Run racon
            line_racon <- paste0("racon ",fastq," ",sam, " ",reference_file, " >> ", racon)
            line_results_racon <- paste0("echo '>",basename, "' >> ",results_racon)
            line_results_racon2 <- paste0("tail -n 1 ",racon, " >> ",results_racon)
            line_remove_racon <- paste0("rm ", racon,".fai ",racon, ".mmi ",racon  )

            ## Run Medaka
            line_medaka <- paste0("medaka_consensus -i ",fastq," -d ",racon,"  -o ", medaka," -t 4 ")
            line_results_medaka <- paste0("echo '>",basename, "' >> ",results_medaka)
            line_results_medaka2 <- paste0("tail -n 1 ",filename,"_medaka.fasta/consensus.fasta", " >> ",results_medaka)
            line_remove_medaka <- paste0("rm -r ",filename,"_medaka.fasta/")

            ## Run nanopolish
            line_nanopolish_index  <- paste0("nanopolish index -d ", path_analysis_lib7, "fast5files_grouped/",bc," ",fastq)
            line_nanpolish_sort    <- paste0(" samtools sort ",sam," -o ",filename,".sorted.fastq -T temporal.tmp ")
            line_nanopolish_index2 <- paste0("samtools index ", filename,".sorted.fastq")
            line_nanopolish_variants <- paste0("nanopolish variants --consensus -o ", nanopolish, " -r ",fastq," -b ",filename,".sorted.fastq -g ", reference_file)
            line_results_nanopolish <- paste0("awk '!/^#/ {print \"",filename,"\\t\" $0}' ",nanopolish, " >> ",nanopolish_results)
            line_remove_nanopolish <- paste0("rm ", filename,".fastq.index* ", filename,".sorted* ", nanopolish, " ", filename,".sam ")

            cat(line_minimap,
                line_racon,
                line_results_racon,
                line_results_racon2,
                line_medaka,
                line_results_medaka,
                line_results_medaka2,
                line_nanopolish_index,
                line_nanpolish_sort,
                line_nanopolish_index2,
                line_nanopolish_variants,
                line_results_nanopolish,
                line_remove_racon,
                line_remove_medaka,
                line_remove_nanopolish,
                file=bash_file, sep="\n", append=T)
        }
    }
}

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

Run barcodeXX.sh bash files in a terminal with conda activate medaka:

```{bash, eval= F}
cd 3_Analysis/KT7_1700_2100
chmod +x *sh

conda activate medaka

nohup ./barcode05.sh > bc05_out.log 2>bc05_err.log &
nohup ./barcode06.sh > bc06_out.log 2>bc06_err.log &
nohup ./barcode07.sh > bc07_out.log 2>bc07_err.log &
nohup ./barcode08.sh > bc08_out.log 2>bc08_err.log &
nohup ./barcode09.sh > bc09_out.log 2>bc09_err.log &
nohup ./barcode10.sh > bc10_out.log 2>bc10_err.log &
nohup ./barcode11.sh > bc11_out.log 2>bc11_err.log &

```

Run NextPolish

```{r, nextpolish, eval= F}
tic <- proc.time()
create_run_cfg <- function(run_cfg_file, ref_fasta, workdir){
    cat("[General]\n", file=run_cfg_file, append=FALSE)
    cat('job_type = local\n', file=run_cfg_file, append=TRUE)
    cat('job_prefix = nextPolish\n', file=run_cfg_file, append=TRUE)
    cat('task = best\n', file=run_cfg_file, append=TRUE)
    cat('rewrite = yes\n', file=run_cfg_file, append=TRUE)
    cat('rerun = 3\n', file=run_cfg_file, append=TRUE)
    cat('parallel_jobs = 2\n', file=run_cfg_file, append=TRUE)
    cat('multithread_jobs = 4\n', file=run_cfg_file, append=TRUE)
    cat('genome =', ref_fasta,' #genome file\n', file=run_cfg_file, append=TRUE)
    cat('genome_size = auto\n', file=run_cfg_file, append=TRUE)
    cat('workdir =',workdir, "\n",file=run_cfg_file, append=TRUE, sep="")
    cat('   polish_options = -p {multithread_jobs}\n', file=run_cfg_file, append=TRUE)

    cat('[lgs_option]\n', file=run_cfg_file, append=TRUE)
    cat('lgs_fofn =lgs.fofn\n', file=run_cfg_file, append=TRUE)
    cat('lgs_options = -min_read_len 1k -max_depth 100\n', file=run_cfg_file, append=TRUE)
    cat('lgs_minimap2_options = -x map-ont\n', file=run_cfg_file, append=TRUE)
    return(0)
}

dir.create(paste0(path_analysis_lib7,"nextpolish/"))
system2(paste0("cp ", reference_file, " ",path_analysis_lib7,"nextpolish/"))
for(bc in kt7_barcodes){
    save_file <- paste0(path_analysis_lib7,bc, "_ConsensusbyNextPolish.fasta")

    for(ns in subsets){
        cat("\t",ns,"\n")
        if(ns> length(name_lines)){next()}
        for(r in 1:n_repetitions){
            cat("----------------",ns, r, "\n")

            #Create file with subset of sequences
            subset_name <- paste0(bc,"_",ns, "_",r)
            filename_subset <- paste0(run_folder,subset_name,".fastq")
           
            ## Run nextpolish
            system2(paste0("mv ", path_analysis_lib7,"subsets/", subset_name, ".fastq ", path_analysis_lib7,"nextpolish/"))
            system2(paste0("echo '", subset_name,".fastq' >", path_analysis_lib7,"nextpolish/lgs.fofn"))
            create_run_cfg(run_cfg_file=paste0(path_analysis_lib7,"nextpolish/run.cfg"),
                           ref_fasta="KTrefORF_1662.fasta",
                           workdir=paste0(subset_name,"/"))
            system2(paste0("cd ",path_analysis_lib7,"nextpolish/ ; nextPolish run.cfg"))
            system2(paste0("sed -i '1s/.*/>",subset_name,"'/ ",path_analysis_lib7,"nextpolish/",subset_name, "/genome.nextpolish.fasta "))
            system2(paste0("cat ",path_analysis_lib7,"nextpolish/", subset_name, "/genome.nextpolish.fasta >>", save_file))
            system2(paste0("rm -r ", path_analysis_lib7,"nextpolish/", subset_name))
            system2(paste0("rm -r ", path_analysis_lib7,"nextpolish/", subset_name,".fastq"))
            system2(paste0("rm -r ", path_analysis_lib7,"nextpolish/lgs.fofn"))
            system2(paste0("rm -r ", path_analysis_lib7,"nextpolish/run.cfg"))
        }
    }
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

Identify mutations
```{r,identify mutations, eval= F}
tic <- proc.time()
df_nextpolish <- data.frame(
    Barcode=NA,
    Nreads=NA,
    repetition=NA,
    mutation=NA
)
for(bc in kt7_barcodes){
    np<- readDNAStringSet(paste0(path_analysis_lib7,bc, "_ConsensusbyNextPolish.fasta"))
    
    ali <- pairwiseAlignment(pattern = np, subject = wildtypye_bstr)
    for(i in seq_along(ali)){
        ref <- biostr_to_char(subject(ali[i]))
        seq <- biostr_to_char(pattern(ali[i]))
        ind_noI <- which(ref!="-")
        ref <- ref[ind_noI]
        seq <- seq[ind_noI]
        mutations <- detect_mutations(seq,ref)
        ids <- str_split(names(np)[i], "_")[[1]]
        if(length(mutations)==0){next()}
        df_aux <- data.frame(Barcode=ids[1], 
                             Nreads=as.numeric(str_remove(ids[2],"s")), 
                             repetition=as.numeric(str_remove(ids[3],"r")), 
                             mutation = mutations)
        df_nextpolish <- rbind(df_nextpolish,df_aux)
    }
}
df_nextpolish <- df_nextpolish %>% filter(!is.na(Barcode))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```


Re-shape results to a unique matrix
```{r, eval= F}
tic <- proc.time()
# Racon
racon_mismatches <- data.frame(Barcode=NA,Nreads=NA,repetition=NA,Position=NA,base.original=NA,base.mutated=NA,mutation=NA)
for(bc in kt7_barcodes){
    racon_consensus <- readDNAStringSet(paste0(path_analysis_lib7,bc,"_ConsensusbyRacon.txt"))
    for(i in seq_along(racon_consensus)){
        name <- names(racon_consensus)[i]
        info <- strsplit(split="_",x=name)[[1]]

        racon <- racon_consensus[[i]]
        ali <- pairwiseAlignment(wildtype_bstr,racon)
        ref <- strsplit(as.character(pattern(ali)), split="")[[1]]
        read <- strsplit(as.character(subject(ali)),split="")[[1]]
        index <- which(ref!=read)
        mismatches <- paste0(ref[index],index,read[index])
        if(length(mismatches)>0){
            df_aux <- data.frame(Barcode=bc,Nreads=info[2],repetition=info[3],Position=index,
                                 base.original=ref[index],base.mutated=read[index],mutation=mismatches)
            racon_mismatches <- racon_mismatches %>% rbind(df_aux)
        }
    }
}
racon_mismatches <- racon_mismatches %>% filter(!is.na(Barcode)) %>% mutate(method="Racon",homopolymers=NA)

# Medaka
medaka_mismatches <- data.frame(Barcode=NA,Nreads=NA,repetition=NA,Position=NA,base.original=NA,base.mutated=NA,mutation=NA)
for(bc in kt7_barcodes){
    medaka_consensus <- readDNAStringSet(paste0(path_analysis_lib7,bc,"_ConsensusbyMedaka.txt"))
    for(i in seq_along(medaka_consensus)){
        name <- names(medaka_consensus)[i]
        info <- strsplit(split="_",x=name)[[1]]

        medaka <- medaka_consensus[[i]]
        ali <- pairwiseAlignment(wildtype_bstr,medaka)
        ref <- strsplit(as.character(pattern(ali)), split="")[[1]]
        read <- strsplit(as.character(subject(ali)),split="")[[1]]
        index <- which(ref!=read)
        mismatches <- paste0(ref[index],index,read[index])
        if(length(mismatches)>0){
            df_aux <- data.frame(Barcode=bc,Nreads=info[2],repetition=info[3],Position=index,
                                 base.original=ref[index],base.mutated=read[index],mutation=mismatches)
            medaka_mismatches <- medaka_mismatches %>% rbind(df_aux)
        }
    }
}
medaka_mismatches <- medaka_mismatches %>% filter(!is.na(Barcode))  %>% mutate(method="Medaka",homopolymers=NA)

# Nanopolish
nanopolish_mismatches <- data.frame(Barcode=NA,Nreads=NA,repetition=NA,Position=NA,base.original=NA,base.mutated=NA)
for(bc in kt7_barcodes){

    results_nanopolish <- system2(paste0("awk \'/^[^#]/ \' ",bc,"_ConsensusbyNanopolish.txt"), intern = T)
    if(length(results_nanopolish)==0){next()}
    aux_nanopolish <- do.call(rbind,vapply(results_nanopolish, strsplit, split="\t"))
    colnames(aux_nanopolish) <- rownames(aux_nanopolish) <- NULL
    aux_nanopolish <- aux_nanopolish[,c(1,3,5,6)]
    info <- vapply(aux_nanopolish[,1],strsplit,split="_")

    if(nrow(aux_nanopolish)>0){
        aux_nanopolish <- data.frame(Barcode=bc,
                                     Nreads=vapply(info, function(x){x[2]}),
                                     repetition=vapply(info, function(x){x[3]}),
                                     Position=aux_nanopolish[,2],
                                     base.original=aux_nanopolish[,3],base.mutated=aux_nanopolish[,4])
        nanopolish_mismatches <- nanopolish_mismatches %>% rbind(aux_nanopolish)
    }
}

nanopolish_mismatches <- nanopolish_mismatches %>% filter(!is.na(Barcode)) %>%
    mutate(Nreads=as.numeric(Nreads), repetition=as.numeric(repetition), Position=as.numeric(Position))

#remove insertions
ind_insertions <- nchar(nanopolish_mismatches$base.mutated)>1
nanopolish_mismatches$base.mutated[ind_insertions] <- vapply(nanopolish_mismatches$base.mutated[ind_insertions] , substr,1,1)
nanopolish_mismatches <- nanopolish_mismatches %>% filter(base.mutated!=base.original)

#split deletions
index_aux <- which(nchar(nanopolish_mismatches$base.original)==2)
nanopolish_mismatches$base.original[index_aux] <- substr(nanopolish_mismatches$base.original[index_aux],2,2)
nanopolish_mismatches$base.mutated[index_aux] <- "-"
nanopolish_mismatches$Position[index_aux] <- as.numeric(nanopolish_mismatches$Position[index_aux])+1

index_aux_3 <- which(nchar(nanopolish_mismatches$base.original)==3)
aux_df_1 <- nanopolish_mismatches[index_aux_3,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"

aux_df_2 <- nanopolish_mismatches[index_aux_3,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"

nanopolish_mismatches <- nanopolish_mismatches[-index_aux_3,]
nanopolish_mismatches <- nanopolish_mismatches %>% rbind(aux_df_1) %>% rbind(aux_df_2) %>% arrange(Barcode,Nreads,repetition,Position) %>% mutate(method="Nanopolish",homopolymers=NA, mutation=paste0(base.original,Position,base.mutated))

#SINGLe

single_mismatches <- lapply(kt7_barcodes,
                              function(x){
                                  name=paste0(x,"_ConsensusBySINGLE.txt")
                                  y = read.table(name, header=T,row.names=NULL)
                                  return(y)})
colnames(single_mismatches) <- c("Barcode","Nreads","repetition","mutation","method")
single_mismatches$homopolymers <- "original"


single_mismatches_sortHP <- lapply(kt7_barcodes,
                                   function(x){
                                       name=paste0(x,"_ConsensusBySINGLE_50reps_rmHomopolforVCS.txt")
                                       y = read.table(name, header=T,row.names=NULL)
                                       return(y)})
colnames(single_mismatches_sortHP) <- c("Barcode","Nreads","repetition","mutation","method")
single_mismatches_sortHP$homopolymers = "sorted_at_VCS"

single_mismatches <- rbind(single_mismatches_sortHP,single_mismatches) %>%
    mutate(base.original = str_sub(mutation,1,1),
           base.mutated=str_sub(mutation,-1,-1),
           position=str_sub(mutation,start=2,end=-2)) %>%
    mutate(position = as.numeric(position)) %>% dplyr::rename(Position=position)  %>%
    mutate(method = recode(method,!!!setNames(c("SINGLe","Guppy","No weights"),c("single","nanopore","no_weights"))))


df_nextpolish  <- df_nextpolish %>%
    mutate(method="NextPolish") %>%
    mutate(homopolymers=NA) %>%
    mutate(base.original = str_sub(mutation, 1,1)) %>%
    mutate(base.mutated = str_sub(mutation, -1,-1)) %>%
    mutate(Position = str_sub(mutation,2,-2)) 

#Save
kt7_mismatches_subsets_allMethods <- rbind(medaka_mismatches,nanopolish_mismatches,single_mismatches, df_nextpolish)
write.table(kt7_mismatches_subsets_allMethods, file=paste0(path_analysis_lib7, "KT7_ConsensusConvergence.txt"))

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

* Consensus by NextPolish

Load results of previous chunks
```{r, eval= F}
kt7_mismatches_subsets_allMethods <- read.table(paste0(path_analysis_lib7, "KT7_ConsensusConvergence.txt"), header=T)
```

Compute averages
```{r, eval= F}
tic <- proc.time()
consensus_by_method <- kt7_mismatches_subsets_allMethods %>%
    mutate(variant = recode(Barcode, !!!kt7_bc2mut))

#correct equivalent deletion
ind_aux <- which(consensus_by_method$mutation=="G489-")
consensus_by_method$mutation[ind_aux] <- "G490-"
consensus_by_method$Position[ind_aux] <- 490

consensusStrand_by_method <- consensus_by_method  %>%
    group_by(Barcode,variant,Nreads,repetition,method,homopolymers) %>%
    summarise(consensus = paste(mutation, collapse=" ")) %>%
    left_join(kt7_true_mutants, by="Barcode") %>%
    mutate(correct = consensus==true_consensus)

n_correct_consensus_average <- consensusStrand_by_method %>%
    group_by(Barcode,variant,Nreads,method,homopolymers) %>%
    summarise(total_correct = sum(correct)) %>%
    mutate(success_rate = total_correct/50)


nmismatches_by_method <- consensus_by_method %>%
    group_by(Barcode,variant,Nreads,repetition, method,homopolymers) %>%
    summarise(n_mismatches=n())

average_nmismatches_by_method <- nmismatches_by_method %>%
    group_by(Barcode,variant,Nreads,method,homopolymers) %>%
    summarise(mean=sum(n_mismatches)/50,median=median(n_mismatches))

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
```

### Figure 2C

```{r, fig.width=3, fig.height=3, eval= F}

linetype_vector <- c("solid","solid","dashed","dashed","solid","solid")
names(linetype_vector) <- c("Medaka","Nanopolish","Guppy","No weights","SINGLe","NextPolish")

average_mismatch_to_plot <- average_nmismatches_by_method %>% 
    filter(method !="Racon") %>%
    filter((method=="SINGLe" & homopolymers=="sorted_at_VCS") |
               is.na(homopolymers) | 
               (method!="SINGLe" & homopolymers=="original") )

n_correct_to_plot <- n_correct_consensus_average %>% filter(method !="Racon") %>%
    filter((method=="SINGLe" & homopolymers=="sorted_at_VCS") |is.na(homopolymers) | (method!="SINGLe" & homopolymers=="original") )

Figure_2C <- ggplot(average_mismatch_to_plot %>%filter(Barcode=="barcode09"),
                           aes(x=Nreads, y=mean, col=method,lty=method))+
    geom_point(size=0.5)+geom_line()+
    ylab("Mismatches")+
    xlab("Reads")+xlim(c(0,20))+
    scale_color_manual(values=colors_vector_capital)+
    scale_linetype_manual(values=linetype_vector, guide="none")+
    theme_plot+
    theme(legend.position = "none",plot.margin = unit(c(0,5.5,5.5,5.5),"points"))+
    labs(tag = "C")

Figure_2C
```

### Figure 2D
```{r, eval= F}
Figure_2D <- ggplot(n_correct_to_plot ,
                     aes(x=Nreads,y=success_rate,col=method, lty=method))+
    geom_point(size=1)+
    geom_line()+
    guides(col=guide_legend(ncol=1))+
    geom_hline(aes(yintercept=0.9),col=grey(.5),linetype="dashed")+
    scale_color_manual(values=colors_vector_capital[-6])+
    scale_linetype_manual(values=linetype_vector, guide="none")+
    theme_plot+
    scale_y_continuous(breaks=c(0,.5,1), limits = c(0,1))+
    scale_x_continuous(breaks=c(10,30,50))+
    theme(
          legend.position = c(0.8,0.5),
          legend.box.margin = margin(0,0,0,0),
          legend.box.spacing=unit(1,"pt"))+
    facet_wrap(~variant, ncol=3, dir = "v")+
    labs(tag = "D",x="Reads",y="Success rate",col="Method")

Figure_2D
```

### Figure 2 composition

```{r, fig.width=7, fig.height=10, eval= F}
Figure2 <- grid.arrange(Figure_2A , Figure_2B ,Figure_2C, Figure_2D, 
             layout_matrix=matrix(c(1,2,3,4,4,4), byrow = T,nrow=2), 
             heights=c(1,3))
Figure2
ggsave(Figure2, file=paste0(path_save_figures,"Figure2.png"), dpi = 'print', width = 16, height=20,units="cm")

```

## Sensibility and sensitivity of consensus by method

Compute TP/FP/TN/FN by method
```{r, eval= F}
tic <- proc.time()
kt7_mismatches_subsets_allMethods <- read.table(paste0(path_analysis_lib7, "KT7_ConsensusConvergence.txt"), header=T)

consensus_by_method <- kt7_mismatches_subsets_allMethods %>%
    mutate(variant = recode(Barcode, !!!kt7_bc2mut))

data_mutations_local <- kt_true_mutations %>%
    mutate(wt = wildtype[position]) %>%
    mutate(mutation =paste0(wt,position,nucleotide)) %>%
    dplyr::rename(Barcode=sequence_id)%>%
    select(Barcode, mutation) %>%
    mutate(true_mut =1) %>%
    group_by(Barcode) %>%
    mutate(n_muts = n())

a <- consensus_by_method %>%
    filter(method!="Racon") %>%
    filter((method=="SINGLe" & homopolymers=="sorted_at_VCS") |is.na(homopolymers) | (method!="SINGLe" & homopolymers=="original") )%>%
    select(Barcode, Nreads, repetition, mutation, method) %>%
    left_join(data_mutations_local %>% select(-n_muts), by=c("Barcode", "mutation")) %>%
    mutate(true_mut = ifelse(is.na(true_mut), 0,true_mut)) %>%
    left_join(data_mutations_local %>% select(-true_mut, - mutation) %>% unique(), by=c("Barcode"))%>%
    mutate(variant = recode(Barcode, !!!kt7_bc2mut)) %>%
    select(-Barcode) %>%
    dplyr::rename(Method=method)

class_vec <- c("True mutations", "False mutations", "False wild type", "True wild type", "Detected mutations")
names(class_vec) <- c("true_pos","false_pos","false_neg","true_neg", "total_mut")

sum_a_melt_av <- a %>%
    group_by(variant,Nreads, repetition,Method) %>%
    summarise(total_mut = n(),
              true_pos = sum(true_mut),
              false_pos =sum(1-true_mut),
              n_muts=mean(n_muts)) %>%
    mutate(false_neg = n_muts-true_pos, true_neg=1662-total_mut-false_neg) %>%
    reshape2::melt(c("variant","Nreads","repetition","Method")) %>%
    group_by(variant,Nreads, Method, variable) %>%
    dplyr::summarise(mean_value = mean(value))%>%
    mutate(variable = recode(variable, !!!class_vec))

sum_sens <- sum_a_melt_av %>%
    group_by(Nreads, variant, Method) %>%
    reshape2::dcast(Nreads+ variant+Method~variable) %>%
    mutate(sensitivity = `True mutations`/(`True mutations` + `False wild type`) ) %>%
    mutate(specificity = `True wild type`/(`True wild type` + `False mutations`) )

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

### Figure S6
```{r, fig.width=7, fig.height=10, eval= F}
Figure_S6A <- ggplot(sum_a_melt_av %>% filter(Nreads <10 & variable != "n_muts" ),
       aes(x=Nreads, y=mean_value, col=Method))+
    geom_line()+
    facet_grid(variable~variant, scales = "free_y", labeller = label_wrap_gen(width=10))+
    xlab("Reads")+ylab("Mean of counts")+
    theme_plot+
    scale_color_manual(values=colors_vector_capital[-6])+
    theme(legend.position = "none")+
    ggtitle(label="A")

Figure_S6B <-ggplot(sum_sens %>% filter(Nreads <10  ),
       aes(x=specificity, y=sensitivity, col=Method))+
    facet_grid(~variant,  labeller = label_wrap_gen(width=10))+
    geom_point()+geom_line()+
    theme_plot+
    scale_color_manual(values=colors_vector_capital[-6])+
    theme(legend.position = "bottom", strip.background = element_blank(),
          strip.text.x = element_blank(),plot.margin = margin(t = 0,r=35,b = 0,l = 10))+
    ggtitle(label="B")+
    scale_x_continuous(breaks=c(0.990,1))+
    scale_y_continuous(breaks=c(0.2,0.6,1.0))


Figure_S6 <- grid.arrange(Figure_S6A , Figure_S6B,
                        layout_matrix=matrix(c(1,2), byrow = T,nrow=2),
                        heights=c(2.5,1))

ggsave(Figure_S6,filename=paste0(path_save_figures,"FigureS6.png"), width=16, height = 18, units = "cm", dpi='print')


```


------------------------------------------------------------------------
### Figure S4 (homopolymers)

Homopolymers analysis
```{r, eval= F}
Figure_S4 <- ggplot(n_correct_consensus_average %>% filter(method %in% c("SINGLe","No weights","Guppy")),
                                    aes(x=Nreads,y=success_rate,col=method,lty=homopolymers))+
    geom_line()+facet_grid(method~variant)+
    ylab("Success rate")+geom_hline(aes(yintercept=0.9),col=grey(.5), lty="dashed")+
    xlab("Reads")+
    scale_color_manual(values=colors_vector_capital[c(1,3,5)],name="Method")+
    scale_linetype_manual(values=c("solid","dashed"),name="Homopolymers",labels=c("Original","Sorted"))+
    scale_y_continuous(breaks=c(0,.5,1), limits = c(0,1))+scale_x_continuous(breaks=c(10,20,30,40,50))+
    theme_plot+
    theme(legend.position = "bottom",legend.spacing.y = unit(0,"line"),
          legend.key.size = unit(1.5,"line"),
          plot.margin = unit(c(0,5.5,5.5,5.5),"points"),
          strip.background = element_rect(fill= "white"))
Figure_S4

ggsave(Figure_S4,filename=paste0(path_save_figures,"FigureS4.png"), width=17, height = 10, units = "cm", dpi='print')

```




------------------------------------------------------------------------
# KlenTaq large library 


### Filenames
```{r}
file_out_consensus<- paste0(path_analysis_lib,"/KTLib_ConsensusSINGLe.txt")
file_out_consensus_topten  <- paste0(path_analysis_lib,"KTLib_ConsensusbySINGLe_toptenBC.txt")
file_consensus_KTLib <- paste0(path_analysis_lib, "KTLib_consensus_table.txt")
```


## Basecall, align, detect barcodes, SINGLe
Basecalling
```{bash LIBbasecall, eval=F}
guppy_basecaller -i 2_NanoporeRawReads/LargeSet_fast5 -s 3_Analysis/LargeSet_SUP -c dna_r9.4.1_450bps_sup.cfg -x 'cuda:0' -r
#Filter sequences by length (1700 to 2100 bp)
mkdir 3_Analysis/KTLib/
awk 'NR % 4 ==2 && length($0) > 1700 && length($0) <2100 {print f  "\n" $0;getline;print;getline;print}  {f=$0}' 3_Analysis/LargeSet_SUP/pass/*fastq > 3_Analysis/KTLib/KTLib_1700_2100.fastq
```

Keep reads with mean Qscore\>15
```{r LIBfilter Q, eval=F}
seqsum_ktlib <- read.table("3_Analysis/LargeSet_SUP/sequencing_summary.txt", header=T) %>%
    filter(passes_filtering & mean_qscore_template>15) %>% 
    select(read_id)

file_ktlib_fastq <- "3_Analysis/KTLib/KTLib_1700_2100.fastq"#LargeSet_1700_2100.fastq"
file_ktlib_q15_fastq  <- "3_Analysis/KTLib/KTLib_1700_2100_Q15.fastq"
nlines <- as.numeric(str_split(system2(paste0("wc -l ",file_ktlib_fastq), intern=T), pattern=" ")[[1]][1])
connectionFile <- file(file_ktlib_fastq,"r")
n<-0
pb <- txtProgressBar(0,nlines,style = 3)
while(n < nlines){
  line <- readLines(connectionFile,n=1)
  id <- substr(line,2,37)
  if(id %in% seqsum_ktlib$read_id){
    cat(line,"\n",file = file_ktlib_q15_fastq, append=T, sep="")
    line <- readLines(connectionFile,n=1)
    cat(line,"\n",file = file_ktlib_q15_fastq, append=T, sep="")
    line <- readLines(connectionFile,n=1)
    cat(line,"\n",file = file_ktlib_q15_fastq, append=T, sep="")
    line <- readLines(connectionFile,n=1)
    cat(line,"\n",file = file_ktlib_q15_fastq, append=T, sep="")
  }else{
    line <- readLines(connectionFile,n=3)
  }
    n <- n+4
  setTxtProgressBar(pb,n)
}
 close(connectionFile)
```

Alignment using minimap2
```{bash LIBminimap2, eval=F}
minimap2 -ax map-ont 1_ReferenceFiles/KTB_digested.fasta 3_Analysis/KTLib/KTLib_1700_2100_Q15.fastq > 3_Analysis/KTLib/KTLib_1700_2100_Q15.sam

samtools view -S -b 3_Analysis/KTLib/KTLib_1700_2100_Q15.sam > 3_Analysis/KTLib/KTLib_1700_2100_Q15.bam #TRANSFORM TO BAM
samtools sort 3_Analysis/KTLib/KTLib_1700_2100_Q15.bam -o 3_Analysis/KTLib/KTLib_1700_2100_Q15.sorted.bam #SORT BAM FILE

```

SINGLe correction
```{r LIBSINGLe, eval=F}
tic <- proc.time()
pos_start <- 103
pos_end <- 1764

single_evaluate(bamfile    = "3_Analysis/KTLib/KTLib_1700_2100_Q15.sorted.bam",
                single_fits   = single_train,
                output_file = "3_Analysis/KTLib/KTLib_1700_2100_Q15_SINGLe.fastq",
                refseq_fasta = reference_file,
                pos_start=pos_start,pos_end=pos_end,
                verbose=T,gaps_weights="minimum",save=TRUE, 
                save_original_scores=TRUE)

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```


Barcodes detection. First create bash file.
```{r LIB detect barcodes, eval=F}
tic <-  proc.time()
## Inputs
bc_before   <- "actggc"
bc_after    <- "aagcc"
bc_before   <- toupper(bc_before)
bc_after    <- toupper(bc_after)
l           <- nchar(bc_before)
length_bc   <- 36
length_deltabc <- 2

inFile    <- "KTLib_1700_2100_Q15.fastq"
out.file  <- "KTLib_1700_2100_barcodes.txt"
code.file <- "3_Analysis/KTLib/FindBarcodes.sh"

bc_min <- length_bc-length_deltabc
bc_max <- length_bc+length_deltabc

## All possible variations of barcodes
bc_before_compl <- dna_reverse_complement_string(bc_before)
bc_after_compl <- dna_reverse_complement_string(bc_after)

bc_before_all       <- bc_one_mutation(bc_before)
bc_before_compl_all <- bc_one_mutation(bc_before_compl)
bc_after_all        <- bc_one_mutation(bc_after)
bc_after_compl_all  <- bc_one_mutation(bc_after_compl)

bc_combinations <- rbind(data.frame(before=bc_before,after= bc_after, forw=TRUE),
                         data.frame(before=bc_before_all, after=bc_after, forw=TRUE),
                         data.frame(before=bc_before, after=bc_after_all, forw=TRUE), 
                         data.frame(before=bc_after_compl, after=bc_before_compl, forw=FALSE),
                         data.frame(before=bc_after_compl, after=bc_before_compl_all, forw=FALSE),
                         data.frame(before=bc_after_compl_all, after=bc_before_compl, forw=FALSE))


## Main
if(file.exists(code.file)){file.remove(code.file)} ; file.create(code.file)
cat("#!/bin/bash\n\n",file=code.file)
cat("echo \"Name\tLineInFile\tBCsequence\tBCposition\tDirectionRead\" > ",out.file," \n", file=code.file, append=FALSE)

#2. Sequences with barcodes 
for(i in 1:nrow(bc_combinations)){
  bc_before <- as.character(bc_combinations$before[i])
  bc_after <- as.character(bc_combinations$after[i])
  forward  <- bc_combinations$forw[i]
  
  awk_in  <- paste0("awk '")
  awk_match <- paste0("NR%4==2 && match($0,/", bc_before, "([ACGT]{", bc_min,",",bc_max,"})", bc_after,"/)")
  if(forward){
    awk_print <- paste0(" {print a \"\t\" NR \"\t\" substr($0,RSTART+",l,",RLENGTH-",2*l,") \"\t\" RSTART+",l," \"\t+\" } {a=$0}' ",inFile, ">> ", out.file)
  }else{
    awk_print <- paste0(" {print a \"\t\" NR \"\t\" substr($0,RSTART+",l,",RLENGTH-",2*l,") \"\t\" RSTART+",l," \"\t-\" } {a=$0}' ",inFile, ">> ", out.file)
  }
  y_awk   <- paste0(awk_in, awk_match, awk_print)
  cat(y_awk,"\n", file=code.file, append=TRUE)
}

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
```

Then run bash file.
```{bash, eval=F}
cd 3_Analysis/KTLib/
chmod +x FindBarcodes.sh
./FindBarcodes.sh
```

Filter barcodes that appear more than 3 times
```{r LIBfilter BC>3, eval=F}
tic <- proc.time()
barcodes_table  <- read.table(paste0(path_analysis_lib,"KTLib_1700_2100_barcodes.txt"),header=T, sep="\t")
barcodes_counts <- barcodes_table %>% group_by(BCsequence) %>% summarise(counts=n()) %>% filter(counts>3)
barcodesID      <- unique(barcodes_counts$BCsequence); n_bc <- length(barcodesID)
barcodes_table_fourcounts <- barcodes_table %>% 
    filter(BCsequence %in% barcodesID)%>%
    mutate(readID=str_sub(Name, 2, 37)) ;
nrow_bc <- nrow(barcodes_table_fourcounts)
write.table(barcodes_table_fourcounts,file=paste0(path_analysis_lib,"KTLib_1700_2100_barcodes_4reads.txt"))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

## Consensus for all barcode by all methods

Load data 
```{r, eval=FALSE}
barcodes_4reads <- read.table(file=paste0(path_analysis_lib,"KTLib_1700_2100_barcodes_4reads.txt")) %>%
    select(readID, BCsequence) %>% arrange(BCsequence)

barcodes_unique <- unique(barcodes_4reads$BCsequence)

```



Generate fasta and fastq files that we need
```{r create fast files, eval=FALSE}
tic <- proc.time()
dir.create( paste0(path_analysis_lib, "fastq_per_barcode"))
dir.create( paste0(path_analysis_lib, "fasta_per_barcode"))

KTLib_Q15_seqs <- readQualityScaledDNAStringSet(paste0(path_analysis_lib,"KTLib_1700_2100_Q15.fastq"))
names(KTLib_Q15_seqs) <- str_sub(names(KTLib_Q15_seqs) , 1, 36)
for(s in 1:nrow(barcodes_4reads)){
    seq_name <- barcodes_4reads$readID[s]
    bc_name <- barcodes_4reads$BCsequence[s]
    file_name_q <- paste0(path_analysis_lib, "fastq_per_barcode/", bc_name,".fastq")
    file_name_a <- paste0(path_analysis_lib, "fasta_per_barcode/", bc_name,".fasta")
    # which(names(KTLib_Q15_seqs)==seq_name)
    writeQualityScaledXStringSet(KTLib_Q15_seqs[seq_name], filepath = file_name_q, append = T)
    writeXStringSet(KTLib_Q15_seqs[seq_name], filepath = file_name_a, append = T)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```


Load corrected sequences
```{r LIBconsensus load seqs, eval=F}

tic <- proc.time()
seqs_single_ktlib <- readQualityScaledDNAStringSet(paste0(path_analysis_lib,"/KTLib_1700_2100_Q15_SINGLe.fastq"))
aux_seqs <- vapply(as.character(seqs_single_ktlib),strsplit, split="")
aux_quals <- 1-as(quality(seqs_single_ktlib),"NumericList")
aux_pos <- lapply(aux_seqs, seq)
aux_names <- c(vapply(names(seqs_single_ktlib),rep, each=1662))
data_single_ktlib <- data.frame(nucleotide = unlist(aux_seqs),
                        p_SINGLe=unlist(aux_quals),
                        position=unlist(aux_pos),
                        readID=aux_names) 
rownames(data_single_ktlib)<-NULL

seqs_guppy_ktlib <- readQualityScaledDNAStringSet(paste0(path_analysis_lib,"/KTLib_1700_2100_Q15_SINGLe_original.fastq"))
aux_seqs <- vapply(as.character(seqs_single_ktlib),strsplit, split="")
aux_quals <- 1-as(quality(seqs_single_ktlib),"NumericList")
aux_pos <- lapply(aux_seqs, seq)
aux_names <- c(vapply(names(seqs_single_ktlib),rep, each=1662))
data_guppy_ktlib <- data.frame(nucleotide = unlist(aux_seqs),
                        p_Guppy=unlist(aux_quals),
                        position=unlist(aux_pos),
                        readID=aux_names)
rownames(data_guppy_ktlib)<-NULL


data_single_ktlib <- data_single_ktlib %>% 
    left_join(data_guppy_ktlib, by = c("nucleotide", "position", "readID"))%>%
    left_join(barcodes_4reads, by = "readID")%>%
    filter(!is.na(BCsequence))  %>%
    mutate(isWT = nucleotide==wildtype[position]) %>%
    mutate(p_SINGLe=ifelse(is.na(p_SINGLe), 1, p_SINGLe))%>%
    mutate(p_noweights=1)


toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

barcodes_unique <- unique(data_single_ktlib$BCsequence)
n_bc <- length(barcodes_unique)

# save.image(file='myEnvironment.RData')
```

```{r, eval=F}
# load('myEnvironment.RData')
```

By SINGLe (sorting homopolymers)
```{r LIBconsensus by single, eval=F}
tic <- proc.time()
#Compute consensus for each barcode using all sequences available and using the three available methods

cat("Barcode;Nreads;mutations_by_single;mutations_by_nanopore;mutations_no_weights\n", file=file_out_consensus, append=F)

pb <- txtProgressBar(0,n_bc,style=3)
counter<-0
wt_homopolymers <- detect_homopolymer_positions(wildtype)
wt_homopolymers <- wt_homopolymers[vapply(wt_homopolymers,length)>1]
pos_wt_homopolymers <- unlist(wt_homopolymers)

for( bc in barcodes_unique){
  bc_reads <- data_single_ktlib %>%
    filter(BCsequence==bc) 
  
  index_homopolymers <- which(bc_reads$position %in% pos_wt_homopolymers & bc_reads$nucleotide=="-")
  n_memory <- c()
  for(n in index_homopolymers){
    if(n %in% n_memory){next()}
    hp_pos <- bc_reads$position[n]
    hp_pos_ref <- wt_homopolymers[vapply(wt_homopolymers, function(x){hp_pos %in% x})][[1]]
    hp_vec_logical <- hp_pos_ref==hp_pos
    hp_vec_length <- length(hp_vec_logical)
    hp_vec <- rep(NA,hp_vec_length)
    ind_aux <- which(hp_vec_logical)

    hp_vec[1:ind_aux] <- rev(n+1-1:ind_aux)
    if(ind_aux<hp_vec_length){
      hp_vec[(ind_aux+1):hp_vec_length] <- n+c(1:(hp_vec_length-ind_aux))
    }
    n_memory <- hp_vec
    
    data_aux <- bc_reads[hp_vec,]
    if(all(data_aux$nucleotide=="-")){next()}
    
    pos <- data_aux$position
    nucl <- data_aux$nucleotide
    ind_order <- c(which(nucl!="-"),which(nucl=="-"))
    
    bc_reads$position[hp_vec] <- pos[order(ind_order)]
    
    if(length( which(bc_reads$position %in% pos_wt_homopolymers & bc_reads$nucleotide=="-"))<length(index_homopolymers)){
      cat("n=",n,"\n")
      stop()
    }
  }
  bc_reads <- bc_reads %>% arrange(readID,position)

  cons_single <- weighted_consensus(df = bc_reads %>%
                          select(nucleotide,p_SINGLe,position), 
                           cutoff_prob = 0)
  cons_nanopore <- weighted_consensus(df = bc_reads %>%
                          select(nucleotide,p_Guppy,position), 
                           cutoff_prob = 0)
  cons_noweight <- weighted_consensus(df = bc_reads %>%
                          select(nucleotide,p_noweights,position), 
                           cutoff_prob = 0)

  muts_single <- c(detect_mutations(biostr_to_char(cons_single),  wildtype))
  muts_nanopore <- c(detect_mutations( biostr_to_char(cons_nanopore), wildtype))
  muts_noweight <- c(detect_mutations( biostr_to_char(cons_noweight),  wildtype))

  n_single <- length(muts_single)
  n_nanopore <- length(muts_nanopore)
  n_noweight <- length(muts_noweight)
  
  if(n_single==0){n_single=1;muts_single="wildtype"}
  if(n_nanopore==0){n_nanopore=1;muts_nanopore="wildtype"}
  if(n_noweight==0){n_noweight=1;muts_noweight="wildtype"}
  n_tot = n_single+n_nanopore+n_noweight
      
  df <- data.frame(barcode = rep(bc,n_tot),nseqs=length(unique(bc_reads$readID)),
                  mutation=c(muts_single,muts_nanopore,muts_noweight),
                  method = c(rep("single", n_single),
                             rep("nanopore",n_nanopore),
                             rep("no_weights",n_noweight)))
  
  write.table(df, append = T, col.names = F,row.names = F, file = file_out_consensus)
  counter <- counter+1
  setTxtProgressBar(pb,counter)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

# save.image(file='myEnvironment.RData')

```


Prepare fast5 files for Nanopolish. Please indicate proper paths indicated in capital letters
```{r LIBprepare fast5, eval=F}
tic <- proc.time()
dir.create(paste0(path_analysis_lib,"/fast5files/") )# individual fast5 files will be stored there temporally 

split_fast5 <- paste0("multi_to_single_fast5 -i 2_NanoporeRawReads/LargeSet_fast5 -s 3_Analysis/KTLib/fast5files/")  #PATH_TO_FAST5_FILES location of nanopore reads
system2(split_fast5)

# Place them in folders according to barcode
single_files       <- dir(paste0(path_analysis_lib,"/fast5files/"), full.names = T, recursive = T)
single_files_name  <- dir(paste0(path_analysis_lib,"/fast5files/"), full.names = F, recursive = T)

single_files <- data.frame(file=single_files, readID=single_files_name) %>%
  mutate(readID = sub(readID, pattern=".fast5",replacement="")) %>%
  mutate(readID = sub(readID, pattern="[[:digit:]]+/", replacement=""))

barcodes_table_fourcounts <- barcodes_table %>% left_join(single_files, by="readID")
nrow_bc <- nrow(barcodes_table_fourcounts)
pb <- txtProgressBar(min = 0, max = nrow_bc, style = 3)
for(i in 1:nrow_bc){
  setTxtProgressBar(pb,i)
  seqid <- barcodes_table_fourcounts$readID[i]
  barcode <- barcodes_table_fourcounts$BCsequence[i]
  barcode_dir <- paste0(path_analysis_lib,"fast5files/",barcode)
  file <- barcodes_table_fourcounts$file[i]
  if(!dir.exists(barcode_dir)){ dir.create(barcode_dir)}
  command <- paste0("cp ", file, " ",barcode_dir)
  system2(command)
}

# Create unique fast5 file per barcode
pb <- txtProgressBar(min = 0, max = nrow_bc, style = 3)
for(bc in barcodesID){
  setTxtProgressBar(pb,bc)
  command <- paste0("single_to_multi_fast5 -i 3_Analysis/KTLib/fast5files/",bc," -s 3_Analysis/KTLib/fast5files_byBC/ -f ",bc)
  system2(command)
}

barcodes_table_fourcounts <- barcodes_table_fourcounts %>% select(-file)
rm(single_files,path_fast5,path_fast5_individual,path_fast5_individual_out)

### Create fastq file for interesting barcodes
dir.create(paste0(path_analysis_lib,"/fastq/"))
fastq_file <- readLines(paste0(path_analysis_lib,"/KTLib_1700_2100_Q15.fastq"))
n_fastq <- length(fastq_file)
for(i in seq(1,n_fastq, by=4)){
  ind <- grep(pattern = substr(fastq_file[i],2,37),x=barcodes_table_fourcounts$SeqID)
  if(length(ind)==0){next(i,"not found \n")}
  write(fastq_file[c(i,i+1,i+2,i+3)],
        file=paste0(path_analysis_lib,"/fastq/",barcodes_table_fourcounts$BCsequence[ind],".fastq"),
        append=T)
}


### Create fasta from fastq
fastq_files <- dir(paste0(path_analysis_lib,"/fastq/"), full.names = T, pattern="*.fastq")
fasta_files <- str_replace_all(fastq_files, "fastq", "fasta")
dir.create(paste0(path_analysis_lib,"/fasta/"))
commands <- paste0("seqtk seq -a ", fastq_files, " > ",fasta_files)
vapply(commands, system2)

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

Compute using nanopolish and medaka. first generate bash file
```{r LIBconsensus others, eval=FALSE}

# Load data
tic <- proc.time()
#outputs
bash_file <- "ConsensusByMethods.sh"
results_medaka     <- paste0(path_analysis_lib,"ConsensusbyMedaka.txt")
nanopolish_results <- paste0(path_analysis_lib,"Consensus_by_Nanopolish.txt")

for(bc in barcodes_unique){
  #all filenames
  filename <- bc
  fasta      <- paste0(path_analysis_lib,"fasta/",bc,".fasta")
  fastq      <- paste0(path_analysis_lib,"fastq/",bc,".fastq")
  sam        <- paste0(path_analysis_lib,bc,".sam")
  sorted     <- paste0(path_analysis_lib,bc,".sorted.fasta")
  racon      <- paste0(path_analysis_lib,bc,".racon.fasta")
  medaka     <- paste0(path_analysis_lib,bc,"_medaka.fasta")
  nanopolish <- paste0(path_analysis_lib,bc,"_nanopolish.txt")
      
  ## Run minimap2
  line_minimap <- paste0("minimap2 -ax map-ont ", reference_file, " ", fastq," > ", sam)
  
  ## Run racon
  line_racon <- paste0("racon ",fastq," ",sam, " ",reference_file, " >> ", racon)
  line_remove_racon <- paste0("rm ", racon,".fai ",racon, ".mmi ",racon  )
  
  ## Run Medaka
  line_medaka <- paste0("medaka_consensus -i ",fastq," -d ",racon,"  -o ", medaka," -t 4 ")
  line_results_medaka <- paste0("echo '>",filename, "' >> ",results_medaka)
  line_results_medaka2 <- paste0("tail -n 1 ",filename,"_medaka.fasta/consensus.fasta", " >> ",results_medaka)
  line_remove_medaka <- paste0("rm -r ",filename,"_medaka.fasta/")
  
  ## Run nanopolish
  line_nanopolish_index  <- paste0("nanopolish index -d ",bc,"/ ",fastq)
  line_nanopolish_sort    <- paste0("samtools sort ",sam," -o ",sorted," -T temporal.tmp ")
  line_nanopolish_index2 <- paste0("samtools index ", sorted)
  line_nanopolish_variants <- paste0("nanopolish variants --consensus -o ", nanopolish, " -r ",fastq," -b ",sorted," -g ", reference_file)
  line_results_nanopolish <- paste0("awk '!/^#/ {print \"",filename,"\\t\" $0}' ",nanopolish, " >> ",nanopolish_results)
  line_remove_nanopolish <- paste0("rm ", fastq,".index* ", filename,".sorted* ", nanopolish, " ", sam)

  cat(line_minimap,
      line_racon,
      line_medaka,
      line_results_medaka,
      line_results_medaka2,
      line_nanopolish_index,
      line_nanopolish_sort,
      line_nanopolish_index2,
      line_nanopolish_variants,
      line_results_nanopolish,
      line_remove_racon,
      line_remove_medaka,
      line_remove_nanopolish,
      file=bash_file, sep="\n", append=T)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

Then run it
```{bash LIBconsensus others bash, eval=F}
chmod +x ConsensusByMethods.sh
./ConsensusByMethods.sh
```


Align consensus sequences to reference
```{bash, eval=F}
minimap2 -ava-ont 1_ReferenceFiles/KTrefORF_1662.fasta ConsensusbyMedaka.fasta > ConsensusbyMedaka_aligned.sam
minimap2 -ava-ont 1_ReferenceFiles/KTrefORF_1662.fasta ConsensusbyRacon.fasta > ConsensusbyRacon_aligned.sam
```


Parse results into a table consensus_mismatches

```{r LIBconsensus parse, eval=F}
tic <- proc.time()
pos_initial <- 1
pos_final <- length(wildtype)

cigar_reference_counts   <- c("M","D","N","=","X")
cigar_query_counts <- c("M","I","S","=","X")

sam_data_medaka <- read.table(paste0(path_analysis_lib,"ConsensusbyMedaka_aligned.sam"), skip=2)
medaka_mismatches     <- data.frame(Barcode=NA,Position=NA,base.original=NA,base.mutated=NA)
for(i in 1:nrow(sam_data_medaka)){

  line_data       <- unlist(sam_data_medaka[i,])
  pos_ini_aliref  <-  as.numeric(line_data[4])
  cigar           <-  line_data[6]
  sequence        <-  strsplit(line_data[10],split="")[[1]]

  #Obtain CIGAR vector
  cigar.table      <- data.frame(n=as.numeric(strsplit(cigar, split="[MIDNSHP=X]")[[1]]),Op= strsplit(cigar, split="[0-9]+")[[1]][-1])
  cigar_vec        <- unlist(apply(cigar.table,1, function(x){rep(x[2],x[1])}))
  names(cigar_vec) <- NULL
  if(pos_ini_aliref>1){cigar_vec <- c(rep("D",pos_ini_aliref-1),cigar_vec)}
  
  #Data frame for the original sequence, and reference for the model (full)
  mismatches            <- data.frame(Barcode =rep(line_data[1], length(cigar_vec)),
                                      cigar=cigar_vec, 
                                      base.mutated=NA,pos_in_samalig=NA,Position=NA,base.original=NA)
  
  rows_with_query     <- mismatches$cigar%in%cigar_query_counts
  rows_with_reference <- mismatches$cigar%in%cigar_reference_counts
  
  mismatches$base.mutated[rows_with_query]  <- sequence     #Nucleotide
  mismatches$base.mutated[is.na(mismatches$base.mutated)] <- "-"

  mismatches$pos_in_samalig[rows_with_reference] <- pos_ini_aliref + c(0:(sum(rows_with_reference)-1))  #position - according to aligning reference
  
  mismatches$Position[rows_with_reference] <- 1:length(wildtype) #position - according to single reference
  mismatches$base.original[rows_with_reference]   <- wildtype

  index <- which(mismatches$base.mutated!=mismatches$base.original)
  
  mismatches <- mismatches[index,c("Barcode", "Position","base.original","base.mutated")]
  medaka_mismatches <- medaka_mismatches %>% rbind(mismatches) 

}
medaka_mismatches <- medaka_mismatches %>% filter(!is.na(Barcode))  %>% mutate(method="Medaka",mutation=paste0(base.original,Position,base.mutated))

sam_data_racon <- read.table("ConsensusbyRacon_aligned.sam", skip=2)
racon_mismatches     <- data.frame(Barcode=NA,Position=NA,base.original=NA,base.mutated=NA)
for(i in 1:nrow(sam_data_racon)){

  line_data       <-  unlist(sam_data_racon[i,])
  pos_ini_aliref  <-  as.numeric(line_data[4])
  cigar           <-  line_data[6]
  sequence        <-  strsplit(line_data[10],split="")[[1]]

  #Obtain CIGAR vector
  cigar.table      <- data.frame(n=as.numeric(strsplit(cigar, split="[MIDNSHP=X]")[[1]]),Op= strsplit(cigar, split="[0-9]+")[[1]][-1])
  cigar_vec        <- unlist(apply(cigar.table,1, function(x){rep(x[2],x[1])}))
  names(cigar_vec) <- NULL
  if(pos_ini_aliref>1){cigar_vec <- c(rep("D", pos_ini_aliref-1), cigar_vec)}
  
  #Data frame for the original sequence, and reference for the model (full)
  mismatches            <- data.frame(Barcode =rep(line_data[1], length(cigar_vec)),
                                      cigar=cigar_vec, 
                                      base.mutated=NA,pos_in_samalig=NA,Position=NA,base.original=NA)
  
  rows_with_query     <- mismatches$cigar%in%cigar_query_counts
  rows_with_reference <- mismatches$cigar%in%cigar_reference_counts
  
  mismatches$base.mutated[rows_with_query]  <- sequence     #Nucleotide
  mismatches$base.mutated[is.na(mismatches$base.mutated)] <- "-"
  
  mismatches$pos_in_samalig[rows_with_reference] <- pos_ini_aliref + c(0:(sum(rows_with_reference)-1))  #position - according to aligning reference
  
  mismatches$Position[rows_with_reference] <- 1:length(wildtype) #position - according to single reference
  mismatches$base.original[rows_with_reference]   <- wildtype

  index <- which(mismatches$base.mutated!=mismatches$base.original)
  
  mismatches <- mismatches[index,c("Barcode", "Position","base.original","base.mutated")]
  racon_mismatches <- racon_mismatches %>% rbind(mismatches) 

}
racon_mismatches <- racon_mismatches %>% filter(!is.na(Barcode)) %>% mutate(method="Racon", mutation=paste0(base.original,Position,base.mutated))

results_nanopolish <- readLines("ConsensusbyNanopolish.txt")
results_nanopolish <- do.call(rbind,vapply(results_nanopolish, strsplit, split="\t"))
colnames(results_nanopolish) <- rownames(results_nanopolish) <- NULL
results_nanopolish <- results_nanopolish[,c(1,3,5,6)]
colnames(results_nanopolish) <- c("Barcode","Position","base.original","base.mutated")
results_nanopolish <- as.data.frame(results_nanopolish)
results_nanopolish$Position <- as.numeric(results_nanopolish$Position)

results_nanopolish <- results_nanopolish %>% filter(!is.na(Barcode)) 

#remove insertions
ind_insertions <- nchar(results_nanopolish$base.mutated)>1
results_nanopolish$base.mutated[ind_insertions] <- vapply(results_nanopolish$base.mutated[ind_insertions] , substr,1,1)
results_nanopolish <- results_nanopolish %>% filter(base.mutated!=base.original)

#Split deletions
index_aux <- which(nchar(results_nanopolish$base.original)==2)
results_nanopolish$base.original[index_aux] <- substr(results_nanopolish$base.original[index_aux],2,2)
results_nanopolish$base.mutated[index_aux] <- "-"
results_nanopolish$Position[index_aux] <- as.numeric(results_nanopolish$Position[index_aux])+1

index_aux_3 <- which(nchar(results_nanopolish$base.original)==3)
aux_df_1 <- results_nanopolish[index_aux_3,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"

aux_df_2 <- results_nanopolish[index_aux_3,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"

results_nanopolish <- results_nanopolish[-index_aux_3,]
results_nanopolish <- results_nanopolish %>% rbind(aux_df_1) %>% rbind(aux_df_2)
rm(aux_df_1,aux_df_2)

index_aux_4 <- which(nchar(results_nanopolish$base.original)==4)
aux_df_1 <- results_nanopolish[index_aux_4,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"

aux_df_2 <- results_nanopolish[index_aux_4,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"

aux_df_3 <- results_nanopolish[index_aux_4,]
aux_df_3$Position <- as.numeric(aux_df_3$Position)+3
aux_df_3$base.original <- substr(aux_df_3$base.original, 4,4)
aux_df_3$base.mutated <- "-"

results_nanopolish <- results_nanopolish[-index_aux_3,]
results_nanopolish <- results_nanopolish %>% rbind(aux_df_1) %>% rbind(aux_df_2)%>% rbind(aux_df_3) %>% arrange(Barcode,Position) %>% mutate(method="Nanopolish", mutation=paste0(base.original,Position,base.mutated))


save(results_nanopolish, medaka_mismatches, racon_mismatches, file=paste0(path_consensus,"Mismatches_by_Method.Rdata"))


single_mismatches             <- read.table("ConsensusBySINGLE.txt", header=F,skip=1)
colnames(single_mismatches) <- colnames(single_mismatches_sortHPinVCS) <- c("Barcode","Nreads","mutation","method")

single_mismatches <- single_mismatches %>% 
  mutate(base.original = stringr::str_sub(mutation,1,1))     %>%
  mutate(base.mutated = stringr::str_sub(mutation,-1,-1))    %>%
  mutate(Position = stringr::str_sub(mutation,2,-2))         %>%
  select(Barcode,base.original,base.mutated,Position,method,mutation) 


KTLib_mutations <- rbind(single_mismatches,single_mismatches_sortHPinVCS, results_nanopolish, medaka_mismatches,racon_mismatches)
write.table(KTLib_mutations, file=file_consensus_KTLib)

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```


## Analyse convergence of 10 most frequent barcodes

Identify ten most frequent barcodes
```{r LIB10BCs, eval=F}
tic <- proc.time()
KTLib_topBC <- barcodes_4reads %>% 
    group_by(BCsequence) %>% 
    summarise(n=n()) %>%
    arrange(-n)%>%
    head(n=10)
KTLib_topBC_reads<- barcodes_4reads %>% 
    filter(BCsequence %in% KTLib_topBC$BCsequence) 
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```


Create a fastq file for each of them
```{r LIB10BCs create fastq, eval=F}
tic <- proc.time()
for(i in unique(KTLib_topBC$BCsequence)){
    reads_bc_i <- KTLib_topBC_reads %>% filter(BCsequence==i)
    sequences_bc_i <- seqs_single_ktlib [names(seqs_single_ktlib) %in% reads_bc_i$readID]
    writeQualityScaledXStringSet(sequences_bc_i, file=paste0(path_analysis_lib, i, ".fastq"))  
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

Compute consensus on subsets of reads by SINGLE
```{r LIB10BCs consensus single, eval=F}
tic <- proc.time()
#Load corrected data
data_single_ktlib_topten <- data_single_ktlib %>%
    filter(Name %in% KTLib_topBC_reads$Name ) 

# # Load data
# reads_corrected <- read.table("3_PreprocessedData/SINGLed_data/LargeSet_F_corrected.txt", header=T)
# reads_corrected <- reads_corrected %>% filter(SeqID %in% KTLib_BC_reads$SeqID )
# reads_corrected_rev <- read.table("3_PreprocessedData/SINGLed_data/LargeSet_R_corrected.txt", header=T)
# reads_corrected_rev <- reads_corrected_rev %>% filter(SeqID %in% KTLib_BC_reads$SeqID )
# reads_corrected_all     <- rbind(reads_corrected,reads_corrected_rev)
#                          
# reads_corrected <- left_join(reads_corrected_all, KTLib_topBC_reads %>% select(SeqID,BCsequence) , by="SeqID")
# reads_corrected <- reads_corrected %>% filter(!is.na(reads_corrected$BCsequence))
# reads_corrected$p_right_priors_model[is.na(reads_corrected$p_right_priors_model)]<-1


#Compute consensus for each barcode using all sequences available and using the three available methods
cat("Barcode\tNreads\trepetition\tmutation\tmethod\n", file=file_out_consensus_topten, append=F)

barcodes_unique <- unique(data_single_ktlib_topten$BCsequence)
n_bc <- length(barcodes_unique)

for(bc in barcodes_unique){
    cat(bc, "\n")
  bc_reads <- data_single_ktlib_topten %>%
    filter(BCsequence==bc)
  
  #compute consensus on subsets
  bc_seqsID <- unique(bc_reads$readID)
  for( s in subsets){
    cat(s, "\t")
    counter<- 0
    pb <- txtProgressBar(0,n_repetitions,style=3)
    for (r in 1:n_repetitions) {
      counter <- counter+1
      setTxtProgressBar(pb,counter)
  
      subset_bc_reads <- bc_reads %>% 
          filter(readID %in% sample(bc_seqsID, s))
      
    #sort homopolymers
    index_homopolymers <- which(subset_bc_reads$position %in% pos_wt_homopolymers
                                & subset_bc_reads$nucleotide=="-")
    n_memory <- c()
    for(n in index_homopolymers){
        if(n %in% n_memory){next()}
        hp_pos <- subset_bc_reads$position[n]
        hp_pos_ref <- wt_homopolymers[vapply(wt_homopolymers, function(x){hp_pos %in% x})][[1]]
        hp_vec_logical <- hp_pos_ref==hp_pos
        hp_vec_length <- length(hp_vec_logical)
        hp_vec <- rep(NA,hp_vec_length)
        ind_aux <- which(hp_vec_logical)
        
        hp_vec[1:ind_aux] <- rev(n+1-1:ind_aux)
        if(ind_aux<hp_vec_length){
          hp_vec[(ind_aux+1):hp_vec_length] <- n+c(1:(hp_vec_length-ind_aux))
        }
        n_memory <- hp_vec
        
        data_aux <- subset_bc_reads[hp_vec,]
        if(all(data_aux$nucleotide=="-")){next()}
        
        pos <- data_aux$position
        nucl <- data_aux$nucleotide
        ind_order <- c(which(nucl!="-"),which(nucl=="-"))
        
        subset_bc_reads$position[hp_vec] <- pos[order(ind_order)]
        
        if(length( which(subset_bc_reads$position %in% pos_wt_homopolymers & subset_bc_reads$nucleotide=="-"))<length(index_homopolymers)){
          cat("n=",n,"\n")
          stop()
        }
    }
    subset_bc_reads <- subset_bc_reads %>% arrange(readID,position)
    
    cons_single <- weighted_consensus(df = subset_bc_reads %>%
                                          select(nucleotide,p_SINGLe,position), 
                                   cutoff_prob = 0)
    cons_guppy  <- weighted_consensus(df = subset_bc_reads %>% 
                                          select(nucleotide,p_Guppy,position),
                                   cutoff_prob = 0)
    cons_noweight <- weighted_consensus(df = subset_bc_reads %>%
                                            select(nucleotide,p_noweights,position),
                                    cutoff_prob = 0)
    muts_single   <- detect_mutations(biostr_to_char(cons_single), wildtype)
    muts_guppy    <- detect_mutations(biostr_to_char(cons_guppy), wildtype)
    muts_noweight <- detect_mutations(biostr_to_char(cons_noweight), wildtype)

      n_single <- length(muts_single)
      n_guppy <- length(muts_guppy)
      n_noweight <- length(muts_noweight)
      n_tot <- n_single+n_guppy+n_noweight
      if(n_tot==0){next()}
      df <- data.frame(barcode = rep(bc,n_tot),nseqs=s,repetition=r,
                      mutation=c(muts_single,muts_guppy,muts_noweight),
                      method = c(rep("SINGLe", n_single),
                                 rep("Guppy",n_guppy), 
                                 rep("No weights",n_noweight)))
      write.table(df, append = T, col.names = F,row.names = F, file = file_out_consensus_topten)
    }
  }

}

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
```

Compute consensus on subsets of reads by Nanopolish and Medaka
```{r LIB10BCs consensus others, eval=F}
tic <- proc.time()
dir.create(paste0(path_analysis_lib,"/subsets/"))
file.copy(reference_file, paste0(path_analysis_lib,"/subsets/"))
## Make fastq files with subsets
for(bc in unique(KTLib_topBC_reads$BCsequence)){
  
  fastq_file <- readLines(paste0(path_analysis_lib, bc,".fastq"))
  name_lines <- grep(pattern="^@",x=fastq_file)

  for(ns in subsets){
    if(ns> length(name_lines)){next()}
    for(r in 1:n_repetitions){
      fastq_subset_out    <- paste0(path_analysis_lib,"subsets/",bc,"_",ns,"_",r,".fastq")

      ## Make file of subset of sequences
      choose_lines <- sample(name_lines, size= ns)
      choose_lines_all <- sort(c(choose_lines,choose_lines+1,choose_lines+2,choose_lines+3))
      writeLines(text = fastq_file[choose_lines_all], con = file(fastq_subset_out))
    }
  }
}

in_files <- 
    dir(pattern=".fastq", path=paste0(path_analysis_lib,"/subsets/"))

## .sh for minimap2
for(i in seq_along(in_files)){
  fastq_file <- in_files[i]
  sam_file   <- sub(pattern=".fastq", replacement = ".sam", x=fastq_file)
  line_minimap <- paste0("minimap2 -ax map-ont KTrefORF_1662.fasta ",fastq_file," > ", sam_file)
  cat(line_minimap,file=paste0(path_analysis_lib,"/subsets/Minimap_allBC.sh"), sep="\n", append=i!=1)
}

# .sh for racon
results_racon      <- "../ConsensusbyRacon_subsets.txt"
for(i in seq_along(in_files)){
  fastq_file <- in_files[i]
  sam_file   <- sub(pattern=".fastq", replacement = ".sam", x=fastq_file)
  racon      <- sub(pattern=".fastq", replacement = ".racon.fasta", x=fastq_file)
  filename     <- sub(pattern=".fastq", replacement = "", x=fastq_file)
  
  ## Run racon
  line_racon <- paste0("racon ",fastq_file," ",sam_file, " KTrefORF_1662.fasta >> ", racon)
  line_results_racon <- paste0("echo '>",filename, "' >> ",results_racon)
  line_results_racon2 <- paste0("tail -n 1 ",racon, " >> ",results_racon)

  cat(line_racon,line_results_racon,line_results_racon2,file=paste0(path_analysis_lib,"/subsets/Racon_allBC.sh"), sep="\n", append= i!=1) 
}

# .sh for Medaka
for(i in seq_along(in_files)){
  fastq_file <- in_files[i]
  racon      <- sub(pattern=".fastq", replacement = ".racon.fasta", x=fastq_file)
  filename   <- sub(pattern=".fastq", replacement = "", x=fastq_file)
  medaka     <- sub(pattern=".fastq", replacement = "_medaka.fasta", x=fastq_file)
  results_medaka <- paste0("Consensus_",filename,".fasta")

  ## Run Medaka
  line_medaka <- paste0("medaka_consensus -i ",fastq_file," -d ",racon,"  -o ", medaka," -t 4 ")
  line_results_medaka <- paste0("echo '>",filename, "' >> ",results_medaka)
  line_results_medaka2 <- paste0("tail -n 1 ",filename,"_medaka.fasta/consensus.fasta", " >> ",results_medaka)
  line_remove_medaka <- paste0("rm -r ",medaka)
  
  cat(line_medaka,line_results_medaka,line_results_medaka2,line_remove_medaka,file=paste0(path_analysis_lib,"/subsets/Medaka_",filename,".sh"), sep="\n", append=F) 
}

# .sh for nanopolish
for(i in seq_along(in_files)){
  fastq_file         <- in_files[i]
  sam_file           <- sub(pattern=".fastq", replacement = ".sam", x=fastq_file)
  nanopolish_file    <- sub(pattern=".fastq", replacement = "_nanopolish.txt", x=fastq_file)
  filename           <- sub(pattern=".fastq", replacement = "", x=fastq_file)
  nanopolish_results <- paste0(filename,"_ConsensusbyNanopolish_subsets.txt")
  bc <- strsplit(filename, split="_")[[1]][1]
  
  fast5_dir <- paste0("../fast5/individualfiles/",bc)

  ## lines nanopolish
  line_nanopolish_index  <- paste0("nanopolish index -d  ../fast5/ subsets/",fastq_file)
  line_nanpolish_sort    <- paste0("samtools sort subsets/",sam_file," -o ",filename,".sorted.fasta -T temporal.tmp ")
  line_nanopolish_index2 <- paste0("samtools index ", filename,".sorted.fasta")
  line_nanopolish_variants <- paste0("nanopolish variants --consensus -o ", nanopolish_file, " -r BCtopten/",fastq_file," -b ",filename,".sorted.fasta -g ", reference_file)
  line_results_nanopolish <- paste0("awk '!/^#/ {print \"",filename,"\\t\" $0}' ",nanopolish_file, " >> ",nanopolish_results)
  line_remove_nanopolish <- paste0("rm ", filename,"_nanopolish.txt ", filename,".sorted* ")

  cat("cd ../ \n", 
      line_nanopolish_index,
      line_nanpolish_sort,
      line_nanopolish_index2,
      line_nanopolish_variants,
      line_results_nanopolish,
      line_remove_nanopolish,
      file=paste0("subsets/Nanopolish_",filename,".sh"), sep="\n", append=F)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

```{bash, eval=F}
cd 3_Analysis/KTLib/subsets
chmod +x Minimap_allBC.sh
./Minimap_allBC.sh

chmod +x Racon_allBC.sh
./Racon_allBC.sh

chmod +x Medaka*
.Medaka*  

chmod +x Nanopolish*
./Nanopolish*  

cat *_ConsensusbyRacon.txt > ConsensusbyRacon_subsets.txt
cat *_ConsensusbyMedaka.txt > ConsensusbyMedaka_subsets.txt
cat *_ConsensusbyNanopolish.txt > ConsensusbyNanopolish_subsets.txt

minimap2 -ava-ont ../../1_ReferenceFiles/KTrefORF_1662.fasta ConsensusbyRacon_subsets.fasta > ConsensusbyRacon_subsets_aligned.sam
minimap2 -ava-ont ../../1_ReferenceFiles/KTrefORF_1662.fasta ConsensusbyMedaka_subsets.fasta > ConsensusbyMedaka_subsets_aligned.sam
```

Identify mutations and save table
```{r}
file_consensus_top10_inLIB <- paste0(path_analysis_lib,"Convergence_10top_mismatches.txt" )

```

```{r LIB10BCs consensus to muts, eval=FALSE}
tic <- proc.time()
pos_initial <- 1
pos_final <- length(wildtype)

cigar_reference_counts   <- c("M","D","N","=","X")
cigar_query_counts <- c("M","I","S","=","X")

sam_data_medaka <- read.table("subsets/ConsensusbyMedaka_subsets_aligned.sam", skip=2)
medaka_mismatches_subsets     <- data.frame(Barcode=NA,Position=NA,base.original=NA,base.mutated=NA)
for(i in 1:nrow(sam_data_medaka)){
  line_data       <- unlist(sam_data_medaka[i,])
  pos_ini_aliref  <-  as.numeric(line_data[4])
  cigar           <-  line_data[6]
  sequence        <-  strsplit(line_data[10],split="")[[1]]

  #Obtain CIGAR vector
  cigar.table      <- data.frame(n=as.numeric(strsplit(cigar, split="[MIDNSHP=X]")[[1]]),Op= strsplit(cigar, split="[0-9]+")[[1]][-1])
  cigar_vec        <- unlist(apply(cigar.table,1, function(x){rep(x[2],x[1])}))
  names(cigar_vec) <- NULL
  if(pos_ini_aliref>1){cigar_vec <- c(rep("D",pos_ini_aliref-1),cigar_vec)}
  
  #Data frame for the original sequence, and reference for the model (full)
  mismatches            <- data.frame(Barcode =rep(line_data[1], length(cigar_vec)),
                                      cigar=cigar_vec, 
                                      base.mutated=NA,pos_in_samalig=NA,Position=NA,base.original=NA)
  
  rows_with_query     <- mismatches$cigar%in%cigar_query_counts
  rows_with_reference <- mismatches$cigar%in%cigar_reference_counts
  
  mismatches$base.mutated[rows_with_query]  <- sequence     #Nucleotide
  mismatches$base.mutated[is.na(mismatches$base.mutated)] <- "-"

  mismatches$pos_in_samalig[rows_with_reference] <- pos_ini_aliref + c(0:(sum(rows_with_reference)-1))  #position - according to aligning reference
  
  mismatches$Position[rows_with_reference] <- 1:length(wildtype) #position - according to single reference
  mismatches$base.original[rows_with_reference]   <- wildtype

  index <- which(mismatches$base.mutated!=mismatches$base.original)
  
  mismatches <- mismatches[index,c("Barcode", "Position","base.original","base.mutated")]
  medaka_mismatches_subsets <- medaka_mismatches_subsets %>% rbind(mismatches) 
}

medaka_mismatches_subsets <- medaka_mismatches_subsets %>% filter(!is.na(Barcode))
medaka_aux <- do.call(rbind,vapply(medaka_mismatches_subsets[,1],strsplit,split="_"))
medaka_mismatches_subsets <- medaka_mismatches_subsets %>% mutate(Barcode = medaka_aux[,1], Nreads=as.numeric(medaka_aux[,2]), repetition=as.numeric(medaka_aux[,3]))
medaka_mismatches_subsets <- medaka_mismatches_subsets  %>% mutate(Method="Medaka", homopolymers=NA, mutation=paste0(base.original,Position,base.mutated)) %>% dplyr::rename(repetition=repetitions) 

nanopolish_mismatches_subsets = data.frame(barcode=NA,nseqs=NA,repetition=NA,position=NA,base.original=NA,base.mutated=NA)
results_nanopolish <- system2(paste0("awk \'/^[^#]/ \' BCtopten/nanopolish_consensus/*"), intern = T)
results_nanopolish <- do.call(rbind,vapply(results_nanopolish, strsplit, split="\t"))
results_nanopolish <- results_nanopolish[,c(1,3,5,6)]
nanopolish_aux <- do.call(rbind,vapply(results_nanopolish[,1],strsplit,split="_"))
results_nanopolish <- cbind(nanopolish_aux,results_nanopolish[,-1])
colnames(results_nanopolish) <- c("Barcode","Nreads","repetition", "Position","base.original","base.mutated" )
rownames(results_nanopolish) <- NULL
nanopolish_mismatches_subsets <- data.frame(Barcode = results_nanopolish[,1], 
                                 Nreads=as.numeric(results_nanopolish[,2]),
                                 repetition = as.numeric(results_nanopolish[,3]),
                                 Position = as.numeric(results_nanopolish[,4]), 
                                 base.original=results_nanopolish[,5],
                                 base.mutated = results_nanopolish[,6])

#remove insertions
ind_insertions <- nchar(nanopolish_mismatches_subsets$base.mutated)>1
nanopolish_mismatches_subsets$base.mutated[ind_insertions] <- vapply(nanopolish_mismatches_subsets$base.mutated[ind_insertions] , substr,1,1)
nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets %>% filter(base.mutated!=base.original)

#Split deletions
index_aux <- which(nchar(nanopolish_mismatches_subsets$base.original)==2)
nanopolish_mismatches_subsets$base.original[index_aux] <- substr(nanopolish_mismatches_subsets$base.original[index_aux],2,2)
nanopolish_mismatches_subsets$base.mutated[index_aux] <- "-"
nanopolish_mismatches_subsets$Position[index_aux] <- as.numeric(nanopolish_mismatches_subsets$Position[index_aux])+1

index_aux_3 <- which(nchar(nanopolish_mismatches_subsets$base.original)==3)
aux_df_1 <- nanopolish_mismatches_subsets[index_aux_3,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"

aux_df_2 <- nanopolish_mismatches_subsets[index_aux_3,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"

nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets[-index_aux_3,]
nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets %>% rbind(aux_df_1) %>% rbind(aux_df_2)
rm(aux_df_1,aux_df_2)

index_aux_4 <- which(nchar(nanopolish_mismatches_subsets$base.original)==4)
aux_df_1 <- nanopolish_mismatches_subsets[index_aux_4,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"

aux_df_2 <- nanopolish_mismatches_subsets[index_aux_4,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"

aux_df_3 <- nanopolish_mismatches_subsets[index_aux_4,]
aux_df_3$Position <- as.numeric(aux_df_3$Position)+3
aux_df_3$base.original <- substr(aux_df_3$base.original, 4,4)
aux_df_3$base.mutated <- "-"

nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets[-index_aux_3,]
nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets %>% rbind(aux_df_1) %>% rbind(aux_df_2)%>% rbind(aux_df_3) %>% arrange(Barcode,Nreads,repetition,Position) %>% mutate(Method="Nanopolish",homopolymers=NA, mutation=paste0(base.original,Position,base.mutated))


single_mismatches <- read.table(paste0(path_analysis_lib,"ConsensusSINGLE_toptenBC.txt"), header=F,row.names=NULL, skip=1)
colnames(single_mismatches)<-c("Barcode","Nreads","repetition","mutation","Method")
single_mismatches <- single_mismatches %>%
  mutate(base.original = str_sub(mutation,1,1), 
         base.mutated=str_sub(mutation,-1,-1),
         Position=str_sub(mutation,start=2,end=-2)) %>%
  mutate(Position = as.numeric(Position), homopolymers="original") #%>% select(-mutation)


write.table(rbind(nanopolish_mismatches_subsets %>% mutate(Method="Nanopolish"),
                  medaka_mismatches_subsets %>% mutate(Method="Medaka"),
                  single_mismatches, file=file_consensus_top10_inLIB))
            
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

Compute true mutants from all reads according to each method
```{r, eval=FALSE}
tic <- proc.time()

KTLib_topBC_true_mutations <- read.table("3_PreprocessedData/Fig3_LargeSet_Consensus.txt", header=T) %>%
    dplyr::rename(Method=method) %>%
    mutate(Method = recode(Method,!!!methods_capital))%>% 
    filter(Barcode %in% KTLib_topBC$BCsequence) %>%
    filter(Method!="Racon") %>%
    filter((Method=="SINGLe" & homopolymers=="sorted_in_VCS") |
               is.na(homopolymers) | 
               (Method!="SINGLe" & homopolymers=="original") )%>%
    mutate(mutation=paste0(base.original,Position,base.mutated)) %>%
    select(Barcode,Method,mutation)
rownames(KTLib_topBC_true_mutations) <- NULL

#manually replace equivalent mutations
KTLib_topBC_true_mutations <- KTLib_topBC_true_mutations %>%
    mutate(mutation=ifelse(mutation=="G677-","G676-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G419-","G416-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G63-","G62-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G429-","G428-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="A1545-","A1543-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G244-","G243-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G263-","G261-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G677-","G676-", mutation))

aux_nanopolish <- data.frame(Barcode= KTLib_topBC_true_mutations$Barcode[KTLib_topBC_true_mutations$mutation=="GGAC1306G"], 
                             Method="Nanopolish", 
                             mutation=c("G1307-","A1308-", "C1309-"))
    
KTLib_topBC_true_mutations <- KTLib_topBC_true_mutations %>% 
    filter(mutation!="GGAC1306G") %>%
    rbind(aux_nanopolish)

compare_mutations <- reshape2::dcast(KTLib_topBC_true_mutations %>%
                                         select(Barcode, Method, mutation),
                                     Barcode+mutation~Method)

KTLib_topBC_true_mutants <- KTLib_topBC_true_mutations %>%
    group_by(Barcode,Method) %>% 
    summarise(n_mutations=n(),mutant = paste0(mutation, collapse = " ")) %>%
    ungroup()

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

Compute success rate for consensus calling by method, as a function of reads used
```{r LIB10BCs success rate, eval=FALSE}
tic <- proc.time()
# consensus_by_method
KTLib_topBC_subsets_mutations <- read.table("3_PreprocessedData/Fig3_ConvergenceConsensus_top10BC.txt", header=T) %>%
    filter(Method!="racon") %>%
    filter((Method=="single" & homopolymers=="sorted_for_VCS") |
               is.na(homopolymers) | 
               (Method!="single" & homopolymers=="original") )  %>%
    select(-homopolymers,-variant) %>%
    select(-base.original,-Position,-base.mutated)%>%
    mutate(Method = recode(Method,!!!methods_capital))

KTLib_topBC_subsets_mutations <- KTLib_topBC_subsets_mutations %>%
    mutate(mutation=ifelse(mutation=="G677-","G676-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G419-","G416-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G63-","G62-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G429-","G428-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="A1545-","A1543-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G244-","G243-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G263-","G261-", mutation)) %>%
    mutate(mutation=ifelse(mutation=="G677-","G676-", mutation))

indx <- which(KTLib_topBC_subsets_mutations$mutation=="GGAC1306G")
aux_nanopolish <- data.frame(Barcode= rep(KTLib_topBC_subsets_mutations$Barcode[indx],3),
                            Nreads = rep(KTLib_topBC_subsets_mutations$Nreads[indx], each=3),
                            repetition = rep(KTLib_topBC_subsets_mutations$repetition[indx], each=3),
                             Method="Nanopolish",
                             mutation=rep(c("G1307-","A1308-", "C1309-"), lenght.out=length(indx)))
    
KTLib_topBC_subsets_mutations <- KTLib_topBC_subsets_mutations %>% 
    filter(mutation!="GGAC1306G") %>%
    rbind(aux_nanopolish)

KTLib_topBC_subsets_mutant <- KTLib_topBC_subsets_mutations %>%
    select(Barcode,Nreads,repetition,Method,mutation) %>%
    group_by(Barcode,Nreads,repetition,Method) %>%
    summarise(n_mutations=n(),mutant=paste(mutation, collapse = " "))

KTLib_topBC_subsets_success_rate <- left_join(KTLib_topBC_subsets_mutant, 
                                         KTLib_topBC_true_mutants,
                                         by=c("Barcode","Method"),
                                         suffix=c(".subset",".true")) %>%
    mutate(equal = mutant.subset==mutant.true ) %>% 
    filter(!is.na(n_mutations.true)) %>%
    group_by(Barcode,Nreads,Method) %>% 
    dplyr::summarise(success_rate= sum(equal)/n()) 

ggplot(KTLib_topBC_subsets_success_rate,aes(x=Nreads,y=success_rate,col=Method) )+geom_point()+geom_line()+
  facet_wrap(~Barcode,ncol=2)+scale_color_manual(values=colors_vector_capital[-6])+
  ylab("Success rate")  

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```



### Figure 3A
```{r Fig3A, fig.width=3, fig.height=3, eval=FALSE}
fig_3a <- ggplot(KTLib_topBC_subsets_success_rate %>% 
                    filter((Method =="SINGLe"|Method=="Medaka") & Barcode=="TCAGTTATTACGTCCTGCGGACAGTAAGGTCCGAG"),
                aes(x=Nreads,y=success_rate,col=Method) )+
    geom_point()+
    geom_line()+
    scale_color_manual(values=colors_vector_capital[1:2])+
    scale_y_continuous(limits=c(0,1),breaks = c(0,.25,.5,.75,1))+
    geom_hline(aes(yintercept=0.9), col=grey(0.5),lty="dashed")+
    theme_plot+
  theme(legend.position = c(.9,0.4),
        legend.justification = c("right","top"),
        legend.title = element_blank(),
        legend.spacing.y = unit(0,"line"))+
    labs(tag = "A",x="Reads", y="Success rate")

fig_3a
```

### Figure S7: top 10 bc convergence

```{r FigS7, fig.width=7, fig.height=10, eval=F}
Fig_Sup7<- ggplot(KTLib_topBC_subsets_success_rate %>% filter(Method!= "Racon"),
                  aes(x=Nreads,y=success_rate,col=Method) )+
    geom_point()+geom_line()+
  facet_wrap(~Barcode,ncol=2)+scale_color_manual(values=colors_vector_capital[-6])+
  scale_y_continuous(breaks=c(0,.5,1))+
  geom_hline(aes(yintercept=0.9), col=grey(.5),linetype="dashed")+
  ylab("Success rate")  +theme_plot+
  theme(plot.margin = unit(c(0,5.5,5.5,5.5),"points"),
        strip.background = element_rect(fill= "white"),
        strip.text = element_text(size=8), 
        legend.position = "bottom")

Fig_Sup7
ggsave(Fig_Sup7, file=paste0(path_save_figures,"FigureS7.png"), 
       dpi = 'print', width = 17, height=18,units="cm")

```


### Load consensus and count reads per barcode
```{r, eval=F}
tic <- proc.time()
KTLib_mutations <- read.table("3_PreprocessedData/Fig3_LargeSet_Consensus.txt") %>% 
    filter(method!="Racon") %>% 
    filter((method=="single" & homopolymers=="sorted_in_VCS") |
               is.na(homopolymers) | 
               (method!="single" & homopolymers=="original") )

aux_nanopolish <- KTLib_mutations %>%
        filter(nchar(base.original)>1) %>%
    mutate(base.mutated="-")
aux_nanopolish <- (aux_nanopolish %>% mutate(base.original=str_sub(base.original, 2,2)) ) %>%
    rbind(aux_nanopolish%>% mutate(base.original=str_sub(base.original, 3,3))) %>% 
    rbind(aux_nanopolish%>% mutate(base.original=str_sub(base.original, 4,4)))

KTLib_mutations <- KTLib_mutations %>%
    filter(!nchar(base.original)>1) %>%
    rbind(aux_nanopolish)



KTLib_nreads_per_barcode <- barcodes_4reads %>% 
    group_by(BCsequence) %>% dplyr::summarise(Nreads=n()) %>% 
    mutate(Barcode = BCsequence) %>% select(-BCsequence) %>% select(Barcode, Nreads) %>% 
    arrange(Barcode)

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

### Compute the actual number of mutations in the library by SINGLe

```{r, fig.width=3, fig.height=3, eval=F}
tic <- proc.time()
n_mutations <- KTLib_mutations %>%
    filter(method=="single") %>%
    group_by(Barcode) %>%
    dplyr::summarise(n_mutations = n()) %>% 
    left_join(KTLib_nreads_per_barcode) %>% filter(Nreads>5)

plot_nmutations <- ggplot(n_mutations, aes(x=n_mutations))+
    geom_histogram()+
    theme_plot+
    labs(x="Number of mutations", y="Counts")
mean_n_mutations <- mean(n_mutations$n_mutations)
median_n_mutations <- median(n_mutations$n_mutations)
sd_n_mutations <- sd(n_mutations$n_mutations)

mutation_rate_measured <- mean_n_mutations/1662*1000
 sd_n_mutations/1662*1000
 
 
plot_nmutations
cat("Mean N mutations: ", round(mean_n_mutations,2), 
 "\nMedian N mutations: ", round(median_n_mutations,2),
 "\nSD N mutations: ", round(sd_n_mutations,2), 
 "\nMutation rate measured: ", round(mutation_rate_measured,2), "muts/kb")

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

## Compare number of mutations by SINGLe and Medaka
```{r, eval=F}
tic <- proc.time()
consensus_mismatches_ms <- KTLib_mutations %>% 
    filter((method=="Medaka" )| (method=="single")) 

nmismatches_ms <- consensus_mismatches_ms %>%
  group_by(Barcode, method) %>%
  summarise(n_mismatches=n())



nmismatches_ms_difference <- left_join(nmismatches_ms %>% filter(method=="Medaka" ), 
                                       nmismatches_ms %>% filter(method=="single" ), 
                                       by="Barcode", suffix=c("",".single")) %>%
  mutate(difference = n_mismatches.single-n_mismatches) %>%
  left_join(KTLib_nreads_per_barcode, by = "Barcode") %>% 
  ungroup() %>% 
  mutate(Reads =  ifelse(Nreads<=5,"5 or less reads", ifelse(Nreads<=10,"6 to 10 reads","10 or more reads"))) %>%
  mutate(Reads=factor(Reads, levels=c("5 or less reads","6 to 10 reads","10 or more reads")))%>%
  select(-Nreads) %>%  ungroup()  %>% filter(!is.na(n_mismatches.single))


nmismatches_ms_difference_hist <- nmismatches_ms_difference %>% 
    group_by(method,difference,Reads) %>% 
    summarise(counts=n()) 

nmismatches_ms_difference_hist <- nmismatches_ms_difference %>% 
    ungroup()%>%
    mutate(difference = ifelse(difference< -6, -6, difference))%>%
    group_by(method,difference,Reads) %>% 
    summarise(counts=n()) %>%
     group_by(Reads) %>% mutate(perc = counts/sum(counts)*100) 

table_mutations_compare <- nmismatches_ms_difference_hist %>% 
    mutate(diff = ifelse(difference<0,-1,ifelse(difference==0,0,1)))%>% 
    group_by(Reads, diff) %>% summarise(n=sum(counts)) %>%
    group_by(Reads) %>% 
    mutate(perc=n/sum(n)*100)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

### Figure 3B
```{r Fig 3B, fig.width=5, fig.height=5, eval=F}
fig_3b <-  ggplot(table_mutations_compare , aes(x=diff,y=perc))+
    geom_bar(position = position_dodge2(preserve="single"),stat="identity")+
    xlab("Method that reports more mutations")+ylab("Percentage")+
    facet_wrap(~Reads, ncol=1, strip.position = "right", labeller = label_wrap_gen(width=12))+
    scale_y_continuous(limits = c(0,110), breaks=c(50,100))+
    scale_x_continuous(breaks=seq(-1,1,by=1), labels=c("Medaka", "Both match", "SINGLe"))+
    theme(legend.position = c(0,1),
            legend.justification = c("left","top"),
            legend.title = element_text(size = rel(1)),
            legend.text = element_text(size=rel(1)),
            legend.spacing.y = unit(0.5,"line"),
            legend.key.size = unit(2,"pt"),
            plot.margin = unit(c(0,5.5,5.5,5.5),"points"), 
          strip.background = element_rect(fill=grey(.95)))+
    labs(tag = "B")+
    geom_text(aes(y=perc,label=paste0(round(perc),"%"),vjust=-.2))

fig_3b
```


## Distribution of mutations and skewness 
```{r skewness, eval=F}
tic <- proc.time()
results_for_entropy<- KTLib_mutations %>%  
  filter(base.original!="wildtype" ) %>%
  filter(Position >0 & Position <=1662) %>%
  left_join(KTLib_nreads_per_barcode, by="Barcode") %>%
  mutate(group_Nreads_barcode = ifelse(Nreads>10, "> 10 reads", 
                                      ifelse(Nreads>5,"6 to 10 reads","5 or less reads"))) %>%
  mutate(group_Nreads_barcode=factor(group_Nreads_barcode, levels=c("5 or less reads","6 to 10 reads", "> 10 reads")))%>%
  group_by(method,Position,group_Nreads_barcode, homopolymers) %>%
  dplyr::summarise(n=n())%>% ungroup() %>%
    mutate(Method = recode(method,!!!setNames(c("Nanopolish","Medaka","SINGLe","Guppy","No weights"),c("nanopolish","medaka","single","nanopore","no_weights")))) %>%
    select(-method)


skewness <- results_for_entropy%>%
  group_by(Method, group_Nreads_barcode) %>%
  dplyr::summarise(skewness=skewness(n))

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

### Figure 3C
```{r Fig3C, fig.width=5, fig.height=5, eval=F}
fig_3c <- ggplot(results_for_entropy %>% filter(Method=="Medaka" | Method == "SINGLe"),
                 aes(x=Position, y=n, col=Method))+
    geom_point(size=0.5)+
    scale_color_manual(values=colors_vector_capital)+
    ylab("Counts")+
    facet_grid(Method~group_Nreads_barcode)+
    geom_text(
        data    = skewness  %>% filter(Method=="Medaka" | Method == "SINGLe"),
        mapping = aes(x = -Inf, y = -Inf, label = paste0("sk=", round(skewness,2))),
        hjust   = 0,
        vjust   = 1, 
        y=rep(140,6), x=rep(80,6), 
        col="black"
    )+
    theme_plot+
    theme(legend.position = "none",plot.margin = unit(c(1,5.5,5.5,5.5),"points"),)+
    labs(tag = "C")

fig_3c
```

### Figure S8

```{r FigS8, eval=F}
Figure_S8 <- ggplot(results_for_entropy ,
                 aes(x=Position, y=n, col=Method))+
  geom_point(size=0.5)+scale_color_manual(values=colors_vector_capital[-6])+geom_line()+
  ylab("Counts")+
  facet_grid(Method~group_Nreads_barcode)+
   geom_text(
  data    = skewness,
  mapping = aes(x = -Inf, y = -Inf, label = paste0("sk=", round(skewness,2))),
  hjust   = 0,
  vjust   = 1, 
  y=rep(200,nrow(skewness)), x=rep(80,nrow(skewness)), 
  col="black"
)+theme_plot+  theme(legend.position = "none",plot.margin = unit(c(0,5.5,5.5,5.5),"points"),strip.background = element_rect(fill="white"),strip.text = element_text(size=8))

Figure_S8
ggsave(Figure_S8, filename="6_Figures/FigureS8.png",dpi = 'print', width = 17, units = "cm", height=20)

```

## Figure 3

```{r Fig3, eval=F}
Figure3 <- ( fig_3a | fig_3b) / fig_3c 
Figure3
ggsave(Figure3, file=paste0(path_save_figures,"Figure3.png"),
       dpi = 'print', width = 16, height=16,units="cm")

```






## How is the mutation rate predicted compared to the one reported by Agilent?
### Figure S9
```{r FigS9, eval=F}
kit_rates <- data.frame(
    Mutation=c("Deletion", "Ts AG TC" ,"Ts GA CT"  ,"Tv AT TA","Tv AC TG" ,"Tv GC CG", "Tv GT CA"),
    true_perc=c(4.8,17.5,25.5,28.5,4.7,4.1,14.1))
aux <- KTLib_mutations %>% 
    left_join(KTLib_nreads_per_barcode,by="Barcode") %>%
    mutate(reads_number = case_when(
        Nreads<6 ~ "4 or 5 reads", 
        Nreads <11 ~ "6 to 10 reads", 
        Nreads>10 ~ "more than 10 reads") )%>%
    mutate(reads_number= factor(reads_number, levels=c("4 or 5 reads", "6 to 10 reads", "more than 10 reads")))%>%
    mutate(Mutation=case_when(
        base.mutated=="-"~ "Deletion",
        base.original=="A" & base.mutated=="G" ~ "Ts AG TC",
        base.original=="T" & base.mutated=="C" ~ "Ts AG TC",
        base.original=="G" & base.mutated=="A" ~ "Ts GA CT",
        base.original=="C" & base.mutated=="T" ~ "Ts GA CT",
        base.original=="A" & base.mutated=="T" ~ "Tv AT TA",
        base.original=="T" & base.mutated=="A" ~ "Tv AT TA",
        base.original=="A" & base.mutated=="C" ~ "Tv AC TG",
        base.original=="T" & base.mutated=="G" ~ "Tv AC TG",
        base.original=="G" & base.mutated=="C" ~ "Tv GC CG",
        base.original=="C" & base.mutated=="G" ~ "Tv GC CG",
        base.original=="G" & base.mutated=="T" ~ "Tv GT CA",
        base.original=="C" & base.mutated=="A" ~ "Tv GT CA"
    )) %>%
    group_by(method,reads_number,Mutation)%>%
    summarise(counts = n()) %>%
    group_by(method,reads_number)  %>%
    mutate(counts=counts/sum(counts)*100) %>%
    left_join(kit_rates, by="Mutation")%>%
    ungroup()%>%
    mutate(Method = recode(method,!!!setNames(c("Nanopolish","Medaka","SINGLe","Guppy","No weights"),c("nanopolish","medaka","single","nanopore","no_weights")))) %>%
    select(-method)

plot_mr_xy <- ggplot(aux, aes(col=Method, x=true_perc, y=counts, shape=Mutation,label=Mutation))+
    geom_point()+
    geom_abline(aes(slope=1,intercept=0), lty="dashed", col=grey(.5))+
    facet_wrap(~reads_number,ncol=1)+xlim(c(0,30))+scale_shape_manual(values=1:7)+theme_plot+
    xlab("By manufacturer (%)")+
    ylab("Observed (%)")+theme_plot+ggtitle("Mutation rate")


plot_mr_cor <- ggplot(aux %>% group_by(Method, reads_number) %>% 
           summarise(cor=cor(counts, true_perc)) %>% 
           ungroup(), aes(x=reads_number, y=cor, col=Method))+
    geom_point()+
    geom_line(aes(x=as.numeric(reads_number)))+
    ylab("Correlation")+xlab("")+
    # ggtitle("Compare observed mutation rate vs. manufacturer's report")+
    scale_y_continuous(breaks=c(0.8,0.9,1), limits = c(0.8,1))+theme_plot+
    theme(axis.text.x = element_text(angle=90,hjust = 1,vjust=0.5))


FigureS9 <- grid.arrange(plot_mr_xy , plot_mr_cor+theme(legend.position = "none"), ncol=2) 
             
ggsave(FigureS9, file=paste0(path_save_figures,"FigureSMutRate.png"), 
       dpi = 'print', width = 16, height=12,units="cm")

```



# How many reads needed to fit?

```{r , eval=F}
tic <- proc.time()
a <- load("/media/rocio/Data10TB/Work/Bibliography/RondelezLab/2022_SINGLe/GigaScience_submission/Espada_et_al/5_Revision/01_Subsets/01_KlenTaq/barcode01_allseqs_countspnq.Rdata")
full_fits <- read.table("/media/rocio/Data10TB/Work/Bibliography/RondelezLab/2022_SINGLe/GigaScience_submission/Espada_et_al/5_Revision/01_Subsets/01_KlenTaq/barcode01_allreads_fits.txt", header=T)

full_fits <- full_fits %>%
    mutate(rqs = 1-deviance/null_deviance)

error_rate <- data_mut %>%
    group_by(pos,strand,nucleotide) %>%
    dplyr::summarise(error_rate = sum(count)/(sum(count)+sum(counts.wt)),
                     counts_mut = sum(count))

full_fits<- full_fits %>%
    left_join(error_rate, by=c("strand","pos","nucleotide"))

## Full fits mutated positions
mutationstested <- kt_true_mutations %>% 
    select(position, nucleotide) %>%
    dplyr::rename(pos=position)
full_fits_mut <- left_join(mutationstested, full_fits)
hex <- scales::hue_pal()(4)
names(hex)<- c(303,331,879,1316)

full_fits_mut_examples <- full_fits_mut %>%
    filter( (pos==1316 & nucleotide=="A" & strand=="+")|
                (pos==879  & nucleotide=="T" & strand=="-")|
                (pos==303  & nucleotide=="A" & strand=="-")|
                (pos==331  & nucleotide=="T" & strand=="+"))

plot_counts_RSQ <- ggplot(full_fits,aes(x=rqs,y=counts_mut))+
    geom_point(col=rgb(0,0,0,0.1), stroke=0, size=2)+
    scale_y_log10()+
    theme_plot+
    geom_point(data=full_fits_mut_examples,
               aes(x=rqs,y=counts_mut, col=as.factor(pos)),
               size=2,shape="square")+
    scale_color_manual(values=hex)+
    labs(x="Quality (Q)", y="Counts(C)")+theme(legend.position = "none")

xlim <- 1:50
plots <- list()
for(i in 1:nrow(full_fits_mut_examples)){
    nuc <- full_fits_mut_examples$nuc[i]
    pot <- full_fits_mut_examples$pos[i]
    str  <- full_fits_mut_examples$strand[i]
    aux_df <- data_mut %>%
        dplyr::filter(pos==pot & nucleotide ==nuc & strand==str)%>%
        dplyr::select(strand,QUAL,count, counts.wt,counts.scaled,counts.wt.scaled) %>%
        dplyr::mutate(tot.scaled=counts.scaled+counts.wt.scaled) %>%
        dplyr::mutate(proportion.wt.scaled=counts.wt.scaled/tot.scaled) %>%
        arrange(QUAL)
    qval  <- aux_df$QUAL
    yvals <- aux_df$proportion.wt.scaled
    no <- !is.na(yvals)
    yvals <-yvals[no]
    qval <- qval[no]

    fitt  <- stats::glm(yvals~ qval,family = "binomial")

    prediction <- data.frame(QUAL=xlim, fitted = predict(fitt,list(qval=xlim), type = "response"))
    # lines(qval,predict(fitt,list(qval=qval),type="response"), col=2, lwd=2)

    qq<- format((full_fits_mut_examples %>%
                     filter(pos==pot & nucleotide ==nuc & strand==str))$rqs,scientific=F, digits=2)
    cc<- (full_fits_mut_examples %>%
              filter(pos==pot & nucleotide ==nuc & strand==str))$counts

    plots[[i]] <- ggplot(aux_df, aes(x=QUAL,y=proportion.wt.scaled))+
        geom_point()+
   
        geom_line(data=prediction, aes(x=QUAL, y=fitted),
                  col=hex[as.character(pot)])+
        ggtitle(paste(pot,nuc,str),
                subtitle=paste("C:", cc, " QF:",qq))+
        labs(x="Qscore", y="ncorrect")+
        xlim(range(xlim))+
        scale_y_continuous(breaks=c(0,1), limits = c(0,1))+
        theme(plot.title = element_text(margin = margin(0,0,0,0)),
              plot.subtitle =  element_text(margin = margin(0,0,0,0)))

}


toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

### Figure S10
```{r FigS10, eval=F}
FigureS10 <- grid.arrange(plot_counts_RSQ , plots[[1]], plots[[4]], plots[[3]], plots[[2]] ,
                       layout_matrix=matrix(c(1,1,2,3,4,5), byrow = F,nrow=2),
                       widths=c(2.5,1,1))
ggsave(FigureS10, file=paste0(path_save_figures,"FigureExtra.png"), dpi = 'print',
       height = 8.7, width=20,units="cm")

FigureS10
```

# Correlation neighbouring Qscore
```{r Qscore correlation, eval=F}
tic <- proc.time()
path_11 <- "/media/rocio/Data10TB/minIONreads/KlenTaq/2019_11_KT/KT7_fullpipeline/KT7/BasecallSuperior/pass/out/Demultiplex/minimap2/SINGLE/"

data_11 <- read.table(paste0(path_11, "barcode05.for_corrected.txt"), header=T)
data_11 <- data_11
error_lines <- which(!data_11$isWT&
                         !(data_11$position==867 & data_11$nucleotide=="G")&
                         !(data_11$position==1120 & data_11$nucleotide=="A") &
                         !(data_11$position==1214 & data_11$nucleotide=="T") &
                         !(data_11$position==1316 & data_11$nucleotide=="A") &
                         !(data_11$nucleotide=="-"))
correct_lines <- which(data_11$isWT &
                       !(data_11$position==867 & data_11$nucleotide=="G")&
                       !(data_11$position==1120 & data_11$nucleotide=="A") &
                       !(data_11$position==1214 & data_11$nucleotide=="T") &
                       !(data_11$position==1316 & data_11$nucleotide=="A") &
                       !(data_11$nucleotide=="-"))


rand_pos <- sample(1:nrow(data_11), 200)
df11 <- rbind(
    data.frame(type="Errors",
               difference = abs(data_11$original_quality[error_lines]-data_11$original_quality[error_lines+1]), wt = data_11$isWT[error_lines+1]),
    data.frame(type="Correct",
               difference = abs(data_11$original_quality[correct_lines]-data_11$original_quality[correct_lines+1]), wt = data_11$isWT[correct_lines+1])) %>%
    rbind( data.frame(type="Random",
               difference = abs(data_11$original_quality[rand_pos]-data_11$original_quality[sample(1:nrow(data_11), 200)]), wt = data_11$isWT[rand_pos])
    ) %>%
    mutate(difference = ifelse(difference>20, 20, difference))

toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)

```

### Figure S3
```{r FigS3, fig.width=7, fig.height=3, eval=F}
FigureS3 <- ggplot(df11, aes(x=difference))+
    geom_histogram(binwidth = 1,aes(y=..density.., text=..density..))+
    xlab("Qscore difference with next nucleotide")+
    ylab("Counts")+
    labs(tag="C")+
    facet_wrap(~type, scales = "free")

ggsave(FigureS3, file=paste0(path_save_figures,"FigureS3.png"), dpi = 'print',
       height = 5, width=15,units="cm")

FigureS3
```



Depricated nextpolish:
```{r, eval=F}
# create_run_cfg <- function(run_cfg_file, ref_fasta, workdir){
#     cat("[General]\n", file=run_cfg_file, append=FALSE)
#     cat('job_type = local\n', file=run_cfg_file, append=TRUE)
#     cat('job_prefix = nextPolish\n', file=run_cfg_file, append=TRUE)
#     cat('task = best\n', file=run_cfg_file, append=TRUE)
#     cat('rewrite = yes\n', file=run_cfg_file, append=TRUE)
#     cat('rerun = 3\n', file=run_cfg_file, append=TRUE)
#     cat('parallel_jobs = 2\n', file=run_cfg_file, append=TRUE)
#     cat('multithread_jobs = 4\n', file=run_cfg_file, append=TRUE)
#     cat('genome =', ref_fasta,' #genome file\n', file=run_cfg_file, append=TRUE)
#     cat('genome_size = auto\n', file=run_cfg_file, append=TRUE)
#     cat('workdir =',workdir, "\n",file=run_cfg_file, append=TRUE, sep="")
#     cat('   polish_options = -p {multithread_jobs}\n', file=run_cfg_file, append=TRUE)
# 
#     cat('[lgs_option]\n', file=run_cfg_file, append=TRUE)
#     cat('lgs_fofn =lgs.fofn\n', file=run_cfg_file, append=TRUE)
#     cat('lgs_options = -min_read_len 1k -max_depth 100\n', file=run_cfg_file, append=TRUE)
#     cat('lgs_minimap2_options = -x map-ont\n', file=run_cfg_file, append=TRUE)
#     return(0)
# }
# 
# 
# path0 <- "/home/respada/22nextpolish/bc06/"
# setwd(path0)
# run_folder  <- "RunFolder/"
# save_folder <- ""
# data_folder <- ""
# 
# for(bc in kt7_barcodes){
# 
#     #Load fastq file with all reads
#     all_sequences <- readLines(paste0(path_analysis_lib7, bc,".fastq"))
#     name_lines <- grep(pattern="^@",x=all_sequences)
#     save_file <- paste0(save_folder,bc, "_ConsensusByNextPolish.fasta")
# 
#     for(ns in subsets){
#         cat("\t",ns,"\n")
#         if(ns> length(name_lines)){next()}
#         for(r in 1:n_repetitions){
#             cat("----------------",ns, r, "\n")
# 
#             #Create file with subset of sequences
#             subset_name = paste0(bc,"_s",ns, "_r",r)
#             filename_subset <- paste0(run_folder,subset_name,".fastq")
#             choose_lines <- sample(name_lines, size= ns)
#             choose_lines_all <- sort(c(choose_lines,choose_lines+1,choose_lines+2,choose_lines+3))
#             writeLines(text = all_sequences[choose_lines_all], con = file(filename_subset))
# 
#             ## Run nextpolish
#             system2(paste0("echo '", subset_name,".fastq' >", run_folder, "lgs.fofn"))
#             create_run_cfg(run_cfg_file=paste0(run_folder, "run.cfg"),
#                            ref_fasta=reference_file,
#                            workdir=paste0(subset_name,"/"))
#             system2(paste0("cp ", reference_file, " ",run_folder, reference_file ))
#             system2(paste0("cd ",run_folder,"; nextPolish run.cfg"))
#             system2(paste0("sed -i '1s/.*/>",subset_name,"'/ ", run_folder, subset_name, "/genome.nextpolish.fasta "))
#             system2(paste0("cat ",run_folder, subset_name, "/genome.nextpolish.fasta >>", save_file))
#             system2(paste0("rm -r ", run_folder, subset_name))
#             system2(paste0("cd ",run_folder,";rm lgs.fofn  run.cfg"))
#             
#         }
#     }
# }
```

Alternative to fill deletions faster:
```{r, eval=F}


### OPTION
t0 <- proc.time()
a <- str_locate_all(reads_aligned, "-+")
for(i in seq(a)){
    if(nrow(a[[i]])==0){next()}
    pos_to_replace <- IRanges(a[[i]][,"start"], a[[i]][,"end"])
    pos_to_average_start <- IRanges(a[[i]][,"start"]-1,a[[i]][,"start"]-1)
    pos_to_average_end <- IRanges(a[[i]][,"end"]+1,a[[i]][,"end"]+1)

    if(start(pos_to_average_start)[1]==0){
            start(pos_to_average_start)[1] <- end(pos_to_average_start)[1] <- start(pos_to_average_end)[1] 
    }
    n <- length(pos_to_average_start)
    pos_max <- width(reads_aligned)[i]
    if(start(pos_to_average_end)[n]>pos_max){
        start(pos_to_average_end)[n] <-  start(pos_to_average_start)[n]      
        end(pos_to_average_end)[n] <- start(pos_to_average_start)[n]   
    }
    
    before <- extractAt(scores_aligned[[i]], at = pos_to_average_start)
    before <- unlist(as(PhredQuality(before),"NumericList"))
    after <- extractAt(scores_aligned[[i]], at = pos_to_average_end)
    after <- unlist(as(PhredQuality(after),"NumericList"))
    both <- cbind(before,after)
    if(gaps_weights=="mean"){
        replace_qscore <- apply(both,1, mean)
    }else if(gaps_weights=="minimum"){
        replace_qscore <- apply(both,1, min)
    }
    replace_qscore <- lapply(replace_qscore, function(x){as(x,"PhredQuality")})
    replace_qscore_v <- vapply(seq(replace_qscore), function(x){paste0(rep(replace_qscore[[x]], times=width(pos_to_replace)[x]), collapse="")})
    # print(scores_aligned[[i]])
    scores_aligned[i] <- replaceAt(scores_aligned[i],at = pos_to_replace, value = replace_qscore_v)
    # print(scores_aligned[[i]])
}
t1 <- proc.time()

t1-t0
  


### ORIGINAL
if(verbose){message("Assign values to deletions\n")}
if(verbose){pb=utils::txtProgressBar(min =0,max=length(reads_aligned),style = 3)}
t2 <- proc.time()

for(i in seq(length(reads_aligned))){
    # if(verbose){utils::setTxtProgressBar(pb,i)}
    ## REPLACE DELETION SCORES
    del_positions <- BiocGenerics::start(Biostrings::vmatchPattern("-",reads_aligned[i])[[1]])
    ndel <- length(del_positions)
    
    if(ndel==0){next()}
    nr <- width(reads_aligned)[i]
    
    #If they are at the beginning, I replace by the first Qscore
    if(del_positions[1]==1){
        n_del_start=1
        if(length(del_positions)==1){
            n_del_start <- 1
        }else{
            condition = del_positions[n_del_start+1]==del_positions[n_del_start]+1
            while(condition){
                n_del_start <- n_del_start+1
                condition1 = is.na(del_positions[n_del_start+1])
                if(condition1){
                    condition=FALSE
                }else{
                    condition = del_positions[n_del_start+1]==del_positions[n_del_start]+1
                }
            }
        }
        scores_aligned[i] <- Biostrings::replaceAt(scores_aligned[i],at=IRanges(1,n_del_start),as.character(subseq(scores_aligned[i], 1, n_del_start)))
    }else{n_del_start=NA}
    
    #If they are at the end, I replace by the last Qscore
    if(del_positions[ndel]==nr){
        #detect which are the values on the end of the string
        Breaks <- c(0, which(diff(del_positions) != 1), length(del_positions))
        Breaks_ini <- Breaks[length(Breaks)-1]+1
        Breaks_end <- Breaks[length(Breaks)]
        del_pos_ini <- del_positions[Breaks_ini]
        del_pos_end <- del_positions[Breaks_end]
        replacement <- paste0(rep(as.character(subseq(scores_aligned[i], del_pos_ini-1, del_pos_ini-1)), Breaks_end-Breaks_ini+1),collapse="")
        scores_aligned[i] <-Biostrings::replaceAt(scores_aligned[i],
                                                  at=IRanges(del_pos_ini,del_pos_end),
                                                  replacement)
        n_del_end <- length(Breaks_ini:Breaks_end)
    }else{n_del_end=NA}
    
    if(!is.na(n_del_end)){del_positions <- del_positions[-seq(ndel-n_del_end+1,ndel)]}
    if(!is.na(n_del_start)){del_positions <- del_positions[-seq(n_del_start)]}
    if(length(del_positions)==0){next()}
    ndel <- length(del_positions)
    for(j in seq_along(del_positions)){
        jmin <- j
        while(j < ndel && del_positions[j+1]==del_positions[j]+1){ j <- j+1 }
        jmax<-j
        
        q_upstr <- subseq(scores_aligned[i], del_positions[jmin]-1, del_positions[jmin]-1)
        q_dwstr <- subseq(scores_aligned[i], del_positions[jmax]+1, del_positions[jmax]+1)
        
        if(gaps_weights=="mean"){
            prob_upstr <- as(q_upstr,"NumericList")[[1]]
            prob_dwstr  <- as(q_dwstr,"NumericList")[[1]]
            prob <- mean(prob_dwstr,prob_upstr)
        }else if(gaps_weights=="minimum"){
            prob_upstr <- as(q_upstr,"NumericList")[[1]]
            prob_dwstr  <- as(q_dwstr,"NumericList")[[1]]
            prob <- min(prob_dwstr,prob_upstr)
        }
        
        qscore_new <- as(prob,"PhredQuality")
        qscore_new_vec <- paste0(rep(as.character(qscore_new), jmax-jmin+1), collapse="")
        scores_aligned[i] <- Biostrings::replaceAt(scores_aligned[i],at=IRanges(del_positions[jmin],del_positions[jmax]),qscore_new_vec)
    }
    
}

t3 <- proc.time()

t3-t2



```



# Session info {.unnumbered}

```{r sessionInfo, echo=FALSE}
sessionInfo()
```