## ----setup, echo=FALSE, results="hide", warning=FALSE, eval=TRUE-------------- suppressPackageStartupMessages({library(fraq)}) ## ----eval=FALSE--------------------------------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # BiocManager::install("fraq") ## ----walkthrough-setup, eval=TRUE, message=FALSE, warning=FALSE--------------- set.seed(314156) example_dir <- file.path(tempdir(), "fraq_function_examples") dir.create(example_dir, showWarnings = FALSE, recursive = TRUE) R1 <- file.path(example_dir, "example_R1.fastq") R2 <- file.path(example_dir, "example_R2.fastq") generate_random_fastq(c(R1, R2), n_reads = 2000, read_length = 150) example_reads <- c(R1, R2) ## ----fraq_summary, eval=TRUE-------------------------------------------------- library(fraq) # summarize quality metrics qc <- fraq_summary(c(R1, R2)) ## ----fig-qc-quicklook, eval=TRUE, fig.cap="Quick QC overview from `fraq_summary()`.", fig.width=8, fig.asp=0.6, warning=FALSE, message=FALSE, echo=FALSE, fig.path="plots/"---- # quick-look QC plots op <- par(no.readonly = FALSE) par(mfrow = c(2, 2), mar = c(4, 4, 2.5, 1)) insert <- qc$insert_size if (!is.null(insert) && nrow(insert) > 0) { barplot( insert$count, names.arg = insert$insert_size, col = "#38bdf8", border = NA, xlab = "Insert size", ylab = "Count", main = "Insert-size distribution" ) } else { len <- qc$length_distribution_R1 barplot( len$count, names.arg = len$length, col = "#38bdf8", border = NA, xlab = "Read length", ylab = "Count", main = "Length distribution" ) } qual <- qc$per_base_quality_R1 plot(qual$position, qual$mean_q, type = "l", lwd = 2, col = "#6366f1", xlab = "Cycle", ylab = "Mean Phred score", main = "Mean quality by cycle") grid(col = "#e5e7eb", lty = "dotted") base_content <- qc$per_base_content_R1 palette <- c( A = "#22c55e", C = "#0ea5e9", G = "#f97316", T = "#ef4444", N = "#94a3b8" ) plot(NA, xlim = range(base_content$position), ylim = c(0, 1), xlab = "Cycle", ylab = "Fraction", main = "Base composition") present_bases <- intersect(names(palette), unique(base_content$base)) for (b in present_bases) { dat <- base_content[base_content$base == b, , drop = FALSE] dat <- dat[order(dat$position), ] lines(dat$position, dat$fraction, col = palette[[b]], lwd = 2) } if (length(present_bases)) { legend("topright", legend = present_bases, col = palette[present_bases], lwd = 2, bty = "n", cex = 0.9) } base_totals <- aggregate(count ~ base, base_content, sum) cols <- palette[base_totals$base] cols[is.na(cols)] <- "#94a3b8" barplot( base_totals$count, names.arg = base_totals$base, col = cols, border = NA, xlab = "Base", ylab = "Total count", main = "Base totals" ) par(op) ## ----walkthrough-filtering, eval=TRUE, message=FALSE, warning=FALSE----------- filter_dir <- file.path(example_dir, "filtering") dir.create(filter_dir, showWarnings = FALSE) downsampled_reads <- file.path( filter_dir, c("example_R1_ds.fastq.gz", "example_R2_ds.fastq.gz") ) fraq_downsample(example_reads, downsampled_reads, amount = 0.50, nthreads = 2L) quality_reads <- file.path( filter_dir, c("example_R1_q20.fastq.gz", "example_R2_q20.fastq.gz") ) fraq_quality_filter( input = downsampled_reads, output = quality_reads, min_mean_quality = 22, max_low_q_bases = 6L, low_q_threshold = 18L, nthreads = 2L ) filtered_stats <- fraq_summary(quality_reads, nthreads = 2L)$basic_stats_R1 filtered_stats ## ----walkthrough-splitting-setup, eval=TRUE, message=FALSE, warning=FALSE----- split_dir <- file.path(example_dir, "splitting") if (dir.exists(split_dir)) unlink(split_dir, recursive = TRUE) dir.create(split_dir, showWarnings = FALSE) barcode_set <- c("ACGTAA", "TTGGCC") demux_patterns <- file.path( split_dir, c("R1_{barcode}.fastq.gz", "R2_{barcode}.fastq.gz") ) ## ----walkthrough-demux, eval=TRUE, message=FALSE, warning=FALSE--------------- fraq_demux( input = example_reads, output_format = demux_patterns, barcodes = barcode_set, max_distance = 1L, nthreads = 2L ) basename(sort( list.files(split_dir, pattern = "R1_.*\\.fastq.gz$", full.names = TRUE) )) ## ----walkthrough-chunk, eval=TRUE, message=FALSE, warning=FALSE--------------- chunk_prefix <- file.path(split_dir, "chunk_demo") fraq_chunk( input = example_reads[1], output_prefix = chunk_prefix, output_suffix = "gz", chunk_size = 200, nthreads = 2L ) basename(sort( list.files( split_dir, pattern = "chunk_demo_chunk.+\\.fastq.gz$", full.names = TRUE ) )) ## ----walkthrough-barcodes, eval=TRUE, message=FALSE, warning=FALSE------------ fraq_count_barcodes( input = example_reads, barcodes = barcode_set, max_distance = 0L, allow_revcomp = FALSE, nthreads = 2L ) ## ----walkthrough-modification, eval=TRUE, message=FALSE, warning=FALSE-------- mod_dir <- file.path(example_dir, "modification") if (dir.exists(mod_dir)) unlink(mod_dir, recursive = TRUE) dir.create(mod_dir, showWarnings = FALSE) ## ----walkthrough-modification-convert, eval=TRUE, message=FALSE, warning=FALSE---- converted_fastq <- file.path( mod_dir, c("example_R1.fastq.zst", "example_R2.fastq.zst") ) fraq_convert(example_reads, converted_fastq, nthreads = 2L) ## ----walkthrough-modification-concat, eval=TRUE, message=FALSE, warning=FALSE---- concatenated <- file.path(mod_dir, "example_all.fastq.gz") fraq_concat(converted_fastq, concatenated, nthreads = 2L) ## ----walkthrough-modification-merge, eval=TRUE, message=FALSE, warning=FALSE---- merged_reads <- file.path(mod_dir, "example_merged.fastq") unmerged_reads <- file.path( mod_dir, c("example_unmerged_R1.fastq", "example_unmerged_R2.fastq") ) fraq_merge_pairs( input = example_reads, output_merged = merged_reads, output_unmerged = unmerged_reads, min_overlap = 20L, max_mismatch_rate = 0.05, nthreads = 2L ) ## ----walkthrough-modification-trim, eval=TRUE, message=FALSE, warning=FALSE---- trimmed_reads <- file.path( mod_dir, c("example_trimmed_R1.fastq", "example_trimmed_R2.fastq") ) fraq_trim_adapters( input = example_reads, output = trimmed_reads, adapters = "ACTAC", max_distance = 1L, filter_untrimmed = FALSE, nthreads = 2L ) ## ----rcpp_example, eval=FALSE------------------------------------------------- # input <- c("sample_R1.fastq.gz", "sample_R2.fastq.gz") # output <- c("filtered_R1.fastq.zst", "filtered_R2.fastq.zst") # fraq_gc_filter(input, output, gc_min = 0.30, gc_max = 0.70) ## ----fig-streaming-example, eval=FALSE---------------------------------------- # library(fraq) # generate_random_fastq("R1.fastq") # generate_random_fastq("R2.fastq") # fraq_downsample(input=c("R1.fastq", "R2.fastq"), # output=c("ds_R1.fastq.fifo", "ds_R2.fastq.fifo"), # amount = 0.25, nthreads = 5L) ## ----fraq_options, eval=TRUE-------------------------------------------------- fraq_options("blocksize") # current block size (default: 65535 reads) fraq_options("blocksize", 16384L) # shrink batches when running small tests fraq_options("zstd_compress_level", 6L) fraq_options("gzip_compress_level", 4L) ## ----fig-session-info, eval=TRUE---------------------------------------------- sessionInfo()