% -*- mode: noweb; noweb-default-code-mode: R-mode; -*-
%\VignetteIndexEntry{hapFabia: Manual for the R package}
%\VignetteDepends{hapFabia}
%\VignettePackage{hapFabia}
%\VignetteKeywords{hapFabia,models,multivariate,cluster,hplot,file,genetics,haplotype,identity by descent,bicluster,next generation sequencing,genotype,single nucleotide polymorphism,single nucleotide variation,rare variants,rare SNPs, rare SNVs,rare haplotypes,short haplotypes}
%setwd("c:/sepp/work/hapfabia/hapFabia_10/hapFabia/vignettes")
%Sweave("hapFabia.Rnw")
%tools::texi2dvi("hapFabia.tex",pdf=TRUE)
%Stangle("hapFabia.Rnw")
%source("hapFabia.R")


\documentclass[article]{bioinf}

\usepackage{amsmath,amssymb}
\usepackage{bm}
\usepackage{natbib}
\usepackage{hyperref}
\usepackage{booktabs}
\usepackage{afterpage}
\usepackage{float}

\hypersetup{colorlinks=false,
   pdfborder=0 0 0,
   pdftitle={hapFabia: Identification of very short segments of identity by
  descent (IBD)
  characterized by rare variants in large sequencing data},
   pdfauthor={Sepp Hochreiter}}

\renewcommand{\dblfloatpagefraction}{0.99}
\renewcommand{\topfraction}{0.99}
\renewcommand{\bottomfraction}{0.99}
\renewcommand{\textfraction}{0.01}
\setcounter{dbltopnumber}{2}
\setcounter{bottomnumber}{2}


\def\Up{\bm{\Upsilon}}
\def\pp{\bm{\Psi}}
\def\epp{\bm{\epsilon}}
\def\Ncal{\mathcal{N}}
\def\X{\bm{X}}
\def\x{\bm{x}}
\def\xii{\bm{\xi}}
\def\Xii{\bm{\Xi}}
\def\oo{\mathrm{old}}
\def\nn{\mathrm{new}}
\def\sign{\mathrm{sign}}

\def\La{\bm{\Lambda}}
\def\la{\bm{\lambda}}
\def\z{\bm{z}}
\def\Z{\bm{Z}}
\def\Rb{\mathbb{R}}
\def\E{\mathbf{\mathrm{E}}}

\newcommand{\R}{\textrm{R} }
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\texttt{#1}}}




\title{hapFabia: Identification of very short segments of identity by
  descent (IBD)
  characterized by rare variants in large sequencing data\\ \textit{---
    Manual for the R package ---}}

\author{Sepp Hochreiter}
\affiliation{Institute of Bioinformatics, Johannes Kepler University
Linz\\Altenberger Str. 69, 4040 Linz, Austria\\
\email{hochreit@bioinf.jku.at}}

\usepackage[noae]{Sweave}


\SweaveOpts{eps=FALSE}

\begin{document}
<<echo=FALSE,results=hide>>=
options(width=60)
set.seed(0)
library(hapFabia)
hapFabiaVer<-packageDescription("hapFabia")$Version
@
%$

\newcommand{\hapFabiaVer}{\Sexpr{hapFabiaVer}}

\manualtitlepage[Version \hapFabiaVer, \today]

\newlength{\auxparskip}
\setlength{\auxparskip}{\parskip}
\setlength{\parskip}{0pt}
\tableofcontents
\clearpage
\setlength{\parskip}{\auxparskip}


\section{Introduction}

This package \Rpackage{hapFabia} provides software
for the method HapFABIA which identifies
short identity by descent (IBD) segments
that are tagged by rare variants in
large sequencing data.

Two haplotypes are identical by descent (IBD)
if they share a segment that
both inherited from a common ancestor.
Current IBD methods reliably detect long IBD segments
because many minor alleles in the segment are concordant
between the two haplotypes.
However, many cohort studies contain unrelated individuals
which share only short IBD segments.
Short IBD segments contain too few minor alleles
to distinguish IBD from random allele sharing by recurrent mutations.
New sequencing techniques improve the
situation by providing rare variants which
convey more information on IBD than common variants,
because random minor allele sharing
of rare variants is less likely than for common variants.

Short IBD segments are of interest because
(i) they resolve the genetic structure on a fine scale
and (ii) they can be assumed to be old.
In order to detect short IBD segments, both the information supplied by rare
variants and information from more than two individuals should be
utilized.
The probability of a segment being IBD is typically computed via
the probabilities of randomly sharing single alleles within the
segment.
The probability of randomly sharing a single allele depends
(1) on the allele frequency,
where lower frequency means lower probability of random sharing, and
(2) on the number of individuals that share the allele, where more
individuals means lower probability of random sharing.
Therefore a segment that contains rare variants and is
shared by more individuals has higher significance of being IBD.
These two characteristics
are the basis for detecting short IBD segments by HapFABIA.


\begin{figure*}[htb!]
\includegraphics[width=\textwidth]{./figures/pedi}
\caption{The IBD segment marked in yellow descended from a founder to
  different individuals. \label{fig:pedi}}
\end{figure*}
%\end{figure}

We propose biclustering \cite{Hochreiter:10}
to detect very short IBD segments that are shared
among multiple individuals.
Biclustering simultaneously clusters rows and columns of a matrix.
In particular it clusters row elements that
are similar to each other on a subset of column elements.
A genotype matrix has individuals (unphased) or chromosomes (phased)
as row elements and SNVs as column elements.
Entries in the genotype matrix usually count how often
the minor allele of a particular SNV is present in
a particular individual.
Alternatively, minor allele likelihoods or dosages may be used.
Individuals that share an IBD segment are similar to
each other at minor alleles of SNVs (tagSNVs) which tag
the IBD segment (see Fig.~\ref{fig:bicluster}).
Therefore an IBD segment that is shared among individuals
corresponds to a bicluster because these individuals are similar to
one another at this segment.
Identifying a bicluster means identifying tagSNVs
(column bicluster elements) that tag an
IBD segment and, simultaneously, identifying
individuals (row bicluster elements) that possess the IBD segment.



\begin{figure*}[htb!]
\includegraphics[width=\textwidth]{./figures/plotB1}
\caption{Biclustering of a genotyping matrix.
Left: original genotyping data matrix with individuals as
  row elements and SNVs as column elements. Minor alleles
  are indicated by violet bars and major alleles by yellow bars for each
  individual-SNV pair.
Right: after sorting the rows, the detected bicluster
can be seen in the top three individuals.
They contain the same IBD segment which is marked in gold.
Biclustering simultaneously clusters rows and columns of a matrix so
that row elements (here individuals) are similar to each other
on a subset of column elements (here the tagSNVs). \label{fig:bicluster}}
\end{figure*}

In contrast to standard IBD detection methods,
biclustering considers multiple individuals.
In contrast to standard clustering,
biclustering allows for
SNVs or individuals that do not belong to any cluster or
to more than one bicluster.
Multiple cluster membership suits IBD detection because
diploid individuals can have
two IBD segments at one locus and
an SNV may tag more than one IBD segment.



FABIA is able to represent
homozygous regions (the same IBD segment in both chromosomes)
by means of its factors.
At a locus, overlapping IBD segments in one diploid individual
(a different IBD segment in each of the two chromosomes)
are represented through additivity of biclusters in the FABIA model.
Examples of short IBD segments found by hapFabia in chromosome~1 data from the
1000 Genomes Project are given in
Fig.~\ref{fig:example1} and Fig.~\ref{fig:example2}.

\begin{figure*}[p]
\includegraphics[width=0.9\textwidth]{./figures/example1}
\caption{Example of an IBD segment in chromosome~1 found in the
  1000 Genomes Project data. The $y$-axis
gives chromosomes and the $x$-axis consecutive SNVs. Yellow
indicates major alleles, violet minor alleles of tagSNVs, and blue
minor alleles of other SNVs.
``model L'' indicates tagSNVs
identified by hapFabia in violet.
A probable phasing error can be seen in line 3 and 4 at individual NA18522.
Another phasing error can be seen in the last but four and the last but five
line at individual NA19435. \label{fig:example1}}
\end{figure*}

\begin{figure*}[p]
\includegraphics[width=0.9\textwidth]{./figures/example8}
\caption{Another example of an IBD segment from chromosome~1 of
  the 1000 Genomes Project. See Fig.~\ref{fig:example1}
 for a description.
 Again probable phasing errors at individuals NA18516, NA19310, and
 NA19384.  \label{fig:example2}}
\end{figure*}

\clearpage

\section{Getting Started}

\subsection{Typical Analysis Pipeline}

First, we briefly describe a typical analysis pipeline.
Assume we have the genotype data of chromosome 1 in the file \texttt{filename.vcf.gz} in
compressed \texttt{vcf} format.
To prepare the data for \Rpackage{hapFabia} we have to perform
preprocessing steps.
First \texttt{filename.vcf.gz}
must be 1.\ uncompressed, then 2.\ converted to the sparse matrix
format, 3. copy genotype matrix to the matrix that is processed,
and then 4.\ split into intervals. The following command line
commands perform these steps:
\begin{enumerate}
\item \texttt{gunzip filename.vcf.gz}
\item \texttt{./vcftoFABIA filename ./}
\item \texttt{cp filename\_matG.txt filename\_mat.txt}
\item \texttt{./split\_sparse\_matrix filename \_mat.txt 10000 5000 1}
\end{enumerate}
In \texttt{inst/commandline/arch/} command line tools for
steps 2.\ to 4.\ are provided by the package \Rpackage{hapFabia}.
However step 2.\ to 4.\ can be performed in \R as well (see below).
The commandline parameters for \texttt{vcftoFABIA} are
\begin{enumerate}
\item filename without \texttt{.vcf}
\item path to the file, e.g.\ \texttt{./}
\item optional: \texttt{-s snps}, where snps gives the number of
    SNVs in the input data file
\item optional:  \texttt{-o outputFileName}, which gives the
      prefix of the output files.
\end{enumerate}

The commandline parameters for \texttt{split\_sparse\_matrix} are
\begin{enumerate}
\item filename without \texttt{.vcf}
\item extension (default\texttt{\_mat.txt})
\item interval length
\item shift size
\item indicator whether annotation is
present (is generated by  \texttt{vcftoFABIA} as default).
\end{enumerate}
The data is split into intervals of 10,000 SNVs where the distance
between adjacent intervals is 5,000 thus they overlap by 5,000 SNVs.

After providing the file \texttt{filename.vcf} the following steps
constitute a typical analysis pipeline in \R:
\begin{Sinput}
R> ##### define intervals, overlap, filename #######
R> shiftSize <- 5000
R> intervalSize <- 10000
R> fileName="filename" # without type
R>
R> ##### load library #######
R> library(hapFabia)
R>
R> ##### convert from .vcf to _mat.txt: step 2. above #######
R> vcftoFABIA(fileName=fileName)
R>
##### copy genotype matrix to matrix: step 3. above #######
R> file.copy(paste(fileName,"_matG.txt",sep=""),
+  paste(fileName,"_mat.txt",sep=""))
R>
R> ##### split/ generate intervals: step 4. above #######
R> split_sparse_matrix(fileName=fileName,intervalSize=intervalSize,
+  shiftSize=shiftSize,annotation=TRUE)
R>
R> ##### compute how many intervals we have #######
R> ina <- as.numeric(readLines(paste(fileName,"_mat.txt",sep=""),n=2))
R> noSNVs <- ina[2]
R> over <- intervalSize%/%shiftSize
R> N1 <- noSNVs%/%shiftSize
R> endRunA <- (N1-over+2)
R>
R> ##### analyze each interval #######
R> ##### may be done by parallel runs #######
R> iterateIntervals(startRun=1,endRun=endRunA,shift=shiftSize,
+  intervalSize=intervalSize,fileName=fileName,individuals=0,
+  upperBP=0.05,p=10,iter=40,alpha=0.03,cyc=50,IBDsegmentLength=50,
+  Lt = 0.1,Zt = 0.2,thresCount=1e-5,mintagSNVsFactor=3/4,
+  pMAF=0.035,haplotypes=FALSE,cut=0.8,procMinIndivids=0.1,thresPrune=1e-3,
+  simv="minD",minTagSNVs=6,minIndivid=2,avSNVsDist=100,SNVclusterLength=100)
R>
R> ##### identify duplicates #######
R> identifyDuplicates(fileName=fileName,startRun=1,endRun=endRunA,
+  shift=shiftSize,intervalSize=intervalSize)
R>
R> ##### analyze results; parallel #######
R> anaRes <- analyzeIBDsegments(fileName=fileName,startRun=1,endRun=endRunA,
+  shift=shiftSize,intervalSize=intervalSize)
R> print("Number IBD segments:")
R> print(anaRes$noIBDsegments)
R> print("Statistics on IBD segment lengths in SNVs (all SNVs in the
IBD segment):")
R> print(anaRes$avIBDsegmentLengthSNVS)
R> print("Statistics on IBD segment lengths in bp:")
R> print(anaRes$avIBDsegmentLengthS)
R> print("Statistics on number of individuals that share an IBD segment:")
R> print(anaRes$avnoIndividS)
R> print("Statistics on number of IBD segment tagSNVs:")
R> print(anaRes$avnoTagSNVsS)
R> print("Statistics on MAF of IBD segment tagSNVs:")
R> print(anaRes$avnoFreqS)
R> print("Statistics on MAF within the group of IBD segment tagSNVs:")
R> print(anaRes$avnoGroupFreqS)
R> print("Statistics on number of changes between major and minor allele frequency:")
R> print(anaRes$avnotagSNVChangeS)
R> print("Statistics on tagSNVs per individual that shares an IBD segment:")
R> print(anaRes$avnotagSNVsPerIndividualS)
R> print("Statistics on number of individuals that have the minor allele of tagSNVs:")
R> print(anaRes$avnoindividualPerTagSNVS)
R>
R> ##### load result for interval 50 #######
R> posAll <- 50 # (50-1)*5000 = 245000: segment 245000 to 255000
R> start <- (posAll-1)*shiftSize
R> end <- start + intervalSize
R> pRange <- paste("_",format(start,scientific=FALSE),"_",
+  format(end,scientific=FALSE),sep="")
R> load(file=paste(fileName,pRange,"_resAnno",".Rda",sep=""))
R> IBDsegmentList <- resHapFabia$mergedIBDsegmentList # $
R>
R> summary(IBDsegmentList)
R> ##### plot IBD segments in interval 50 #######
R> plot(IBDsegmentList,filename=paste(fileName,pRange,"_mat",sep=""))
R>    ##attention: filename without type ".txt"
R>
R> ##### plot the first IBD segment in interval 50 #######
R>
R> IBDsegment <- IBDsegmentList[[1]]
R> plot(IBDsegment,filename=paste(fileName,pRange,"_mat",sep=""))
R>    ##attention: filename without type ".txt"
\end{Sinput}
First the packages \Rpackage{hapFabia} and \Rpackage{fabia} are loaded.
Then \Rfunction{vcftoFABIA} converts \texttt{filename.vcf}
to sparse matrix format giving:
\begin{itemize}
\item \texttt{filename\_matH.txt} (haplotype data),
\item \texttt{filename\_matG.txt} (genotype data),
\item \texttt{filename\_matD.txt} (dosage data),
\end{itemize}
together with the SNV annotation file and individual's label file:
\begin{itemize}
\item \texttt{filename\_annot.txt} and
\item \texttt{filename\_individuals.txt}.
\end{itemize}
The function \Rfunction{split\_sparse\_matrix} splits
the data into intervals.
The function \Rfunction{iterateIntervals} identifies IBD segments
in these intervals and stores the results in an EXCEL like
\texttt{.csv} format and as an \R data object.
The function \Rfunction{identifyDuplicates} marks and memorizes
duplicates of IBD segments which occur because the intervals overlap.
Next the function \Rfunction{analyzeIBDsegments} analyzes the results where
duplicates as marked in previous step are not considered.
Results are listed by \texttt{anaRes}.

The next example shows how to view all IBD segments of a segment, for
which we chose interval 50 which corresponds to chromosome 1 range
from 245,000 to 255,000 ($(50-1)*5000 = 245000$).
Then we plot a specific IBD segment,
in this case the first (\texttt{IBDsegmentList[[1]]}),
which can also be used to store a \texttt{.pdf} or a \texttt{.fig} for
editing with Xfig.
Examples of this plot function are given in
Fig.~\ref{fig:example1} and Fig.~\ref{fig:example2}.


An \R source file  \Rfunction{pipeline.R} of above pipeline
can be created and executed as follows:
\begin{Sinput}
R> makePipelineFile("filename",shiftSize=5000,
+  intervalSize=10000,haplotypes=FALSE)
R>
R> source("pipeline.R")
\end{Sinput}
NOTE: sourcing may take a while for large datasets.

The next example shows how to use the pipeline.


\subsection{Examples}

Next we show an example how to use  \Rpackage{hapFabia}.
This example shows how to run the whole pipeline if the genotype data
is given as \texttt{.vcf} file.
The data is first converted to a
sparse matrix format by \texttt{vcftoFABIA} and then
divided into overlapping intervals by
\texttt{split\_sparse\_matrix}.
Then the IBD segments are extracted by \texttt{iterateIntervals} and
duplicates due to overlapping intervals marked by \texttt{identifyDuplicates}.
Subsequently the IBD segments are analyzed by
\texttt{analyzeIBDsegments},
where only simple statistics are computed.


Work in a temporary directory.
<<gotoTempDir>>=
old_dir <- getwd()
setwd(tempdir())
@

First the package is loaded.
<<loadLibs>>=
library(hapFabia)
@

Load data and write to \texttt{vcf} file.
In a real application the data is already given, therefore this chunk
of code would not be necessary.
<<makeVCFfile>>=
data(chr1ASW1000G)
write(chr1ASW1000G,file="chr1ASW1000G.vcf")
@

Create the analysis pipeline, where intervals contain only 1,000 SNVs
(that is a length of about 100 kbps),
while default is 10,000 SNVs (that is about a length of 1 Mbp).

<<createPipeline>>=
makePipelineFile(fileName="chr1ASW1000G",shiftSize=500,
                 intervalSize=1000,haplotypes=TRUE)
@

Now the pipeline can be executed by sourcing it.
<<executePipeline,fig=FALSE,results=hide>>=
source("pipeline.R")
@

Next we list the files that were generated, where
\texttt{\_N1\_N2\_} indicates that the file contains information
concerning the interval that starts at N1 and ends at N2,
\texttt{\_ALL.Rda} stores just the number of individuals and the number of SNVs,
\texttt{\_individuals.txt} contains annotation for the individuals (in
particular their names),
\texttt{\_matG.txt}
denotes phased genotype data in sparse matrix format,
\texttt{\_matD.txt} contains the genotype data as dosage in sparse
matrix format,
\texttt{\_matG.txt} contains unphased genotype data in sparse
matrix format,
\texttt{\_annot.txt} supplies
information on the SNVs,
\texttt{\_resAnno.Rda} is the result from hapFabia,
the result is also available as  \texttt{csv} file with extension
\texttt{\_csv.txt}.

Following files have been generated:
<<listFiles>>=
list.files(pattern="chr1")
@

In the following we show the results of calling the function
\texttt{analyzeIBDsegments} in above pipeline:
<<showResultSummary>>=
print("Number IBD segments:")
print(anaRes$noIBDsegments)
print("Statistics on IBD segment length in SNVs (all SNVs in the IBD segment):")
print(anaRes$avIBDsegmentLengthSNVS)
print("Statistics on IBD segment length in bp:")
print(anaRes$avIBDsegmentLengthS)
print("Statistics on number of individuals belonging to IBD segments:")
print(anaRes$avnoIndividS)
print("Statistics on number of tagSNVs of IBD segments:")
print(anaRes$avnoTagSNVsS)
print("Statistics on MAF of tagSNVs of IBD segments:")
print(anaRes$avnoFreqS)
print("Statistics on MAF within the group of tagSNVs of IBD segments:")
print(anaRes$avnoGroupFreqS)
print("Statistics on number of changes between major and minor allele frequency:")
print(anaRes$avnotagSNVChangeS)
print("Statistics on number of tagSNVs per individual of an IBD segment:")
print(anaRes$avnotagSNVsPerIndividualS)
print("Statistics on number of individuals that have the minor allele of tagSNVs:")
print(anaRes$avnoindividualPerTagSNVS)
@




Next we load interval 5 and there the first  and second IBD segment
<<loadSegment5>>=
posAll <- 5
start <- (posAll-1)*shiftSize
end <- start + intervalSize
pRange <- paste("_",format(start,scientific=FALSE),"_",
                format(end,scientific=FALSE),sep="")
load(file=paste(fileName,pRange,"_resAnno",".Rda",sep=""))
IBDsegmentList <- resHapFabia$mergedIBDsegmentList
summary(IBDsegmentList)
IBDsegment1 <- IBDsegmentList[[1]]
summary(IBDsegment1)
IBDsegment2 <- IBDsegmentList[[2]]
summary(IBDsegment2)
@

Finally go back to old directory.
<<goBackToOldDir>>=
new_dir <- getwd()
setwd(old_dir)
@



Plot the first IBD segment in interval 5.
In the plot the $y$-axis
gives the individuals or the chromosomes
and the $x$-axis consecutive SNVs. The default color coding uses yellow
for major alleles, violet for minor alleles of tagSNVs, and blue
for minor alleles of other SNVs.
``model L'' indicates tagSNVs
identified by hapFabia in violet.
<<plotIBDsegment1,fig=TRUE>>=
plot(IBDsegment1,filename=paste(new_dir,"/",fileName,pRange,"_mat",sep=""))
@

Plot the second IBD segment in interval 5.
<<plotIBDsegment2,fig=TRUE>>=
plot(IBDsegment2,filename=paste(new_dir,"/",fileName,pRange,"_mat",sep=""))
@



Here an example with simulated data.

Work in temporary directory.
<<gotoTempDir>>=
old_dir <- getwd()
setwd(tempdir())
@

The data \texttt{simu} is loaded and
written into three files:
\begin{itemize}
\item \texttt{dataSim1fabia\_individuals.txt} (sample names),
\item \texttt{dataSim1fabia\_annot.txt} (SNV annotation information), and
\item \texttt{dataSim1fabia\_mat.txt} (the data in sparse matrix format).
\end{itemize}
These  files are generated by the standard pipeline by
\texttt{vcftoFABIA} and by \texttt{split\_sparse\_matrix}.



<<createData>>=
data(simu)
namesL <- simu[["namesL"]]
haploN <- simu[["haploN"]]
snvs <- simu[["snvs"]]
annot <- simu[["annot"]]
alleleIimp <- simu[["alleleIimp"]]
write.table(namesL,file="dataSim1fabia_individuals.txt",
    quote = FALSE,row.names = FALSE,col.names = FALSE)
write(as.integer(haploN),file="dataSim1fabia_annot.txt",
    ncolumns=100)
write(as.integer(snvs),file="dataSim1fabia_annot.txt",
    append=TRUE,ncolumns=100)
write.table(annot,file="dataSim1fabia_annot.txt", sep = " ",
    quote = FALSE,row.names = FALSE,col.names = FALSE,append=TRUE)
write(as.integer(haploN),file="dataSim1fabia_mat.txt",ncolumns=100)
write(as.integer(snvs),file="dataSim1fabia_mat.txt",
    append=TRUE,ncolumns=100)

for (i in 1:haploN) {

  a1 <- which(alleleIimp[i,]>0.01)

  al <- length(a1)
  b1 <- alleleIimp[i,a1]

  a1 <- a1 - 1
  dim(a1) <- c(1,al)
  b1 <- format(as.double(b1),nsmall=1)
  dim(b1) <- c(1,al)

  write.table(al,file="dataSim1fabia_mat.txt", sep = " ",
     quote = FALSE,row.names = FALSE,col.names = FALSE,append=TRUE)
  write.table(a1,file="dataSim1fabia_mat.txt", sep = " ",
     quote = FALSE,row.names = FALSE,col.names = FALSE,append=TRUE)
  write.table(b1,file="dataSim1fabia_mat.txt", sep = " ",
     quote = FALSE,row.names = FALSE,col.names = FALSE,append=TRUE)

}
@

Now the IBD segments can be extracted from the data:
<<callHapFabia,results=hide>>=
hapRes <- hapFabia(fileName="dataSim1fabia",prefixPath="",
   sparseMatrixPostfix="_mat",
   annotPostfix="_annot.txt",individualsPostfix="_individuals.txt",
   labelsA=NULL,pRange="",individuals=0,lowerBP=0,upperBP=0.15,
   p=10,iter=1,quant=0.01,eps=1e-5,alpha=0.03,cyc=50,non_negative=1,
   write_file=0,norm=0,lap=100.0,IBDsegmentLength=10,Lt = 0.1,
   Zt = 0.2,thresCount=1e-5,mintagSNVsFactor=3/4,pMAF=0.1,
   haplotypes=FALSE,cut=0.8,procMinIndivids=0.1,thresPrune=1e-3,
   simv="minD",minTagSNVs=6,minIndivid=2,avSNVsDist=100,SNVclusterLength=100)
@

<<SummaryHapFabia>>=
summary(hapRes$mergedIBDsegmentList)
@

To view the results the first IBD segment is plotted:
<<assignIBDsegmentList>>=
mergedIBDsegmentList <- hapRes$mergedIBDsegmentList # $
IBDsegment <- mergedIBDsegmentList[[1]]
@
<<goBackToOldDir>>=
new_dir <- getwd()
setwd(old_dir)
@

Again, in the plot the $y$-axis
gives the individuals or the chromosomes
and the $x$-axis consecutive SNVs. The default color coding uses yellow
for major alleles, violet for minor alleles of tagSNVs, and blue
for minor alleles of other SNVs.
``model L'' indicates tagSNVs
identified by hapFabia in violet.
<<plotIBDsegment3,fig=TRUE>>=
plot(IBDsegment,filename=paste(new_dir,"/dataSim1fabia_mat",sep=""))
@



Here another example with random data:
<<gotoTempDirSecond>>=
old_dir <- getwd()
setwd(tempdir())
@
<<secondExampleData>>=
simulateIBDsegmentsFabia(minruns=2,maxruns=2)
@
<<secondExampleCallHapFabia,results=hide>>=
hapRes <- hapFabia(fileName="dataSim2fabia",prefixPath="",
   sparseMatrixPostfix="_mat",
   annotPostfix="_annot.txt",individualsPostfix="_individuals.txt",
   labelsA=NULL,pRange="",individuals=0,lowerBP=0,upperBP=0.15,
   p=10,iter=1,quant=0.01,eps=1e-5,alpha=0.03,cyc=50,non_negative=1,
   write_file=0,norm=0,lap=100.0,IBDsegmentLength=10,Lt = 0.1,
   Zt = 0.2,thresCount=1e-5,mintagSNVsFactor=3/4,pMAF=0.1,
   haplotypes=FALSE,cut=0.8,procMinIndivids=0.1,thresPrune=1e-3,
   simv="minD",minTagSNVs=6,minIndivid=2,avSNVsDist=100,SNVclusterLength=100)
@
<<secondExampleSummaryHapFabia>>=
summary(hapRes$mergedIBDsegmentList)
@

<<secondExampleAssignIBDsegment>>=
mergedIBDsegmentList <- hapRes$mergedIBDsegmentList # $
IBDsegment <- mergedIBDsegmentList[[1]]
@
<<goBackToOldDirSecond>>=
new_dir <- getwd()
setwd(old_dir)
@
<<secondExamplePlot,fig=TRUE>>=
plot(IBDsegment,filename=paste(new_dir,"/dataSim2fabia_mat",sep=""))
@



\section{hapFabia Method}
\label{sec:methods}

Our novel method HapFABIA extracts short IBD
segments that are tagged by rare variants from large sequencing data.
In the first stage, HapFABIA applies FABIA biclustering to phased or unphased
genotype data. Biclustering extracts similarities between
individuals based on a subset of SNVs, but does not
consider that IBD segments consist of contiguous
nucleotides.
In the second stage, HapFABIA extracts IBD segments
from FABIA models by considering local tagSNV accumulations.
SNVs that tag an IBD segment are within this segment
and, therefore, accumulate locally.
It is important for the second stage that the SNVs which are
extracted in the first step are independent of their DNA location.
This justifies to use statistical methods for identifying local SNV accumulations
in FABIA models which would not be expected randomly.
Finally, HapFABIA prunes
spurious correlated SNVs from the extracted IBD segments and joins segments.
The two HapFABIA steps (biclustering and IBD segment extraction)
are described in the next two subsections.


\subsection{FABIA for genotype data}
\label{sec:fabia}

In the following, we describe the first step of HapFABIA.
We propose identifying similarities between individuals by
biclustering.
Biclustering simultaneously clusters rows and columns of a matrix.
More specifically, it clusters row elements that
are similar to each other on a subset of column elements.
An IBD segment corresponds to a bicluster because
individuals that possess the IBD segment are similar to each other at
this segment.
The similarity is given by identical minor alleles of tagSNVs.
Fig.~\ref{fig:bicluster} depicts a bicluster identified in
genotype data.


We employ the
``Factor Analysis for Bicluster Acquisition'' (FABIA)
biclustering model \cite{Hochreiter:10}.
In contrast to other biclustering methods such as BIMAX
\cite{Prelic:06} and QUBIC \cite{Li:09qubic}, FABIA can represent
homozygous regions (two times the same IBD segment in one diploid individual)
by its multiplicative bicluster model.
Furthermore, FABIA can represent overlapping IBD segments (a different IBD
segment on each chromosome)
by its additive biclusters.
FABIA can be applied to discrete phased or unphased genotype data but also to real values
that correspond to minor allele likelihoods or the minor allele dosages.
We use FABIA not only because it is well suited for genotyping data, but also
because it outperformed other biclustering methods
in extensive comparisons on different data sets \cite{Hochreiter:10}.


\subsubsection{FABIA describes genotype data by IBD segments}


FABIA describes an IBD segment in genotype data $\X$
by an outer product $\z \ \la^T$ of two vectors $\la$ and $\z$,
where the vector $\la$ indicates tagSNVs by non-zero values and the
vector $\z$ indicates individuals that possess the IBD segment.
FABIA can represent a
homozygous region of an IBD segment by $z=2$, that is, two occurrences
of an IBD segment in one diploid individual.
Fig.\ \ref{fig:vectors} visualizes this description of a genotype
matrix by one IBD segment as an outer product.


A diploid individual may possess two
IBD segments at a particular locus where genotyping
sums up the occurrences of minor alleles.
This fact is reflected by the additive FABIA model
which sums up bicluster contributions.
If we assume genotyping errors which are accounted for by $\Up$,
the FABIA model for genotype data $\X$ is
\begin{align}
\label{eq:expgen}
\X \ &= \ \sum_{i=1}^{p} \z_i \ \la_i^T \ + \ \Up \ = \  \Z \ \La \ +
\ \Up \ ,
\end{align}
where $\X \in \Rb^{l \times n}$ is the genotyping data; $\Z \in \Rb^{l
  \times p}$ is the matrix that indicates which individuals possess an
IBD segment; $\La \in \Rb^{p \times n}$ indicates IBD segment tagSNVs;
$\Up \in \Rb^{l \times n}$ is an additive noise term; $n$ is the number
of SNVs; $l$ is the number of individuals (or chromosomes for phased
genotypes); $p$ is the number of IBD segments;
$\la_i  \in \Rb^n$ is tagSNV indicator vector for the $i$-th IBD segment (the $i$-th
row of $\La$);
and $\z_i  \in \Rb^l$ corresponds to the number of times each of the $l$ individuals
contains the $i$-th IBD segment (the $i$-th column of $\Z$).
The additive noise $\Up$ not only covers genotyping errors but also
genotypes which cannot be explained by IBD segments.
Such unexplained genotypes may arise from recently acquired SNVs,
segments contained in only one individual, and
IBD segments that are too short, or segments that are shared by too few individuals
to be called.

According to Eq.~\eqref{eq:expgen}, the $j$-th genotype $\x_j$, i.e.,
the $j$-th column of $\X$, is
\begin{align}
\label{eq:expgen2}
\x_j\ = \ \sum_{i=1}^{p} \la_i \ z_{ij} \ + \ \epp_j \ = \ \La \
\tilde\z_j\ + \ \epp_j \ ,
\end{align}
where $\epp_j$ is the $j$-th column of the noise matrix $\Up$ and
$\tilde\z_j=(z_{1j},\dots,z_{pj})^T$ indicates which
IBD segments are present in genotype $\x_j$ ($j$-th column of
the matrix $\Z$ with length equal to the number of IBD segments $p$).
Recall that $\z_i=(z_{i1},\dots,z_{il})^T$ in Eq.~\eqref{eq:expgen}
corresponds to the number of times each of the $l$ individuals
contains the $i$-th IBD segment.


If we drop the index $j$ which indicates the individual,
the formulation in Eq.~\eqref{eq:expgen2} facilitates a generative
interpretation by a factor analysis model with $p$ factors:
\begin{align}
\label{eq:factormodel}
\x \ = \ \sum_{i=1}^{p} \la_i \ \tilde z_i  \ + \
    \epp \ = \ \La \ \tilde\z \ + \
    \epp \ ,
\end{align}
where $\x \in \Rb^n$ is the observed genotype.
The vector of
factors $\tilde\z=(\tilde z_1,\ldots,\tilde z_p)^T$
indicates which
IBD segments are present in genotype $\x$, where
component $\tilde z_i$ indicates how often the individual with
genotype $x$ possesses the $i$-th IBD segment.
$\epp \in \Rb^{n}$ is the additive noise.



As illustrated in Fig.\ \ref{fig:vectors},
both the vector $\z_i$
and the vector $\la_i$ should be sparse
to describe an IBD segment.
Sparse $\z_i$ means that only few individuals
possess the IBD segment, which implies rare tagSNVs.
Sparse $\la_i$ means that only few SNVs are tagSNVs, which implies
short IBD segments.
Sparse $\z_i$ can be achieved if components $z_{ij}$ are sparse,
that is, if  the vector of factors $\tilde \z_j$ in
Eq.~\eqref{eq:expgen2} is sparse or, equivalently,
the vector $\tilde\z$ in Eq.~\eqref{eq:factormodel}
is sparse.
In contrast to standard factor analysis, FABIA's model selection is
tailored to sparse factors and sparse
loadings, which are essential for IBD detection.
Sparseness in the FABIA model is obtained by a
component-wise independent Laplace
distribution both for the prior on the
parameters $\la_i$ and for the
distribution of the factors $\tilde\z$ \cite{Hyvarinen:99a}:
\begin{align} \label{eq:laplaceprior1}
p(\tilde \z) \ = \ \left( {\textstyle\frac{1}{\sqrt{2}}} \right)^p
\prod_{i=1}^{p}
    e^{-\ \sqrt{2} \ |\tilde z_i|} \\ \label{eq:laplaceprior2}
p(\la_i) \ = \ \left( {\textstyle\frac{1}{\sqrt{2}}} \right)^n
\prod_{k=1}^{n} e^{-\ \sqrt{2} \
|\lambda_{ki}|} \ .
\end{align}


The Laplace distribution of the factors leads to an analytically
intractable likelihood:
\begin{align}  \label{eq:likelihood}
p( \x \mid \La, \pp ) \ = \ \int p( \x \mid \tilde\z, \La, \pp) \ p(
\tilde\z) \ d\tilde\z \ .
\end{align}
Therefore, the model selection of FABIA is performed by means of
variational expectation maximization, which is
a variational optimization in
the expectation maximization (EM) framework
to maximize the posterior of the
parameters \cite{Girolami:01,Palmer:06,Hochreiter:10,Clevert:11,Klambauer:12}.
The idea of the variational approach is to express the prior $p(\tilde\z)$ by
the maximum
\begin{align}
 p(\tilde\z) \ = \ \underset{\xii}{\max} \ p(\tilde\z \mid \xii)
\end{align}
over a model family $p(\tilde\z \mid \xii)$
that is parametrized
by the variational parameter $\xii$
or by scale mixtures
\begin{align}
 p(\tilde\z) \ = \ \int p(\tilde\z \mid \xii) \ d\mu(\xii) \ .
\end{align}
A Laplace distribution can be expressed exactly by the maximum of a
Gaussian family or by Gaussian scale
mixtures \cite{Girolami:01,Palmer:06}.
Therefore, for each $\x$, the maximum $\hat \xii$ of the variational parameter
$\xii$ allows for representing the Laplacian prior by a Gaussian:
\begin{align}
 \hat \xii \ = \ \underset{\xii}{\arg\max} \ p(\xii \mid \x) \ .
\end{align}
The maximum $\hat \xii$ can be computed analytically
(see Eq.~\eqref{eq:updateVar} below)
because for each Gaussian
the likelihood Eq.~\eqref{eq:likelihood} can be computed analytically.



If we denote the $j$-th genotype by $\x_j \in \Rb^n$
with corresponding factors $\z_j \in \Rb^p$, then
we obtain the following variational E-step \cite{Hochreiter:10}:
\begin{align} \label{eq:updateE1}
\E \big(\z_j   \mid \x_j \big) \ &= \  \big( \La^T \
  \pp^{-1} \ \La \
  + \  \Xii_j^{-1} \big)^{-1} \ \La^T \  \pp^{-1} \ \x_j \ , \\ \label{eq:updateE2}
\E \big(\z_j \ \z_j^T  \mid  \x_j \big) \ &= \  \big(
\La^T \
  \pp^{-1} \ \La \
  + \  \Xii_j^{-1} \big)^{-1} \ + \
 \E \big(\z_j   \mid  \x_j \big)  \ \E (\z_j  \mid
\x_j)^T \ ,
\end{align}
where $\Xii_j$ means $\mathrm{diag} \left(\xii_j \right)$. The
update for the variational parameter $\xii_j$ is
\begin{align}
\label{eq:updateVar}
\xii_{j} \ &= \  \mathrm{diag}\left( \sqrt{\E (\z_j \
\z_j^T \mid \x_j)} \right)\ .
\end{align}
The variational M-step is \cite{Hochreiter:10}
\begin{align} \label{eq:updateM1}
\La^{\nn} \ &= \  \frac{\frac{1}{l} \ \sum_{j=1}^{l} \x_j \ \E
(\z_j \mid  \x_j)^T  \ - \ \frac{\alpha}{l} \  \pp \ \sign
(\La
  )}{\frac{1}{l} \sum_{j=1}^{l} \E (\z_j \ \z_j^T \mid \x_j
  )} \\ \label{eq:updateM2}
\mathrm{diag} \left(\pp^{\nn}  \right) \ &= \ \pp^{\mathrm{EM}} \ +
\ \mathrm{diag} \Big(\frac{\alpha}{l} \  \pp \ \sign (\La
  ) ( \La^{\nn} )^T \Big)\ , \\ \label{eq:updateM3}
\pp^{\mathrm{EM}} \ &= \ \mathrm{diag} \bigg( \frac{1}{l}
\sum_{j=1}^l \x_j  \ \x_j^T \ - \ \La^{\nn}  \frac{1}{l} \sum_{j=1}^l
 \E \left(  \z_j   \mid  \x_j \right) \ \x_{j}^{T} \bigg) .
\end{align}
The parameter $\alpha$ controls the degree of sparseness (an
expectation of how rare the SNVs that tag the IBD segment are)
and can be
introduced as a parameter of
the Laplacian prior of the factors \cite{Hochreiter:10}.

Note that the number of bicluster need not be determined a priori if
$p$ is chosen large enough. The sparseness constraint will remove
spurious biclusters by setting $\la$ to the zero vector. In this way,
FABIA automatically determines the number of biclusters.


\subsubsection{Adaptation of FABIA for IBD detection}

Since an entry in the genotype matrix $\X$
reports how often the minor
allele is present, FABIA must
explain occurrences of minor alleles by IBD segments.
The genotype matrix $\X$ is non-negative as
are both the indicator matrix of tagSNVs $\La$
and indicator matrix of IBD segments in individuals $\Z$.
Regarding these constraints,
we modified FABIA to enforce non-negative
loadings $\La$.
If, for individual $j$,
both genotype data $\x_j$ and $\La$ are non-negative, the posterior mean
$\E \big(\z_j   \mid \x_j \big)$ of $\z_j$ is non-negative, too.
Consequently, the new loading matrix
$\La^{\nn}$ is non-negative according to Eq.~\eqref{eq:updateM1}.
Note that the prior term $\frac{\alpha}{l} \  \pp \ \sign (\La)$ in the
update rule Eq.~\eqref{eq:updateM1} is not allowed to change
the signs of the loadings $\La$ in one update step.
Therefore, it is sufficient to initialize $\La$ by positive values
to enforce non-negative factors and loadings.
$\Z$ is estimated by the posterior mean $\E \big(\z_j   \mid \x_j \big)$
and is used to identify individuals that possess an IBD segment.

To allow IBD segment detection in large sequencing data,
we developed a specialized sparse matrix algebra.  This sparse matrix algebra
only considers non-zero values with their indices for vector and
matrix computations.
In  Eqs.~\eqref{eq:updateE1}-\eqref{eq:updateM3} the values of
the genotype vectors $\x_j$ and the values of the
loading matrix $\La$ are sparse. Our sparse matrix algebra
not only allows for multiplying a sparse vector by a dense vector (as
usual in sparse matrix computations), but also for multiplying two sparse
vectors.

To further speed up the computation, we extended FABIA to an iterative
version, where, in each iteration, $p$ biclusters are detected.
These $p$ biclusters are removed from the genotype matrix $\X$ before
starting the next iteration.


The vectors $\la_i$ and $\z_i$ acquire a new
interpretation when detecting short IBD segments.
Components of $\la_i$ indicate to which degree tagSNVs tag the $i$-th
IBD segment. In particular, these components measure how many
individuals possess the IBD segment (more precisely the corresponding tagSNV).
Components of $\z_i$ indicate how often a specific IBD segment is
present in one multiploid individual.
Additionally components of $\z_i$ also indicate how well an individual
segment matches an IBD segment as individual segments may contain
genotyping errors or may be broken up during meiosis.



\subsection{Extraction of IBD segments from FABIA models}
\label{sec:extract}

FABIA biclustering does not consider that an IBD segment
consists of contiguous nucleotides.
This essential information is incorporated into HapFABIA in the second
stage.
SNVs that tag an IBD segment are locally accumulated.
FABIA may have accidentally merged separated IBD segments that are shared by the same
individuals or IBD segments that are located on different
chromosomes.
The second stage, therefore disentangles falsely merged
IBD segments, prunes IBD
segments from spurious SNVs, and finally joins parts of single long IBD
segments.



As mentioned above,
FABIA biclustering does not regard the order of SNVs or individuals,
thus random shuffling of SNVs does not change its result.
Therefore, randomly correlated SNVs that are found by FABIA would be
uniformly distributed along the chromosome.
However, SNVs that are correlated because they tag an IBD segment
agglomerate locally in this segment.
Deviations from the null hypothesis of uniformly distributed SNVs can
be detected by a binomial test for the number of expected SNVs within an
interval if the minor allele frequency of SNVs is known.
A low $p$-value hints at local agglomerations of bicluster SNVs
stemming from an IBD segment.


We propose a four-step procedure to extract IBD segments from FABIA
models:
\begin{enumerate}
\item
Identify local agglomerations of correlated SNVs based on a
binomial test;
\item
Disentangle IBD segments and re-assigning individuals or chromosomes
to IBD segments;
\item
Prune IBD segments off SNVs with spuriously correlations based
on an exponential test for a long genomic distance;
\item
Join similar IBD segments stemming from long IBD segments
that were divided by the bins at the first step.
\end{enumerate}

{\bf Step 1:}
FABIA model selection is independent of the
order of the SNVs.
Therefore, spuriously correlated SNVs are unlikely to
agglomerate at a DNA locus, whereas tagSNVs do, as they are within
an IBD segment.
To detect agglomerations, we compute histogram counts of FABIA model SNVs
within bins where bins with
large counts are assumed to contain IBD segments.
For computing the histogram of counts of FABIA model SNVs, we consider
those SNVs for which the FABIA model parameter $\la_i$ is largest
(threshold ``Lt'' with 10\% being the default value).
Large $\la$ values ensure IBD segments that are shared by multiple
individuals. These segments are therefore more reliable.
The HapFABIA parameter ``IBDlength''
determines the maximal length of IBD segments.
The histogram bin size in number of SNVs (all SNVs and not only model
SNVs) is computed from ``IBDlength''
using the average genomic distance between adjacent SNVs.
To account for IBD segments that exceed the borders of the bins,
the histogram is
computed a second time with bins shifted by half the bin size.

The histogram bins with more model SNVs than expected by chance
are assumed to contain IBD segments.
We select bins for which the model SNV count
exceeds the expected value by a binomial test for random counts.
We need to compute how many model SNVs are expected in a bin if they
are random and not in an IBD segment. Thus, we have to compute the
probability of observing $k$ or more bin counts by chance.
Let $p$ be the probability of a random
minor allele match between $t$ individuals.
If $n$ SNVs are in a bin,
the probability of observing $k$ model SNVs by chance
is given by one minus the binomial distribution $F(k;n,p)$:
\begin{align}
\label{eq:binomial}
1 \ - \ F(k-1;n,p) \ &= \ \Pr(K \geq k) \ = \ \sum_{i=k}^n \ {n\choose i} \
p^i \ (1-p)^{n-i}  \ .
\end{align}
If $q$ is the minor allele frequency (MAF) for one SNV,
the probability $p$ of observing the minor allele of this SNV in
all $t$ individuals is $p=q^t$.
We assumed that all SNVs have the same MAF $q$ --- in the
experiments we used the average MAF.
For $b$ bins, the probability of observing $k$ or more counts of model
SNVs by chance in at least one bin is
\begin{align}
\label{eq:counts}
b \ {l\choose t} \ \sum_{i=k}^n \ {n\choose i} \
q^{it} \ \left(1-q^t \right)^{n-i} \ ,
\end{align}
where $l$ is the number of individuals and  ${l\choose t}$ is the
number of possibilities to chose $t$ individuals from the $l$ individuals.
If the probability in Eq.~\eqref{eq:counts} is below the threshold ``thresCount'',
the according bin is selected for IBD segment extraction because more
FABIA model SNVs are in this bin than expected by chance.
If $k_{\textrm{min}}$ is the minimum $k$ for which
Eq.~\eqref{eq:counts} is below the threshold ``thresCount'', then all
bins with model SNV counts $k \geq k_{\textrm{min}}$ are selected.
In our experiments, we allow IBD segments of only two individuals
(standard IBD), and therefore set $t=2$.



If a bin is selected, SNVs and individuals must be assigned to it.
Note that bicluster memberships of FABIA biclusters cannot be used directly
because they include all bins and therefore different IBD segments.
First, those model SNVs that contributed to the count of the selected bin are assigned to it.
Then, individuals or chromosomes are assigned to the selected bin
if they possess a minor allele at one or more SNVs that have been assigned
to the bin.
Individuals are only chosen from the top $z$-values of the FABIA model
to ensure that assigned individuals are indeed similar to each other.
The parameter ``Zt'' (default 0.2)
gives the percentage of top $z$-scores that are considered.





{\bf Step 2:}
In this step, IBD segments in a selected bin are disentangled, where only
SNVs and individuals are considered that have been assigned to the bin.
An IBD segment is initialized by
two core individuals that are identical at $m$ or more minor alleles.
The number $m$ is computed as $m = \textrm{mintagSNVsFactor} \times
k_{\textrm{min}}$.
All individuals that are identical in at least $m$ minor alleles
to one of the two IBD core individuals are
classified as possessing the IBD segment.
The tagSNVs of this IBD segment are
model SNVs that have their minor allele in at least 2
individuals that possess the IBD segment.

Step 2 is repeated after removing the current IBD segment by
deleting the segment's tagSNVs
until no more core individuals are found.


{\bf Step 3:} This step prunes IBD segment borders of SNVs that have
spurious correlations to the IBD segment.
Spurious correlations may still be present in a bin leading to an
overestimation of the segment length.
Such SNVs can be identified by deviations of their MAFs
from those of other tagSNVs.
However, this criterion is not reliable for rare SNVs.
Therefore, we identify SNVs with spurious correlations to an IBD segment
on the basis of unusually large distances to other tagSNVs.
The deviation from an expected distance
is quantified by means of an exponential distribution with the median distance
between tagSNVs as parameter.
SNVs with distances leading to $p$-values below 1e-3 are removed.
The two furthest upstream and the two furthest downstream tagSNVs
are tested for their distances to other
tagSNVs. If the second-furthest
up- or downstream tagSNV is removed, then
the furthest up- or downstream tagSNV is removed, too.

{\bf Step 4:}
IBD segments which are very similar to each other are merged.
In this way, long IBD segments that were divided by the bins into smaller
parts are reconstructed.
Note that IBD segments longer than given by ``IBDlength'' can be detected.
In order to compute similarities, we
assess how many tagSNVs and individuals of the smaller IBD segment are
explained by the larger IBD segment. This criterion is expressed by the
``overlap coefficient''
\begin{align}
\label{eq:dmin}
O(A,B) \ &= \ {{|A \cap B|}\over{\min\{|A|,|B|\}}} \ .
\end{align}
Using the overlap coefficient for both tagSNVs and individuals,
we define a distance-like measure between
IBD segments $\mathrm{IBD}_1$ and $\mathrm{IBD}_2$ by
\begin{align}
\label{eq:dist}
D(\mathrm{IBD}_1,\mathrm{IBD}_2) \ &= \ 1 \ - \ O(S_{\mathrm{IBD}_1},S_{\mathrm{IBD}_2}) \ O(I_{\mathrm{IBD}_1},I_{\mathrm{IBD}_2}) \ ,
\end{align}
where $S_{\mathrm{IBD}_i}$ and $I_{\mathrm{IBD}_i}$ are the tagSNVs
that tag IBD segment
$\mathrm{IBD}_i$ and individuals possessing IBD segment $\mathrm{IBD}_i$, respectively.
Using the measure $D$, IBD segments
are clustered by hierarchical clustering using complete linkage.
IBD segments are merged if their segments
are clustered together below a cutting height of 0.8.



\subsection{Further Advantages of HapFABIA}
\label{sec:advant}

HapFABIA further has the following two advantages
over existing IBD detection methods:
\begin{enumerate}
\item
If an IBD segment is known, but its tagSNVs are unknown,
existing IBD detections methods have still difficulties to
identify IBD sharing among individuals.
Most tagSNVs are separated by common, private, or random SNVs,
but also by tagSNVs of other IBD segments. Moreover, the distances
between tagSNVs of an IBD segment vary much, both in terms of
genomic distance and the number of tagSNVs between them. These complex
data characteristics impair the detecting abilities of existing IBD
detection methods. HapFABIA's biclustering approach, however, does not
consider the order of SNVs in first place, therefore, it is well
suited for detecting non-consecutive tagSNVs with varying distances
between them.
\item
HapFABIA is well suited both for phased and unphased genotype
data. It represents homozygous regions (i.e.\ two occurrences of an IBD
segment in one diploid individual) by means of its factor. Overlapping
IBD segments in a diploid individual (i.e.\ different IBD segments on
each chromosome) can be represented by the additive FABIA model. The
results of HapFABIA for phased and unphased genotypes hardly differ
for short IBD segments because homozygous regions are rarely
observed. Another important aspect is that HapFABIA can also be
applied to minor allele likelihoods or minor allele dosages. We
applied HapFABIA to data from the Korean Personal Genome Project
(KPGP, \url{http://opengenome.net}), which was merged with the 1000
Genomes Project data. In this analysis we only used unphased genotype data
and re-discovered the IBD segments which were already found
in the 1000 Genomes Project data. Additionally, we discovered more Asian specific
IBD segments. Results can be found at
\url{http://www.bioinf.jku.at/research/short-IBD}.
\end{enumerate}


\section{Tools to Analyze \Rpackage{fabia} Results}
\label{sec:tools}

To analyze the \Rpackage{fabia} results we provide some
functions. This might be convenient if parameters are optimized
for a specific data set.

Accumulations of \Rpackage{fabia} loadings can be
given as histogram counts to see
locations of accumulations:
<<histograms>>=
data(res)
h1 <- histL(res,n=1,p=0.9,w=NULL,intervv=50,off=0)
print(h1$counts)
h1 <- histL(res,n=1,p=NULL,w=0.5,intervv=50,off=0)
print(h1$counts)
@


\Rpackage{fabia} loadings can be plotted to identify locations of
accumulations:
<<plotLhistP,fig=TRUE>>=
data(res)
plotL(res,n=1,p=0.95,w=NULL,type="histogram",intervv=50,off=0,t="p",cex=1)
@
<<plotLpointsP,fig=TRUE>>=
data(res)
plotL(res,n=1,p=0.95,w=NULL,type="points",intervv=50,off=0,t="p",cex=1)
@
<<plotLpointsW,fig=TRUE>>=
data(res)
plotL(res,n=1,p=NULL,w=0.5,type="histogram",intervv=50,off=0,t="p",cex=1)
@
<<plotLsmoothP,fig=TRUE>>=
data(res)
plotL(res,n=1,p=0.95,w=NULL,type="smooth",intervv=50,off=0,t="p",cex=1)
@
<<plotLsmoothW,fig=TRUE>>=
data(res)
plotL(res,n=1,p=NULL,w=0.5,type="smooth",intervv=50,off=0,t="p",cex=1)
@


Finally the largest \Rpackage{fabia} loadings $L$ and factors $Z$
can be listed. The largest values must exceed a threshold either
given by quantile $p$ or a value $w$:
<<topLZ>>=
data(res)

topLZ(res,n=1,LZ="L",indices=TRUE,p=0.95,w=NULL)
topLZ(res,n=1,LZ="L",indices=TRUE,p=NULL,w=0.95)

topLZ(res,n=1,LZ="Z",indices=TRUE,p=0.95,w=NULL)
topLZ(res,n=1,LZ="Z",indices=TRUE,p=NULL,w=0.4)

topLZ(res,n=1,LZ="L",indices=FALSE,p=0.95,w=NULL)
topLZ(res,n=1,LZ="L",indices=FALSE,p=NULL,w=0.95)

topLZ(res,1,LZ="Z",indices=FALSE,p=0.95,w=NULL)
topLZ(res,1,LZ="Z",indices=FALSE,p=NULL,w=0.4)
@



\bibliographystyle{natbib}

\bibliography{ibd}




\end{document}