\documentclass[a4paper]{article}
\pagestyle{myheadings}
\usepackage{haskell}
\begin{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Main}

This implements a complete multiple-matching clustering algorithm. 

There are several output options, defined in different {\tt
main}-functions.  The most useful are probably

\begin{itemize}
\item {\tt main\_labels (-L)} print labels, one cluster on each line
\item {\tt main\_ug (-U)} output clusters in UniGene style
\item {\tt main\_matches (-M)} output each match (takes much less space)
\end{itemize}

\subsection{Imports}

Module declaration.  TODO: Haddockize it.

\begin{code}

module Main where
import Gene
import EST    (EST(..))
import Fasta  (parse, bmfparser, ugparser,FHParser,readDataLower)
import Suffix (Suffix(..), suffixesBy)
import Pairs  (pairs, matches, sort_io, sort_pure, 
	       show_header,show_regs,prefixes,merge_suffix_lists)
import Cluster(cluster_simple,cluster_all,
	       newick,labels,unigene)    -- output
import Stats
import Clix   (clix)

import Char   (isDigit)
import List   (isPrefixOf)
import System
import System.Console.GetOpt
import System.IO     (hPutStrLn,stderr) -- for usage
type ParseF = Fasta.FHParser

\end{code}

\subsection{Main}

The {\tt main} function is just a simple wrapper.

\begin{code}

main :: IO ()
main = do
    -- read k, n and file names from args
    args <- getArgs
    let (opt,non,err) = getOpt Permute options args 
    if err == [] && length non == 1 && validate opt 
	then main_real1 (parseOpts opt) (head non)
	else usage err

\end{code}
\newpage
\subsection{The "real" main}

Where all the hard work is done...

\begin{code}

main_real1 :: Params -> String -> IO ()
main_real1 params file = do
    let (k,n) = (kval params,nval params)
    indata <- readFile file
    let es = parse (if upper_only params then ugparser else bmfparser) indata
    let sa = mkSufList (kval params) (match_only params) (preflen params) es
    let ms = matches sa
    let csizes = map (fromIntegral . length) sa
    let output = writer params
    -- make sure nothing needs be retained when matches are calculated
    case oflag params of 
        Stats -> output ("PARAMETERS: k="++show k++", n="++show n 
		   -- ++"\nSequences: "++ show (length es))
		   ++"\nCliques statistics:\n"
		   ++ (show $ Stats.uniVar csizes) ++"\n"
		   -- ++ (show $ Stats.quantiles csizes) ++ "\n"
		   )
	_ -> do
            let sort_fn = if extsort params 
			  then sort_io es (file++"-"++show k ++
			       concatMap show (match_only params))
			  else (return . sort_pure)
	    sms <- case collect params of
 		       Just x -> merge_suffix_lists es x (file++"-"++show k)
  		       Nothing -> sort_fn ms
	    if match_only params /= [] then (sms `seq` return ())
	      else do
	        let ps = pairs (noLCR params) sms
	        case oflag params of
		    Matches -> output (unlines $ map show sms)
		    Pairs   -> output $ unlines $ map show ps
		    PHead   -> output $ unlines $ map show_header ps
		    PRegs   -> output $ unlines $ map show_regs ps
		    Tree    -> output $ unlines $ map newick  
			       $ cluster_all (addSgl params) es n ps
		    UG      -> output $ unlines $ map unigene 
			       $ cluster_all (addSgl params) es n ps
		    Labels  -> output $ unlines $ filter (/="") $ map labels
			       $ cluster_simple (addSgl params) es n ps
		    _       -> error "Unsupported output parameter!\n"

\end{code}
\subsection{Helper functions}

A function (there used to be more, but what can I say?) useful for
{\tt main}

\begin{code}

-- | create the suffix list, with matching scores, by prefix
mkSufList    :: Int -> [Gene] -> Int -> [EST] -> [[Suffix]]
mkSufList k ms p ests = concatMap (clix k) 
	 [[concatMap (suffixesBy ss) ests] |
	  ss <- filter (ms `isPrefixOf`) $ prefixes p]

\end{code}

\newpage
\subsection{Option handling}

Data structures and code for dealing with options (GetOpt style).
(Should we make the Params data structure a grammar only allowing
legal combinations of parameters?)

\begin{code}

data OFlag = UG | Labels | Stats | Pairs | Tree | Matches | PHead | PRegs
	     deriving Eq

data Params = Params { kval, nval, preflen :: Int
		     , oflag      :: OFlag
		     , upper_only :: Bool
		     , extsort    :: Bool
		     , noLCR      :: Bool
		     , addSgl     :: Bool
		     , writer     :: String -> IO ()
		     , match_only :: [Gene]
		     , collect    :: Maybe Int
--	      , ifile   :: String -- [FilePath]?
		     }

default_par = Params { preflen = 1, nval = 65, oflag = Labels
		     , extsort = False, upper_only = False 
		     , match_only = [], collect = Nothing
		     , writer = putStr, noLCR = False, addSgl = False
		     , kval = 20
		     }

data Flag = KVal String
	  | NVal String
	  | PrefLen String
	  | MatchOnly String
	  | Collect String
	  | InUG
	  | AS
	  | NoLCR
	  | ExtSort
	  | OutMatch | OutUG | OutLab | OutStats | OutPairs | OutRegs
	  | OutTree | OutPHead | OutFile String deriving Show

validate :: [Flag] -> Bool
validate args =  and (map validate1 args) && validate2 args
    where validate1 arg = case arg of
			KVal s -> (and $ map isDigit s)
			NVal s -> (and $ map isDigit s)
			PrefLen s -> (and $ map isDigit s)
			MatchOnly s -> (and $ map (`elem` "ACGTacgt") s)
			Collect s -> (and $ map isDigit s)
			_ -> True
	  validate2 _args = True -- todo: check if consistent

usage :: [String] -> IO ()
usage errs = hPutStrLn stderr (usageInfo (concat errs ++ usageString) options)

usageString :: String
usageString = 
      "\nUsage: xsact -k K [-n N] [-p PL] [-x]" ++
      " [-M|-U|-L|-S|-P|-H|-N] [-u] [-i] [-s] [-o FILE] FILE\n" ++
      "Or for all PFXes: xsact -k K [-p PL] -x -m PFX FILE\n" ++
      "Followed by:      xsact -k K [-n N] -c PFXLEN -x" ++
      " [-M|-U|-L|-S|-P|-N] [-u] [-i] [-s] [-o FILE] FILE\n"

\end{code}
\newpage

\subsection{Option definitions}

Data for option definitions, as required by GetOpt.

\begin{code}

options :: [OptDescr Flag]
--           short long           result                description
options = [
    -- algorithm options
      Option ['k'] ["block-size"] (ReqArg KVal "INT")  
      "Matching block size (default is 16)"
    , Option ['n'] ["threshold"]     (ReqArg NVal "INT")  
      "Required clustering threshold"
    , Option ['p'] ["prefix-length"] (ReqArg PrefLen "INT")
      ("Prefix length defining suffix blocks "++
      "(increase to reduce memory footprint, decrease to increase speed)")
    , Option ['x'] ["external-sort"] (NoArg ExtSort) 
      "use external sorting of matches, reducing memory footprint"
    -- input options
    , Option ['u'] ["input-upper"] (NoArg InUG)
      "accept only upper case characters"
    , Option ['m'] ["match-only"] (ReqArg MatchOnly "STR")
      "only generate matches starting with PREFIX"
    , Option ['c'] ["collect"] (ReqArg Collect "INT")
      "collect separately generated matches"
    , Option ['i'] ["dont-ignore-lcr"] (NoArg NoLCR)
      "don't ignore low complexity regions"
    -- output options
    , Option ['s'] ["add-singletons"] (NoArg AS)
      "include singleton clusters in the output"
    , Option ['M'] ["output-matches"] (NoArg OutMatch)
      "output all matches detected"
    , Option ['N'] ["output-newick"] (NoArg OutTree)  
      "output a newick-formatted tree"
    , Option ['U'] ["output-unigene"] (NoArg OutUG)  
      "output clustering UniGene style"
    , Option ['L'] ["output-labels"] (NoArg OutLab)  
      "output clustering as a list of labels"
    , Option ['S'] ["output-statistics"] (NoArg OutStats)  
      "output statistics"
    , Option ['P'] ["output-pairs"] (NoArg OutPairs)  
      "output sequence pairs with match information (slow for large data sets)"
    , Option ['H'] ["output-headers"] (NoArg OutPHead)  
      "output sequence pair headers (fast alternative to -P)"
    , Option ['R'] ["output-regions"] (NoArg OutRegs)
      "output matching regions for each sequence" 
    -- output file
    , Option ['o'] ["output"] (ReqArg OutFile "FILE") 
      "output file name (default is stdout)"
    ]

-- return values for k and n, and choices of main and output functions
parseOpts :: [Flag] -> Params
parseOpts = foldr po default_par
    where
    po f params = case f of
	       OutFile fn  -> params {writer = writeFile fn}
	       InUG        -> params {upper_only = True}
	       NoLCR       -> params {noLCR = True}
	       AS          -> params {addSgl = True}
	       OutMatch    -> params {oflag = Matches}
	       OutTree     -> params {oflag = Tree}
	       OutUG       -> params {oflag = UG}
	       OutLab      -> params {oflag = Labels}
	       OutStats    -> params {oflag = Stats}
	       OutPairs    -> params {oflag = Pairs}
	       OutPHead    -> params {oflag = PHead}
	       OutRegs     -> params {oflag = PRegs}
	       KVal s      -> params {kval = read s}
	       NVal s      -> params {nval = read s}
	       PrefLen p   -> params {preflen = read p}
	       ExtSort     -> params {extsort = True}
	       MatchOnly s -> let p = params {match_only = readDataLower s,
					      extsort = True}
				  in if preflen p < length s
				     then p { preflen = length s } else p
	       Collect i -> params {collect = Just (read i)}

\end{code}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}