\section{MaskCount}

Examine each nucleotide from two data sets, and count the Ns (or Xs?).
Separate into A AB B 0.  Also (optionally?) count masked region sizes
to generate gnuplot'able histograms.  

TODO:
   - parameter to specify lower case/N/X as masked
   - justify columns in output
   - output percentages

\begin{code}

module Main where

import Bio.Sequence.Fasta
import Bio.Sequence.HashWord (isN)
import Char (isLower)
import System (getArgs,exitWith,ExitCode(..))
import Data.List (foldl',intersperse,partition)
import Control.Monad
import TextFormat
import Text.Printf

main :: IO ()
main = do
       (opts,[a,b]) <- parseArgs
       as <- readSeqs a
       bs <- readSeqs b
       let res = zipComp as bs
       when (elem 'l' opts) 
         (do
          putStrLn ("# fraction of masked positions: "++a++" both "++b++"none")
          putStrLn $ unlines $ map printres res)
       when (elem 's' opts)
         (do
          putStrLn "# Summary:"
          putStrLn $ printSum a b $ foldl' add (CS 0 0 0 0) (map snd res))

parseArgs = do
       as <- getArgs
       let (opts,files) = partition ((=='-').head) as
       when (length files /= 2) usage
       let short = concat $ map tail opts
       return (short,files)

usage = do putStrLn "mc: compare two FASTA files for masked nucleotides"
           putStrLn "usage: mc [-s|-l] file_1 file_2"
           exitWith $ ExitFailure 1

data CompStruct = CS !Int !Int !Int !Int

add (CS a ab b o) (CS a' ab' b' o') = CS (a+a') (ab+ab') (b+b') (o+o')

-- construct the percentage similar nucleotides
printres (s,CS a ab b o) = let tot = a+ab+b+o
    in concat $ intersperse " " 
       $ s : map (percent tot) [a,ab,b,o]

printSum a b (CS ca cab cb co) = unlines (["Summary for "++a++" vs. "++b++":\n"]
   ++ columns " " [ map ("  "++) ["unmasked in both:","masked in both:"
                                 ,"masked in "++a++":","masked in "++b++":"]
                  , map show [co, cab, ca, cb]
                  , map (paren . percent tot) [co,cab,ca,cb ]])
   ++ "Pearson's phi: "++printf "%.3f" (phi :: Double)
  where tot = co+cab+ca+cb
        phi = let [a,b,ab,o] = map ((/(fromIntegral tot)) . fromIntegral) [ca,cb,cab,co]
              in (ab*o-a*b)/sqrt((ab+a)*(ab+b)*(a+o)*(b+o))

paren s = "("++s++"%)"
percent tot x = printf "%.2f" (100*fromIntegral x/fromIntegral tot :: Double)

zipComp :: [Sequence] -> [Sequence] -> [(String,CompStruct)]
zipComp as bs = map (z1 (CS 0 0 0 0)) $ zipSeqs as bs

z1 cs (a,b) = (unpack $ seqlabel a, foldl' collect cs $ zip (get a) (get b))
    where get s = map isLower $ map (s!) [0..seqlength s-1]
                 -- s/isLower/isN/ for N detect.

zipSeqs :: [Sequence] -> [Sequence] -> [(Sequence,Sequence)]
zipSeqs (a:as) (b:bs) = 
    if seqlabel a == seqlabel b then (a,b):zipSeqs as bs
    else error ("sequence sets differ (offending sequences: '" 
	     ++ unpack (seqlabel a) ++ "' and '" ++ unpack (seqlabel b) ++"'.")
zipSeqs [] [] = []
zipSeqs _ _ = error "extraneous sequences"

collect (CS a ab b o) (True,True)   = (CS a (ab+1) b o)
collect (CS a ab b o) (True,False)  = (CS (a+1) ab b o)
collect (CS a ab b o) (False,True)  = (CS a ab (b+1) o)
collect (CS a ab b o) (False,False) = (CS a ab b (o+1))

\end{code}