\section{MaskCount} Examine each nucleotide from two data sets, and count the Ns (or Xs?). Separate into A AB B 0. Also (optionally?) count masked region sizes to generate gnuplot'able histograms. TODO: - parameter to specify lower case/N/X as masked - justify columns in output - output percentages \begin{code} module Main where import Bio.Sequence.Fasta import Bio.Sequence.HashWord (isN) import Char (isLower) import System (getArgs,exitWith,ExitCode(..)) import Data.List (foldl',intersperse,partition) import Control.Monad import TextFormat import Text.Printf main :: IO () main = do (opts,[a,b]) <- parseArgs as <- readSeqs a bs <- readSeqs b let res = zipComp as bs when (elem 'l' opts) (do putStrLn ("# fraction of masked positions: "++a++" both "++b++"none") putStrLn $ unlines $ map printres res) when (elem 's' opts) (do putStrLn "# Summary:" putStrLn $ printSum a b $ foldl' add (CS 0 0 0 0) (map snd res)) parseArgs = do as <- getArgs let (opts,files) = partition ((=='-').head) as when (length files /= 2) usage let short = concat $ map tail opts return (short,files) usage = do putStrLn "mc: compare two FASTA files for masked nucleotides" putStrLn "usage: mc [-s|-l] file_1 file_2" exitWith $ ExitFailure 1 data CompStruct = CS !Int !Int !Int !Int add (CS a ab b o) (CS a' ab' b' o') = CS (a+a') (ab+ab') (b+b') (o+o') -- construct the percentage similar nucleotides printres (s,CS a ab b o) = let tot = a+ab+b+o in concat $ intersperse " " $ s : map (percent tot) [a,ab,b,o] printSum a b (CS ca cab cb co) = unlines (["Summary for "++a++" vs. "++b++":\n"] ++ columns " " [ map (" "++) ["unmasked in both:","masked in both:" ,"masked in "++a++":","masked in "++b++":"] , map show [co, cab, ca, cb] , map (paren . percent tot) [co,cab,ca,cb ]]) ++ "Pearson's phi: "++printf "%.3f" (phi :: Double) where tot = co+cab+ca+cb phi = let [a,b,ab,o] = map ((/(fromIntegral tot)) . fromIntegral) [ca,cb,cab,co] in (ab*o-a*b)/sqrt((ab+a)*(ab+b)*(a+o)*(b+o)) paren s = "("++s++"%)" percent tot x = printf "%.2f" (100*fromIntegral x/fromIntegral tot :: Double) zipComp :: [Sequence] -> [Sequence] -> [(String,CompStruct)] zipComp as bs = map (z1 (CS 0 0 0 0)) $ zipSeqs as bs z1 cs (a,b) = (unpack $ seqlabel a, foldl' collect cs $ zip (get a) (get b)) where get s = map isLower $ map (s!) [0..seqlength s-1] -- s/isLower/isN/ for N detect. zipSeqs :: [Sequence] -> [Sequence] -> [(Sequence,Sequence)] zipSeqs (a:as) (b:bs) = if seqlabel a == seqlabel b then (a,b):zipSeqs as bs else error ("sequence sets differ (offending sequences: '" ++ unpack (seqlabel a) ++ "' and '" ++ unpack (seqlabel b) ++"'.") zipSeqs [] [] = [] zipSeqs _ _ = error "extraneous sequences" collect (CS a ab b o) (True,True) = (CS a (ab+1) b o) collect (CS a ab b o) (True,False) = (CS (a+1) ab b o) collect (CS a ab b o) (False,True) = (CS a ab (b+1) o) collect (CS a ab b o) (False,False) = (CS a ab b (o+1)) \end{code}