Extract sequences matching a list of labels

\begin{code}
module Main where

import Data.Set (Set, empty, insert, member)
import System
import Util(foldl')
import IO

-- | Help text printed when the program is not given exactly two arguments.
usage :: String
usage = "xcerpt <ids> <fasta>\n\n"
     ++ "where <ids> is the name of a file containing the IDs\n"
     ++ "of sequences to extract from the <fasta> file.\n"
     ++ "The result ends up in <fasta>.match and <fasta>.rest"

-- | Entry point: expects the ID file and the sequence file as the only two
-- command-line arguments; anything else prints 'usage'.
main :: IO ()
main = do
  args <- System.getArgs
  case args of
    [idFile, seqFile] -> do
      d <- readFile idFile
      xcerpt (mkdict d) seqFile
    _ -> putStrLn usage

-- | Build the set of wanted labels from the whitespace-separated words of
-- the ID file's contents.
mkdict :: String -> Set String
mkdict = foldl' (flip insert) empty . words

-- | Split the records of the file @input@ into @input.match@ (label is in
-- @dict@) and @input.rest@ (all other records).  Empty lines and lines
-- starting with @#@ are dropped before the records are parsed.
xcerpt :: Set String -> String -> IO ()
xcerpt dict input = do
  -- Open the input first, so that a missing input file does not leave
  -- freshly created, empty output files behind.
  i <- readFile input
  m <- openFile (input ++ ".match") WriteMode
  r <- openFile (input ++ ".rest") WriteMode
  xtr dict m r $ filter keep $ lines i
  where
    keep []      = False
    keep ('#':_) = False
    keep _       = True

-- | Write each record (a header line starting with @>@ plus the following
-- lines up to the next header) to the first handle when its label is a
-- member of the set, and to the second handle otherwise.  The label is the
-- first word of the header with the leading @>@ removed.  Both handles are
-- closed once the input is exhausted.
--
-- Calls 'error' when a record does not start with a @>@ header, or when a
-- single line is left over at the end of the input (even if that line is
-- itself a header).
xtr :: Set String -> Handle -> Handle -> [String] -> IO ()
xtr _ m r [] = do
  hClose m
  hClose r
xtr _ _ _ [x] = error ("Odd number of lines?\n" ++ x)
xtr d m r (l1:ls) =
  case l1 of
    ('>':_) -> do
      let label = case words l1 of
                    (w:_) -> drop 1 w
                    []    -> ""
          out = if label `member` d then m else r
          (body, rest) = break isHeader ls
      hPutStr out $ unlines (l1 : body)
      xtr d m r rest
    _ -> error ("Not a FASTA header:\n" ++ l1)
  where
    -- Total replacement for @(== '>') . head@.
    isHeader ('>':_) = True
    isHeader _       = False
\end{code}