% $Id: Fasta.lhs 547 2004-01-26 12:33:16Z ketil $ \documentclass[a4paper]{article} \pagestyle{myheadings} \usepackage{haskell} \begin{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Reading EST Data} Parses ESTs on FASTA format, and creates corresponding EST data structures. \begin{code} module Fasta (parse, bmfparser, ugparser, FHParser,readDataLower) where import EST (EST(..), End(..), mkEST) import Gene -- data structure and revcompl import Char(isSpace) readUfile :: FilePath -> IO [EST] readUfile fn = do inp <- readFile fn return (parse ugparser inp) \end{code} \subsection {Parsing} The FHParser (FASTA header parser) contains functions for extracting the accession number (or whatever unique identifier you wish, really) and clone end from the header. \begin{code} data FHParser = FHP {getSeqNo :: String -> String ,getCloneEnd :: String -> End ,getData :: String -> [Gene] } ugparser, bmfparser :: FHParser ugparser = FHP (takeWhile (/=' ') . drop 1) (\s -> case dropIncl "clone_end=" s of ('3':_) -> Three' ('5':_) -> Five' _ -> Unknown) readData bmfparser = FHP (takeWhile (/=' ') . drop 1) (\s -> case dropIncl "cDNA " s of ('3':_) -> Three' ('5':_) -> Five' _ -> Unknown) readDataLower -- parses a list of clusters, returning the corresponding ESTs -- fixme: this can cause a stack overflow in some cases -- why? parse :: FHParser -> String -> [EST] parse p inp = zipWith (readEST p) [1..] ((ests . filter (\x->head x/='#') . filter (/="") . lines) inp) \end{code} \subsection{Processing} The functions that actually do the work. \begin{code} -- take a list of lines, partition it into a list of ESTs ests :: [String] -> [[String]] ests [] = [] ests inp = let (xs,rest) = splitWhen '>' inp in xs : ests rest -- split a list of lines when a line starts with x splitWhen :: Char -> [String] -> ([String],[String]) splitWhen _ [] = error "splitWhen can't split an empty list" splitWhen x (l:ls) | head l /= x = error ("Incorrect data format (expecting "++show x++"):"++ show l) | otherwise = (l:first, rest) where (first, rest) = mysplit [] ls mysplit acc [] = (reverse acc, []) mysplit acc ("":ms) = mysplit acc ms mysplit acc (m:ms) = if head m == x then (reverse acc, m:ms) else mysplit (m:acc) ms -- use a parser to extract information, and produce ESTs in both directions readEST :: FHParser -> Int -> [String] -> EST readEST _ _ [] = error "readEST cannot read empty data" readEST p i (hd:seqs) = if seqno==seqno && end==end then mkEST (i,seqno, end, (getData p) $ concat seqs) else error "Can't happen (it's just for strictness, dummy!)" where seqno = (getSeqNo p) hd end = (getCloneEnd p) hd -- readData - read sequence data from input, ignoring whitespace (\n) -- belongs in the Gene module, perhaps? -- Note that sometimes lower case means masked/clipped readDataLower, readData :: String -> [Gene] readDataLower = map read1 . filter (not.isSpace) where read1 x = case x of 'a' -> A 'c' -> C 'g' -> G 't' -> T 'A' -> A 'C' -> C 'G' -> G 'T' -> T _ -> N readData = map read1 . filter (not.isSpace) where read1 x = case x of 'A' -> A 'C' -> C 'G' -> G 'T' -> T _ -> N dropUntil :: ([a] -> Bool) -> [a] -> [a] dropUntil _ [] = [] dropUntil p xs = if p xs then xs else dropUntil p (tail xs) dropIncl :: Eq a => [a] -> [a] -> [a] dropIncl s = drop (length s) . dropUntil (==s) \end{code} %%%%%%%%%%%%%%%%%%%%%%%%%%%%% \end{document}