-- Fasta.hs: -- read fasta files directly as ByteStrings -- should make Haskell more competitive for bioinformatics work module Bio.Sequence.Fasta (Sequence(..), SeqData, Offset, mkSeqs ,readSeqs, writeSeqs, getLines ,hReadSeqs, hWriteSeqs, hGetLines ,seqlength, seqlabel, seqheader, seqdata ,countSeqs ,B.pack, B.unpack ,(!), revcompl ,SeqType(..) ) where -- import Data.Char (isSpace) import Data.List (groupBy,intersperse) import Data.Int import System.IO import System.IO.Unsafe import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Lazy.Char8 as B import Data.ByteString.Lazy.Char8 (ByteString(..)) type Offset = Int64 type SeqData = B.ByteString data Sequence = Seq !ByteString !ByteString -- ^ header and actual sequence instance Show Sequence where show (Seq l d) = B.unpack $ B.unlines (l : splitsAt 60 d) -- somewhat uglyk.. class SeqType sd where toSeq :: sd -> sd -> Sequence fromSeq :: Sequence -> (sd,sd) instance SeqType B.ByteString where toSeq = Seq fromSeq (Seq x y) = (x,y) instance SeqType BS.ByteString where toSeq h s = Seq (B.fromChunks [h]) (B.fromChunks [s]) fromSeq (Seq x y) = (c x, c y) where c = BS.concat . B.toChunks splitsAt :: Offset -> ByteString -> [ByteString] splitsAt n s = let (s1,s2) = B.splitAt n s in if B.null s2 then [s1] else s1 : splitsAt n s2 -- | Lazily read sequences from a FASTA-formatted file readSeqs :: FilePath -> IO [Sequence] readSeqs f = do ls <- getLines f return (mkSeqs ls) -- | Write sequences to a FASTA-formatted file. -- Line length is 60. writeSeqs :: FilePath -> [Sequence] -> IO () writeSeqs f ss = do h <- openFile f WriteMode hWriteSeqs h ss hClose h -- | Lazily read sequence from handle hReadSeqs :: Handle -> IO [Sequence] hReadSeqs h = do ls <- hGetLines h return (mkSeqs ls) -- | Write sequences in FASTA format to a handle. hWriteSeqs :: Handle -> [Sequence] -> IO () hWriteSeqs h = mapM_ write where write (Seq l d) = do B.hPut h l B.hPut h $ B.pack "\n" let ls = splitsAt 60 d mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls B.hPut h $ B.pack "\n" -- ByteString operations -- These aren't (or possible weren't) provided by -- the FPS library. -- lazily read lines from file getLines :: FilePath -> IO [ByteString] getLines f = do h <- openFile f ReadMode hGetLines' (hClose h) h -- lazily read lines from handle hGetLines :: Handle -> IO [ByteString] hGetLines = hGetLines' (return ()) -- add an optional handle-closing parameter hGetLines' :: IO () -> Handle -> IO [ByteString] hGetLines' c h = do l' <- BS.hGetLine h let l = B.fromChunks $ if BS.null l' then [] else [l'] e <- hIsEOF h if e then c >> return [l] else do ls <- unsafeInterleaveIO $ hGetLines' c h return (l:ls) -- | Convert a list of FASTA-formatted lines into a list of sequences. -- Lines starting with ">" initiate a new sequence. -- Comment lines start with "#" are allowed between sequences (and ignored). mkSeqs :: [ByteString] -> [Sequence] mkSeqs ls = let ss = groupBy (const (('>' /=) . B.head)) $ filter ((/='#') . B.head) $ filter (not . B.null) ls in map mkSeq ss mkSeq (l:ls) = Seq l (B.concat $ takeWhile isSeq ls) where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s mkSeq [] = error "empty input to mkSeq" {- mkSeqs [_] = error "Fasta: Last line starts with '>'?!" mkSeqs (l:ls) | B.head l == '#' = mkSeqs ls | otherwise = let seqInit x = not (B.null x) && (B.head x == '>' || B.head x == '#') (s,ss) = break seqInit ls in Seq l (B.concat $ map (B.takeWhile (not . isSpace)) s) : mkSeqs ss -} -- | Return sequence length. seqlength :: Sequence -> Offset seqlength (Seq _ bs) = B.length bs -- | Return sequence label (first word of header) seqlabel :: Sequence -> ByteString seqlabel (Seq l _) = head (B.words l) -- | Return full header. seqheader :: Sequence -> ByteString seqheader (Seq l _) = l -- | Return the sequence data. seqdata :: Sequence -> ByteString seqdata (Seq _ d) = d countSeqs :: FilePath -> IO Int countSeqs f = do ss <- B.readFile f let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ B.lines ss return (length hdrs) -- | Read the character at the specified position in the sequence. {-# INLINE (!) #-} (!) :: Sequence -> Offset -> Char (!) (Seq _ bs) i = B.index bs i -- | Calculate the reverse complement. -- This is only relevant for IUPAC nucleotide alphabet, -- and it leaves other characters unmodified. revcompl :: Sequence -> Sequence revcompl (Seq l bs) = (Seq l (B.reverse $ B.map compl bs)) compl :: Char -> Char compl 'A' = 'T' compl 'T' = 'A' compl 'C' = 'G' compl 'G' = 'C' compl 'a' = 't' compl 't' = 'a' compl 'c' = 'g' compl 'g' = 'c' compl x = x