-- Fasta.hs: 
-- read fasta files directly as ByteStrings
-- should make Haskell more competitive for bioinformatics work

module Bio.Sequence.Fasta (Sequence(..), SeqData, Offset, mkSeqs
                          ,readSeqs, writeSeqs, getLines
                          ,hReadSeqs, hWriteSeqs, hGetLines
                          ,seqlength, seqlabel, seqheader, seqdata
                          ,countSeqs
                          ,B.pack, B.unpack
                          ,(!), revcompl
                          ,SeqType(..)
) where

-- import Data.Char (isSpace)
import Data.List (groupBy,intersperse)
import Data.Int

import System.IO
import System.IO.Unsafe

import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.Lazy.Char8 as B
import Data.ByteString.Lazy.Char8 (ByteString(..))

type Offset  = Int64
type SeqData = B.ByteString

data Sequence = Seq !ByteString !ByteString -- ^ header and actual sequence
instance Show Sequence where 
    show (Seq l d) = B.unpack $ B.unlines (l : splitsAt 60 d)

-- somewhat uglyk..
class SeqType sd where
    toSeq :: sd -> sd -> Sequence
    fromSeq :: Sequence -> (sd,sd)
instance SeqType B.ByteString where
    toSeq = Seq
    fromSeq (Seq x y) = (x,y)
instance SeqType BS.ByteString where
    toSeq h s = Seq (B.fromChunks [h]) (B.fromChunks [s])
    fromSeq (Seq x y) = (c x, c y) where c = BS.concat . B.toChunks 

splitsAt :: Offset -> ByteString -> [ByteString]
splitsAt n s = let (s1,s2) = B.splitAt n s 
               in if B.null s2 then [s1] else s1 : splitsAt n s2

-- | Lazily read sequences from a FASTA-formatted file
readSeqs :: FilePath -> IO [Sequence]
readSeqs f = do
  ls <- getLines f
  return (mkSeqs ls)

-- | Write sequences to a FASTA-formatted file.
--   Line length is 60.
writeSeqs :: FilePath -> [Sequence] -> IO ()
writeSeqs f ss = do 
  h <- openFile f WriteMode
  hWriteSeqs h ss
  hClose h

-- | Lazily read sequence from handle
hReadSeqs :: Handle -> IO [Sequence]
hReadSeqs h = do
  ls <- hGetLines h
  return (mkSeqs ls)

-- | Write sequences in FASTA format to a handle.
hWriteSeqs :: Handle -> [Sequence] -> IO ()
hWriteSeqs h = mapM_ write
    where write (Seq l d) = do 
            B.hPut h l
            B.hPut h $ B.pack "\n"
            let ls = splitsAt 60 d
            mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls
            B.hPut h $ B.pack "\n"

-- ByteString operations
-- These aren't (or possible weren't) provided by
-- the FPS library.

-- lazily read lines from file
getLines :: FilePath -> IO [ByteString]
getLines f = do
  h <- openFile f ReadMode
  hGetLines' (hClose h) h

-- lazily read lines from handle
hGetLines :: Handle -> IO [ByteString]
hGetLines = hGetLines' (return ())

-- add an optional handle-closing parameter
hGetLines' :: IO () -> Handle -> IO [ByteString]
hGetLines' c h = do
  l' <- BS.hGetLine h
  let l = B.fromChunks $ if BS.null l' then [] else  [l']
  e <- hIsEOF h 
  if e then c >> return [l]
       else do ls <- unsafeInterleaveIO $ hGetLines' c h
               return (l:ls)

-- | Convert a list of FASTA-formatted lines into a list of sequences.
--   Lines starting with ">" initiate a new sequence.
--   Comment lines start with "#" are allowed between sequences (and ignored).
mkSeqs :: [ByteString] -> [Sequence]
mkSeqs ls = let ss = groupBy (const (('>' /=) . B.head))  $ filter ((/='#') . B.head) $ filter (not . B.null) ls
                in map mkSeq ss

mkSeq (l:ls) = Seq l (B.concat $ takeWhile isSeq ls)
    where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s 
mkSeq [] = error "empty input to mkSeq"

{-
mkSeqs [_] = error "Fasta: Last line starts with '>'?!"
mkSeqs (l:ls) | B.head l == '#'  = mkSeqs ls
              | otherwise = let seqInit x = not (B.null x) && (B.head x == '>' || B.head x == '#')
                                (s,ss) = break seqInit ls 
                            in Seq l (B.concat $ map (B.takeWhile (not . isSpace)) s) : mkSeqs ss
-}

-- | Return sequence length.
seqlength :: Sequence -> Offset
seqlength (Seq _ bs) = B.length bs

-- | Return sequence label (first word of header)
seqlabel :: Sequence -> ByteString
seqlabel (Seq l _) = head (B.words l)

-- | Return full header.
seqheader :: Sequence -> ByteString
seqheader (Seq l _) = l

-- | Return the sequence data.
seqdata :: Sequence -> ByteString
seqdata (Seq _ d) = d

countSeqs :: FilePath -> IO Int
countSeqs f = do
  ss <- B.readFile f
  let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ B.lines ss
  return (length hdrs) 

-- | Read the character at the specified position in the sequence.
{-# INLINE (!) #-}
(!) :: Sequence -> Offset -> Char
(!) (Seq _ bs) i  = B.index bs i

-- | Calculate the reverse complement.
--   This is only relevant for IUPAC nucleotide alphabet, 
--   and it leaves other characters unmodified.
revcompl :: Sequence -> Sequence
revcompl (Seq l bs) = (Seq l (B.reverse $ B.map compl bs))

compl :: Char -> Char
compl 'A' = 'T'
compl 'T' = 'A'
compl 'C' = 'G'
compl 'G' = 'C'
compl 'a' = 't'
compl 't' = 'a'
compl 'c' = 'g'
compl 'g' = 'c'
compl  x  =  x