Calculate hash values from nucleotide words. Actually, it packs nucleotide words into Ints, so it's a one-to-one relationship. Based on Keys module from RBR \begin{code} module Bio.Sequence.HashWord where import Bio.Sequence.Fasta hiding ((!)) import Data.List (sort) import Data.Char (toUpper) import Data.Int import qualified Data.ByteString.Lazy.Char8 as B (!) = B.index -- may use a state passed from the previous invocation data HashF k = HF { hash :: SeqData -> Offset -> Maybe k , hashes :: SeqData -> [(k,Offset)] , ksort :: [k] -> [k] } -- | Adds a default "hashes" function, when "hash" is defined. genkeys :: HashF k -> HashF k genkeys kf = kf { hashes = gkeys } where gkeys s = mkkeys (hash kf $ s) [0..B.length s-1] mkkeys f (p:ps) = case f p of Nothing -> mkkeys f ps Just k -> (k,p) : mkkeys f ps mkkeys _ [] = [] \end{code} \subsection{Generating hashes} Contigous hashes. \begin{code} -- | Contigous constructs an int/eger from a contigous k-word. contigous :: Integral k => Int -> HashF k contigous k' = let k = fromIntegral k' c_key s i | k+i > B.length s = Nothing | otherwise = let s' = B.take k $ B.drop i s in if B.find isN s' == Nothing then Just (n2k k' s') else Nothing c_keys s = c_keys' 0 s c_keys' c s | k > B.length s = [] | otherwise = case B.findIndex isN w0 of Nothing -> let cur = n2k (fromIntegral k) s in (cur,c) : c_keys'' cur c (B.drop k s) Just i -> c_keys' (c+i+1) (B.drop (i+1) s) where w0 = B.take k s c_keys'' cur p s | B.null s = [] | isN (B.head s) = c_keys' (p+k+1) (B.tail s) | otherwise = let new = (cur `mod` 4^(k-1))*4+val (B.head s) in (new,p+1) : c_keys'' new (p+1) (B.tail s) in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort } \end{code} \begin{code} -- | Like 'contigous', but returns the same hash for a word and its reverse complement. rcontig :: Integral k => Int -> HashF k rcontig k' = let k = fromIntegral k' c_key s i | k+i > B.length s = Nothing | B.find isN s' /= Nothing = Nothing | otherwise = Just $ min (n2k k' s') (n2k k' rs') where s' = B.take k $ B.drop i s rs' = B.reverse $ B.map complement s' c_keys s = c_keys' 0 s c_keys' c s | k > B.length s = [] | otherwise = case B.findIndex isN w0 of Nothing -> let c1 = n2k (fromIntegral k) w0 c2 = n2k (fromIntegral k) w0r in (min c1 c2,c) : c_keys'' (c1,c2) c (B.drop k s) Just i -> c_keys' (c+i+1) (B.drop (i+1) s) where w0 = B.take k s w0r = B.reverse $ B.map complement w0 c_keys'' (c1,c2) p s | B.null s = [] | isN (B.head s) = c_keys' (p+k+1) (B.tail s) | otherwise = let n1 = (c1 `mod` 4^(k-1))*4+val (B.head s) n2 = c2 `div` 4+4^(k-1)*(val . complement) (B.head s) in (min n1 n2,p+1) : c_keys'' (n1,n2) (p+1) (B.tail s) in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort } {- rcontig k' = let k = fromIntegral k' c_key s i | k+i > B.length s = Nothing | B.find isN w0 == Nothing = Nothing | otherwise = Just $ min (n2k k' w0) (n2k k' w0r) where w0 = B.take k (B.drop i s) w0r = B.map complement $ reverse w0 c_keys s = c_key1 0 s c_key1 p s | p+k > B.length s = [] | (not . null . filter isN) w0 = c_key1' (n2k k' w0, n2k k' w0r) p s | otherwise = c_key1 (p+1) s -- fixme: optimize where w0 = B.take k (B.drop p s) w0r = B.map complement $ reverse w0 c_key1' (i1,i2) p s | p+k+1 > B.length s = [(min i1 i2,p)] | otherwise = if isN (s!(p+k)) then (min i1 i2,p) : c_key1 (p+k+1) s else let i' = ((i1 `mod` 4^(k-1))*4+val (s!(p+k)), i2 `div` 4+4^(k-1)*(val . complement) (s!(p+k))) in (min i1 i2,p) : c_key1' i' (p+1) s in HF { hash = c_key , hashes = c_keys , ksort = Data.List.sort } -} \end{code} Shape uses a gapped shape specified via a string Note that this will be challenging to get symmetric, since an 'N' character may appear in the mask in only one direction. \begin{code} type Shape = String gapped :: Integral k => Shape -> HashF k gapped = error "gapped" \end{code} \subsection{Key arithmetic} A hash is an integer representing packed k-words. This is currently limited to the nucleotide alphabet, but could easily(?) be extended to arbitrary alphabets. Ideally, it should also handle wildcards (i.e. inserting a position with multiple hashes) \begin{code} isN :: Char -> Bool isN = not . flip elem "ACGT" . toUpper -- conversion between integers (hashes) and strings -- n2k ignores contents beyond k chars n2k :: Integral k => Int -> SeqData -> k n2k k = n2i' 0 . B.take (fromIntegral k) n2i' :: (Num a) => a -> SeqData -> a n2i' acc gs = if B.null gs then acc else n2i' (4*acc + val (B.head gs)) (B.tail gs) -- inefficient, but not used much in critical code anyway k2n :: Integral k => Int -> k -> SeqData k2n k = B.pack . k2n' k k2n' k i = if k==1 then [unval i] else let (q,r) = i `divMod` (4^(k-1)) in unval q : k2n' (k-1) r {- shiftRight, shiftLeft :: Integral k => Int -> Int -> k -> String -> k shiftRight k l pk str = (pk `mod` 4^l)*4^(k-l) + n2k 0 str shiftLeft = error "Keys.shiftLeft not implemented" -} val :: (Num t) => Char -> t val g = case toUpper g of {'A' -> 0; 'C' -> 1; 'G' -> 2; 'T' -> 3; _ -> error ("val: illegal character" ++ show g)} unval :: (Num a) => a -> Char unval i = case i of {0 -> 'A'; 1 -> 'C'; 2 -> 'G'; 3 -> 'T'; _ -> error ("unval: illegal value "++show i)} complement :: Char -> Char complement g = case g of {'A' -> 'T'; 'T' -> 'A'; 'a' -> 't'; 't' -> 'A'; 'C' -> 'G'; 'G' -> 'C'; 'c' -> 'g'; 'g' -> 'c'; _ -> error ("Can't complement "++show g)} \end{code}