Calculate hash values from nucleotide words.
Actually, it packs nucleotide words into Ints, so it's a one-to-one relationship. 
Based on Keys module from RBR

\begin{code}
module Bio.Sequence.HashWord where

import Bio.Sequence.Fasta hiding ((!))
import Data.List (sort)
import Data.Char (toUpper)
import Data.Int

import qualified Data.ByteString.Lazy.Char8 as B

(!) = B.index

-- may use a state passed from the previous invocation
data HashF k = HF { hash   :: SeqData -> Offset -> Maybe k
                 , hashes  :: SeqData -> [(k,Offset)]
                 , ksort :: [k] -> [k]
                 }

-- | Adds a default "hashes" function, when "hash" is defined.
genkeys :: HashF k -> HashF k
genkeys kf = kf { hashes = gkeys }
    where gkeys s = mkkeys (hash kf $ s)  [0..B.length s-1]
          mkkeys f (p:ps) = case f p of Nothing -> mkkeys f ps
                                        Just k -> (k,p) : mkkeys f ps
          mkkeys _ [] = []

\end{code}

\subsection{Generating hashes}

Contigous hashes.

\begin{code}

-- | Contigous constructs an int/eger from a contigous k-word.
contigous :: Integral k => Int -> HashF k
contigous k' = let 
   k = fromIntegral k'

   c_key s i | k+i > B.length s = Nothing 
             | otherwise = let s' = B.take k $ B.drop i s 
                           in if B.find isN s' == Nothing then Just (n2k k' s')
                              else Nothing
   c_keys s = c_keys' 0 s
   c_keys' c s | k > B.length s = []
               | otherwise = case B.findIndex isN w0 of 
                               Nothing -> let cur = n2k (fromIntegral k) s
                                          in (cur,c) : c_keys'' cur c (B.drop k s)
                               Just i  -> c_keys' (c+i+1) (B.drop (i+1) s)
            where w0 = B.take k s

   c_keys'' cur p s | B.null s = []
                    | isN (B.head s) = c_keys' (p+k+1) (B.tail s)
                    | otherwise = let new = (cur `mod` 4^(k-1))*4+val (B.head s)
                                  in (new,p+1) : c_keys'' new (p+1) (B.tail s)

              in HF { hash = c_key
                    , hashes  = c_keys
                    , ksort = Data.List.sort
                    }
\end{code}

\begin{code}

-- | Like 'contigous', but returns the same hash for a word and its reverse complement.
rcontig :: Integral k => Int -> HashF k
rcontig k' = let
   k = fromIntegral k'

   c_key s i | k+i > B.length s = Nothing 
             | B.find isN s' /= Nothing = Nothing
             | otherwise = Just $ min (n2k k' s') (n2k k' rs')
             where s' = B.take k $ B.drop i s
                   rs' = B.reverse $ B.map complement s'

   c_keys s = c_keys' 0 s
   c_keys' c s | k > B.length s = []
               | otherwise = case B.findIndex isN w0 of 
                               Nothing -> let c1 = n2k (fromIntegral k) w0
                                              c2 = n2k (fromIntegral k) w0r
                                          in (min c1 c2,c) : c_keys'' (c1,c2) c (B.drop k s)
                               Just i  -> c_keys' (c+i+1) (B.drop (i+1) s)
            where w0  = B.take k s
                  w0r = B.reverse $ B.map complement w0

   c_keys'' (c1,c2) p s | B.null s = []
                        | isN (B.head s) = c_keys' (p+k+1) (B.tail s)
                        | otherwise = let n1 = (c1 `mod` 4^(k-1))*4+val (B.head s)
                                          n2 = c2 `div` 4+4^(k-1)*(val . complement) (B.head s)
                                      in (min n1 n2,p+1) : c_keys'' (n1,n2) (p+1) (B.tail s)

              in HF { hash = c_key
                    , hashes  = c_keys
                    , ksort = Data.List.sort
                    }

{-
rcontig k' = let 
   k = fromIntegral k'
   c_key s i | k+i > B.length s            = Nothing
             | B.find isN w0 == Nothing    = Nothing
             | otherwise                    = Just $ min (n2k k' w0) (n2k k' w0r)
       where w0 = B.take k (B.drop i s)
             w0r = B.map complement $ reverse w0

   c_keys s = c_key1 0 s

   c_key1 p s | p+k > B.length s  = []
              | (not . null . filter isN) w0 = c_key1' (n2k k' w0, n2k k' w0r) p s
              | otherwise = c_key1 (p+1) s  -- fixme: optimize
       where w0 = B.take k (B.drop p s)
             w0r = B.map complement $ reverse w0

   c_key1' (i1,i2) p s 
       | p+k+1 > B.length s = [(min i1 i2,p)]
       | otherwise = if isN (s!(p+k)) then (min i1 i2,p) : c_key1 (p+k+1) s
                     else let i' = ((i1 `mod` 4^(k-1))*4+val (s!(p+k)), i2 `div` 4+4^(k-1)*(val . complement) (s!(p+k)))
                          in (min i1 i2,p) : c_key1' i' (p+1) s

              in HF { hash = c_key
                    , hashes  = c_keys
                    , ksort = Data.List.sort

                    }
-}
\end{code}

Shape uses a gapped shape specified via a string
Note that this will be challenging to get symmetric, since
an 'N' character may appear in the mask in only one direction.

\begin{code}

type Shape = String
gapped :: Integral k => Shape -> HashF k
gapped = error "gapped"

\end{code}

\subsection{Key arithmetic}

A hash is an integer representing packed k-words.

This is currently limited to the nucleotide alphabet, but could easily(?)
be extended to arbitrary alphabets.  Ideally, it should also handle
wildcards (i.e. inserting a position with multiple hashes)

\begin{code}

isN :: Char -> Bool
isN = not . flip elem "ACGT" . toUpper

-- conversion between integers (hashes) and strings
-- n2k ignores contents beyond k chars
n2k :: Integral k => Int -> SeqData -> k
n2k k = n2i' 0 . B.take (fromIntegral k)

n2i' :: (Num a) => a -> SeqData -> a
n2i' acc gs = if B.null gs then acc else n2i' (4*acc + val (B.head gs)) (B.tail gs)

-- inefficient, but not used much in critical code anyway
k2n :: Integral k => Int -> k -> SeqData
k2n k = B.pack . k2n' k

k2n' k i = if k==1 then [unval i] 
           else let (q,r) = i `divMod` (4^(k-1)) in unval q : k2n' (k-1) r

{-
shiftRight, shiftLeft :: Integral k => Int -> Int -> k -> String -> k
shiftRight k l pk str = (pk `mod` 4^l)*4^(k-l) + n2k 0 str
shiftLeft = error "Keys.shiftLeft not implemented"
-}

val :: (Num t) => Char -> t
val g = case toUpper g of {'A' -> 0; 'C' -> 1; 'G' -> 2; 'T' -> 3; 
                           _ -> error ("val: illegal character" ++ show g)}

unval :: (Num a) => a -> Char
unval i = case i of {0 -> 'A'; 1 -> 'C'; 2 -> 'G'; 3 -> 'T'; 
                          _ -> error ("unval: illegal value "++show i)}

complement :: Char -> Char
complement g = case g of {'A' -> 'T'; 'T' -> 'A'; 'a' -> 't'; 't' -> 'A';
                          'C' -> 'G'; 'G' -> 'C'; 'c' -> 'g'; 'g' -> 'c'; 
                          _ -> error ("Can't complement "++show g)}

\end{code}