Cluster all ESTs against the genome

Construct the de-Bruijn graph for each cluster
(i.e. collect the set of all k-words in each cluster)
What about multiplicity?

For each pair of clusters, calculate the intersection.
Repeat words = U (Ci /\ Cj)  -- also keep count of each word.

(Can we use a more sophisticated data structure?  I.e. keep track of
longest common exact string?)

Check each repeat word against: repbase, simple repeats, complexity

Parameters: word length k, and file to read clusters from

\begin{code}

module Main where

import qualified Data.Set as S
import qualified Data.Map as M

import qualified Data.ByteString.Lazy.Char8 as B
import Bio.Sequence.Fasta
import Bio.Sequence.HashWord

import Unigene

import System
import Maybe
import Util
import Data.List

-- type Word = Int

main = do
  args <- System.getArgs
  (k,rs,rl) <- initialize args
  if (k>16) 
     then putStrLn (myshow k rl)
     else putStrLn (myshow k rs)

myshow k = unlines . map show1 
    where show1 (key,count) = ">" ++ show key ++ " " ++ show count ++ "\n" ++ B.unpack (k2n k key) ++ "\n"

initialize args = do
  let k = read (head args)
  case tail args of [csfile,sqsfile] -> do
                      cs <- {- countIO 10 "clusters: " . -} return . filter (not.null) . map words . lines =<< readFile csfile
                      sqs <- readSeqs sqsfile
                      let rs = repeats k $ clusters cs sqs :: [(Int,Int)]
                          rl = repeats k $ clusters cs sqs :: [(Integer,Int)]
                      return (k,rs,rl)
                    [ugfile] -> do
                      ugdata <- {- countIO 10 "clusters: " =<< -} ugRead ugfile
                      let rs = repeats k ugdata :: [(Int,Int)]
                          rl = repeats k ugdata :: [(Integer,Int)]
                      return (k,rs,rl)

-- build clusters from [[Label]] and [Seq Label sdata]
clusters :: [[String]] -> [Sequence] -> [[Sequence]]
clusters labels seqs = map (map mylookup) labels
    where smap = M.fromListWith (error "duplicate sequences in input!") $ 
                 map (\s@(Seq l _) -> (B.dropWhile (=='>') l,s)) seqs
          mylookup s = case (flip M.lookup $ smap) . pack $ s of 
                         Nothing -> error ("sequence '"++s++"' in the clustering is not found in data set")
                         Just x  -> x

-- extract words from clusters
debruijn :: Integral w => Int -> [Sequence] -> S.Set w
debruijn k = foldl1' S.union . map (S.fromList . keys k . seqdata)
-- slightly faster than: "foldl' (flip S.insert) S.empty . concatMap (keys k)"

keys k = map fst . hashes (rcontig k)

-- calculate word counts
freqs :: Integral w => [S.Set w] -> M.Map w Int
freqs = foldl' union M.empty
    where union :: Integral w => M.Map w Int -> S.Set w -> M.Map w Int
          union a b = foldl' insert a $ S.toList b
          insert a k = case M.lookup k a of
                         Just x  -> let v' = x+1 in v' `seq` M.insert k v' a
                         Nothing -> M.insert k 1 a

-- inefficient?
toMap :: Integral w => S.Set w -> M.Map w Int
toMap = M.fromList . map (\x -> (x,1)) . S.toList

-- given word length k, calculate repeats from clusters
repeats :: Integral w => Int -> [[Sequence]] -> [(w,Int)]
repeats k = filter ((>1).snd) . M.toList . freqs . map (debruijn k)

\end{code}