Cluster all ESTs against the genome Construct the de-Bruijn graph for each cluster (i.e. collect the set of all k-words in each cluster) What about multiplicity? For each pair of clusters, calculate the intersection. Repeat words = U (Ci /\ Cj) -- also keep count of each word. (Can we use a more sophisticated data structure? I.e. keep track of longest common exact string?) Check each repeat word against: repbase, simple repeats, complexity Parameters: word length k, and file to read clusters from \begin{code} module Main where import qualified Data.Set as S import qualified Data.Map as M import qualified Data.ByteString.Lazy.Char8 as B import Bio.Sequence.Fasta import Bio.Sequence.HashWord import Unigene import System import Maybe import Util import Data.List -- type Word = Int main = do args <- System.getArgs (k,rs,rl) <- initialize args if (k>16) then putStrLn (myshow k rl) else putStrLn (myshow k rs) myshow k = unlines . map show1 where show1 (key,count) = ">" ++ show key ++ " " ++ show count ++ "\n" ++ B.unpack (k2n k key) ++ "\n" initialize args = do let k = read (head args) case tail args of [csfile,sqsfile] -> do cs <- {- countIO 10 "clusters: " . -} return . filter (not.null) . map words . lines =<< readFile csfile sqs <- readSeqs sqsfile let rs = repeats k $ clusters cs sqs :: [(Int,Int)] rl = repeats k $ clusters cs sqs :: [(Integer,Int)] return (k,rs,rl) [ugfile] -> do ugdata <- {- countIO 10 "clusters: " =<< -} ugRead ugfile let rs = repeats k ugdata :: [(Int,Int)] rl = repeats k ugdata :: [(Integer,Int)] return (k,rs,rl) -- build clusters from [[Label]] and [Seq Label sdata] clusters :: [[String]] -> [Sequence] -> [[Sequence]] clusters labels seqs = map (map mylookup) labels where smap = M.fromListWith (error "duplicate sequences in input!") $ map (\s@(Seq l _) -> (B.dropWhile (=='>') l,s)) seqs mylookup s = case (flip M.lookup $ smap) . pack $ s of Nothing -> error ("sequence '"++s++"' in the clustering is not found in data set") Just x -> x -- extract words from clusters debruijn :: Integral w => Int -> [Sequence] -> S.Set w debruijn k = foldl1' S.union . map (S.fromList . keys k . seqdata) -- slightly faster than: "foldl' (flip S.insert) S.empty . concatMap (keys k)" keys k = map fst . hashes (rcontig k) -- calculate word counts freqs :: Integral w => [S.Set w] -> M.Map w Int freqs = foldl' union M.empty where union :: Integral w => M.Map w Int -> S.Set w -> M.Map w Int union a b = foldl' insert a $ S.toList b insert a k = case M.lookup k a of Just x -> let v' = x+1 in v' `seq` M.insert k v' a Nothing -> M.insert k 1 a -- inefficient? toMap :: Integral w => S.Set w -> M.Map w Int toMap = M.fromList . map (\x -> (x,1)) . S.toList -- given word length k, calculate repeats from clusters repeats :: Integral w => Int -> [[Sequence]] -> [(w,Int)] repeats k = filter ((>1).snd) . M.toList . freqs . map (debruijn k) \end{code}