\documentclass[a4paper]{article} \pagestyle{myheadings} \usepackage{haskell} \begin{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Clustering based on scored pairs} Given a list of scored pairs, build a hierarchical clustering by inserting pairs incrementally. Clustering is performed in two stages \begin{itemize} \item Stage one builds a transitive closure, keeping track of ESTs and the detected SMPairs \item Stage two, when necessary, builds a hierarchy from each cluster \end{itemize} \begin{code} module Cluster (Cluster,cluster_single,cluster_simple,cluster_all ,labels,newick,unigene) where import Indexed import EST(EST(..),mkshowlist,mkshowlist',label) import Pairs(SMPair, Pair(..),direction,elim_bs) import Data.Set hiding (insert) import qualified Data.Set import List(intersperse,sortBy) import qualified List as L import Util(foldl',_log) type Cluster = (Set EST,[SMPair]) -- | cluster builds a single linkage transitive closure cluster_single, cluster_simple, cluster_all :: Bool -> [EST] -> Int -> [SMPair] -> [Cluster] cluster_single a_s = cluster a_s ug_merge cluster_simple a_s = cluster a_s simple_merge cluster_all a_s es i = cluster a_s all_merge es i . L.map elim_bs -- cluster_simple i = cluster simple_merge i . map elim_bs -- cluster_all i = cluster all_merge i . map elim_bs cluster a_s mf es t ps = let cs = foldl' (insert mf) [] (L.filter (\x->score x>=t) ps) in if a_s then add_singletons es cs else cs -- add all sequences not in the clusters as singleton clusters add_singletons es cs = let cseqs = foldl' union empty (L.map fst cs) eseqs = foldl' (flip Data.Set.insert) empty es sseqs = eseqs `difference` cseqs -- was: minusSet singleton e = (fromList [e],[]) in cs ++ L.map singleton (toList sseqs) -- keep all pairs, gives an acyclic graph -- i.e. the tree all_merge p ps = p:ps -- keep smallest pair, used by (old) unigene behavior ug_merge p [] = [p] ug_merge p ps = [head $ sortBy (\x y->compare (score x) (score y)) (p:ps)] -- keep nothing, useful if you only want the set of ESTs in each cluster simple_merge _ _ = [] data Refers a = None [a] | Both a [a] | One a [a] | Two a a [a] -- inserts an SMPair into a list of Clusters insert :: (SMPair -> [SMPair] -> [SMPair]) -> [Cluster] -> SMPair -> [Cluster] insert mergefn cs p = let (e0,e1) = ests p in case references cs p of -- refs ESTs not in any cluster, create a new cluster None cc -> if e0 == e1 then error "cc -- ignore self-refs" else (fromList [e0,e1], mergefn p []) : cc -- refs ESTs both in same cluster, just collect the pair Both (es,ps) cc -> (es, mergefn p ps) : cc -- one EST in a cluster, one without One (es,ps) cc -> (Data.Set.insert e es, mergefn p ps) : cc where e = if e0 `member` es then e1 else e0 -- ESTs in different clusters, join them Two (e0s,p0s) (e1s,p1s) cc -> (union e0s e1s, mergefn p (p0s++p1s)) : cc -- helper for insert references :: [(Set EST,a)] -> SMPair -> Refers (Set EST,a) references cs p = let (e0,e1) = ests p in case refs [] cs e0 of Nothing -> case refs [] cs e1 of Nothing -> None cs Just (c,cc) -> One c cc Just (c,cc) -> case refs [] [c] e1 of Just (_c',_) -> Both c cc Nothing -> case refs [] cc e1 of Just (c',cc') -> Two c c' cc' Nothing -> One c cc refs _ [] _ = Nothing refs acc ((es,ps):cs) e | e `member` es = Just ((es,ps), reverse acc ++ cs) | otherwise = refs ((es,ps):acc) cs e \end{code} \newpage \subsection{Generating Simple Output} Output the labels of the clusters, one cluster on each line. \begin{code} labels :: Cluster -> String labels (ests,_ps) = (concat $ List.intersperse " " $ L.map label (toList ests)) \end{code} \subsection{Producing Newick-formatted Trees} Two versions outputting a tree graph in {\em Newick} format. The first version produces a tree where branch lengths indicate match quality and has complete labels. The second indicates match quality by the {\em level} of the branch, leading to a prettier tree with aligned labels. The {\tt newick2} function is probably the only useful one, and exported through {\tt newick}. \begin{code} data CTree = Branch SMPair CTree CTree | Leaf EST newick :: Cluster -> String newick c = (newick2 $ hierarchy c)++";" -- | hierarchy constructs the corresponding tree from a cluster -- (note the reverse sorting) hierarchy :: Cluster -> CTree hierarchy (es,ps) = if size es == 1 then (Leaf (unit $ toList es)) else (snd . unit . foldl' h_add [] . sortBy (\y x->compare (score x) (score y))) ps where unit [x] = x unit _ = error "Unit: illegal list size" -- this is similar to 'insert' above h_add :: [(Set EST,CTree)] -> SMPair -> [(Set EST,CTree)] h_add cs p = let (e0,e1) = ests p in case cs `references` p of None cc -> (fromList [e0,e1], Branch p (Leaf e0) (Leaf e1)) : cc Both c cc -> (c:cc) -- ignore, the pair's redundant One (es,br) cc -> (Data.Set.insert e es, Branch p (Leaf e) br) : cc where e = if e0 `member` es then e1 else e0 Two (e0,c0) (e1,c1) cc -> (union e0 e1, Branch p c0 c1) : cc -- build Newick-formatted trees from a hierarchical clustering newick1, newick2 :: CTree -> String newick1 (Leaf e) = label e newick1 (Branch p left right) = "(" ++ newick1 left ++ depth ++ ", " ++ newick1 right ++ depth ++ ")" where depth = ':':show (1000/fromIntegral (score p))-- from about 1 to 66 (k=15) -- Adjust so that labels are right-aligned. Branch height indicate similarity -- This looks more traditional, and is probably better. newick2 (Leaf e) = label e newick2 c@(Branch _ left right) = "(" ++ newick2 left ++ ':':show (depth c - depth left) ++ ", " ++ newick2 right ++ ':':show (depth c - depth right) ++ ")" where depth (Leaf _) = 0 depth (Branch p _ _) = 1000.0 / fromIntegral (score p) \end{code} \subsection{Outputting UniGene-style clusters} Unigene clusters consists of a header starting with a hash mark (\#), followed by the sequences in FASTA format. We first generate the hierarchical clustering (as in newick), and calculates a consistent set of directed sequences with a neat recursion over the tree. The final (printed) direction is thus arbitrary, but consistent. \begin{code} -- set of forward ESTs and set of reversed ESTs type DPair = (Set EST,Set EST) unigene :: Cluster -> String unigene (es,ps) = "# Cluster consisting of " ++ (if L.null ps then "one sequence\n" ++ concatMap (showseq True) (toList es) else (show . size) es ++ " sequences," ++ " lowest score is " ++ (show . minimum . L.map score) ps ++ "\n" ++ concatMap (showseq True) (toList fwd) ++ concatMap (showseq False) (toList rev)) where (fwd,rev) = resolve $ hierarchy (es,ps) resolve :: CTree -> DPair resolve (Leaf e) = (fromList [e],empty) resolve (Branch p l r) = let (fwd1,rev1) = resolve l (fwd2,rev2) = resolve r (e0,e1) = ests p d = direction p e0f1 = e0 `member` fwd1 e0f2 = e0 `member` fwd2 e0r1 = e0 `member` rev1 e0r2 = e0 `member` rev2 e1f1 = e1 `member` fwd1 e1f2 = e1 `member` fwd2 e1r1 = e1 `member` rev1 e1r2 = e1 `member` rev2 codir = e0f1 && e1f2 || e0f2 && e1f1 || e0r1 && e1r2 || e1r1 && e0r2 in if codir && d || not codir && not d then (union fwd1 fwd2, union rev1 rev2) else (union fwd1 rev2, union fwd2 rev1) showseq :: Bool -> EST -> String showseq d e = ">" ++ label e ++ if d==False -- i.e. reverse then " (rev)\n" ++ mkshowlist' (0,len e) e ++"\n" else " (fwd)\n" ++ mkshowlist (0,len e) e ++"\n" \end{code} \end{document} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%