RSelect - randomly select a number of sequences from a FASTA file Usage: rselect m < file.seq # prints m random sequences to stdout rselect m n < file.seq # print m of the first n seq.s (faster) \begin{code} module Main where import Data.Set import System import Random import IO import Bio.Sequence.Fasta import Unigene main :: IO () main = do as <- getArgs case as of [m,f] -> select (read m) Nothing f [m,n,f] -> select (read m) (Just (read n)) f _ -> error "Usage: rselect n [m] input.seq > output.seq" -- | Select m out of maybe n sequences from f -- Not specifying a maximum increases memory consumption select :: Integer -> Maybe Integer -> FilePath -> IO () select m mn f = do seed <- newStdGen hPutStrLn stderr ("Random seed: "++show seed) ss <- ugRead f n <- case mn of Nothing -> do hPutStrLn stderr "Counting sequences..." n' <- countSeqs f return (fromIntegral n') Just n' -> return n' let ds = if m>n then error "rselect: m is larger than n, that won't work." else if m==n then take (fromIntegral n) $ repeat 1 else if m<=n `div` 2 then diffs $ pick m $ randomRs (1,n) seed else diffs $ invert n 1 $ pick (n-m) $ randomRs (1,n) seed out = Prelude.filter (not.Prelude.null) $ choose ds ss n `seq` hPutStrLn stderr ("N = "++show n++", M = "++show m) mapM_ (\o -> putStrLn "# cluster delimiter" >> hWriteSeqs stdout o) out -- | choose elements according to a list of deltas choose :: Show a => [Integer] -> [[a]] -> [[a]] choose = choose' [] choose' :: Show a => [a] -> [Integer] -> [[a]] -> [[a]] choose' acc [] _ = [reverse acc] choose' acc (0:ds) xss = choose' acc ds xss choose' acc (d:ds) (xs:xss) = let ys = drop (fromIntegral d-1) xs in if not (Prelude.null ys) then choose' (head ys:acc) ds (tail ys:xss) else reverse acc : choose' [] ((d-fromIntegral (length xs)):ds) xss choose' _ _ [] = error "rselect: cannot choose from empty list.\nPerhaps you specified 'n' too large?" -- | convert indices to deltas diffs :: [Integer] -> [Integer] diffs xs = diffs' (0:xs) where diffs' (a:b:cs) = if b<=a then error ("diffs needs a monotonically increasing sequence"++show (take 10 (a:b:cs))) else (b-a) : diffs' (b:cs) diffs' [_] = [] -- | convert a list to the list of missing values invert :: Integer -> Integer -> [Integer] -> [Integer] invert n i (a:cs) = [i..a-1] ++ invert n (a+1) cs invert n i [] = [i..n] -- | pick m unique integers pick :: Integer -> [Integer] -> [Integer] pick = pick' empty where pick' tmp 0 _ = elems tmp pick' tmp m (i:is) = if member i tmp then pick' tmp m is else pick' (insert i tmp) (m-1) is \end{code}