RSelect - randomly select a number of sequences from a FASTA file
Usage: rselect m < file.seq # prints m random sequences to stdout
rselect m n < file.seq # print m of the first n seq.s (faster)
\begin{code}
module Main where
import Data.Set
import System
import Random
import IO
import Bio.Sequence.Fasta
import Unigene
main :: IO ()
main = do
as <- getArgs
case as of [m,f] -> select (read m) Nothing f
[m,n,f] -> select (read m) (Just (read n)) f
_ -> error "Usage: rselect n [m] input.seq > output.seq"
-- | Select m out of maybe n sequences from f
-- Not specifying a maximum increases memory consumption
select :: Integer -> Maybe Integer -> FilePath -> IO ()
select m mn f = do
seed <- newStdGen
hPutStrLn stderr ("Random seed: "++show seed)
ss <- ugRead f
n <- case mn of Nothing -> do hPutStrLn stderr "Counting sequences..."
n' <- countSeqs f
return (fromIntegral n')
Just n' -> return n'
let ds = if m>n then error "rselect: m is larger than n, that won't work."
else if m==n then take (fromIntegral n) $ repeat 1
else if m<=n `div` 2 then diffs $ pick m $ randomRs (1,n) seed
else diffs $ invert n 1 $ pick (n-m) $ randomRs (1,n) seed
out = Prelude.filter (not.Prelude.null) $ choose ds ss
n `seq` hPutStrLn stderr ("N = "++show n++", M = "++show m)
mapM_ (\o -> putStrLn "# cluster delimiter" >> hWriteSeqs stdout o) out
-- | choose elements according to a list of deltas
choose :: Show a => [Integer] -> [[a]] -> [[a]]
choose = choose' []
choose' :: Show a => [a] -> [Integer] -> [[a]] -> [[a]]
choose' acc [] _ = [reverse acc]
choose' acc (0:ds) xss = choose' acc ds xss
choose' acc (d:ds) (xs:xss) = let ys = drop (fromIntegral d-1) xs
in if not (Prelude.null ys) then choose' (head ys:acc) ds (tail ys:xss)
else reverse acc : choose' [] ((d-fromIntegral (length xs)):ds) xss
choose' _ _ [] = error "rselect: cannot choose from empty list.\nPerhaps you specified 'n' too large?"
-- | convert indices to deltas
diffs :: [Integer] -> [Integer]
diffs xs = diffs' (0:xs)
where
diffs' (a:b:cs) = if b<=a then error ("diffs needs a monotonically increasing sequence"++show (take 10 (a:b:cs)))
else (b-a) : diffs' (b:cs)
diffs' [_] = []
-- | convert a list to the list of missing values
invert :: Integer -> Integer -> [Integer] -> [Integer]
invert n i (a:cs) = [i..a-1] ++ invert n (a+1) cs
invert n i [] = [i..n]
-- | pick m unique integers
pick :: Integer -> [Integer] -> [Integer]
pick = pick' empty
where pick' tmp 0 _ = elems tmp
pick' tmp m (i:is) = if member i tmp then pick' tmp m is
else pick' (insert i tmp) (m-1) is
\end{code}