Calculate the contingency matrix for two clusterings. For clusterings $C = {c_1,c_2,...c_n} and $K = {k_1, k_2,..k_m}$ the matrix is defined as $ a_ij = c_i U k_j $. The matrix is calculated lazily, and arbitrary functions can be applied to it to derive various scores: edit distance, jaccard index, exact count, etc. \begin{code} module Main where import qualified Data.ByteString.Lazy.Char8 as B import Data.Map hiding (map,filter,null) import qualified Data.Map as M import Data.List import Maybe import System import Text.Printf type ByteString = B.ByteString main :: IO () main = do (a:args) <- getArgs a' <- B.readFile a let ref = mkC $ readC a' n = maximum $ map length ("clustering":args) putStrLn (pad n "clustering" ++ "\tVI\tMIF\tJaccard\tRand\tFowMal\tSkew") mapM_ (comp n ref) args pad i s = s ++ replicate (i-length s) ' ' comp :: Int -> Clustering -> FilePath -> IO () comp n k f = do f' <- B.readFile f let mx = matrix k (readC f') ct = mkPairs mx sh = printf "%.3f" putStrLn $ concat $ intersperse "\t" $ (pad n f) : map sh [fst $ iscore mx -- VI ,snd $ iscore mx -- MIF ,jaccard ct, rand ct, fowmal ct, skew ct] -- ++"\tmatrix: "++show ct) type CID = Int type Sequence = ByteString -- hash for efficiency? type Cluster = [Sequence] type Clustering = (Map CID Cluster ,Map Sequence CID) -- optimization option cid :: Clustering -> Sequence -> Maybe CID cid k x = M.lookup x (snd k) seqs :: Clustering -> CID -> Maybe Cluster seqs k c = M.lookup c (fst k) -- read a clustering from file readC :: ByteString -> [Cluster] readC = filter (not.null) . map B.words . B.lines -- build the Clustering from a set of clusters mkC :: [Cluster] -> Clustering mkC = foldr ins (M.empty,M.empty) . zip [0..] where ins (n,ss) (i2c,s2i) = (M.insert n ss i2c, foldr (\s -> M.insert s n) s2i ss) \end{code} Matrix construction. \begin{code} type Matrix = Map Sequence (Maybe CID,Maybe CID) -- reads a clustering and produces the confusion (-tingency) matrix -- the reference is in colums (snd index), the new clustering is in rows (fst idx) matrix :: Clustering -> [[Sequence]] -> Matrix matrix c = foldr addRow (fmap (\x -> (Nothing,Just x)) $ snd c) . zip [0..] addRow :: (Int,[Sequence]) -> Matrix -> Matrix addRow (n,(s:ss)) m = case M.lookup s m of Just (Nothing,x) -> addRow (n,ss) $ M.insert s (x,Just n) m Nothing -> addRow (n,ss) $ M.insert s (Just n,Nothing) m _ -> error ("sequence "++show s++" occurs multiple times!") addRow (n,[]) m = m data MxCount = MxC { cell :: Map (CID,CID) Int -- , row :: Map CID Int , col :: Map CID Int , ir_row, ir_col :: Int } deriving Show emptyMxC = MxC M.empty M.empty M.empty 0 0 -- convert the matrix to (Maybe CID, Maybe CID) -> count mx_count :: Matrix -> MxCount mx_count mx = M.fold myins emptyMxC mx where myinx (Nothing, Just _) (MxC ce rw cl irr irc) = (MxC ce rw cl irr (irc+1)) myinx (Just _, Nothing) (MxC ce rw cl irr irc) = (MxC ce rw cl (irr+1) irc) myins (Just x,Just y) (MxC ce rw cl irr irc) = let c1 = (1+M.findWithDefault 0 (x,y) ce) c2 = (1+M.findWithDefault 0 x rw) c3 = (1+M.findWithDefault 0 y cl) in c1 `seq` c2 `seq` c3 `seq` MxC (M.insert (x,y) c1 ce) (M.insert x c2 rw) (M.insert y c3 cl) irr irc \end{code} \subsection{Jaccard index, and friends} Indices based on pair counts. This includes jaccard, rand, ...? "skew" is based on MacNemar's test. \begin{code} -- | a b c, and d, and irrelevant sequences for each clustering type Pairs = (Double,Double,Double,Double) -- todo: one row at a time to assign score to each cluster mkPairs :: Matrix -> Pairs mkPairs mx = let mxc = mx_count mx row' x = fromIntegral $ fromJust $ M.lookup x (row mxc) col' y = fromIntegral $ fromJust $ M.lookup y (col mxc) count ((x,y),ct') (a,b,c,t) = let ct = fromIntegral ct' in (a+ct*(ct-1), b+ct*(row' x-ct),c+ct*(col' y-ct),t+ct) (a,b,c,tot) = foldr count (0,0,0,0) $ assocs $ cell mxc in (a/2,b/2,c/2,(tot*(tot-1)-a-b-c)/2) jaccard, rand, fowmal, skew :: Pairs -> Double jaccard (a,b,c,_) = a/(a+b+c) rand (a,b,c,d) = (a+d)/(a+b+c+d) fowmal (a,b,c,d) = a/sqrt((a+b)*(a+c)) -- Fowlkes-Mallows index -- this is the same as fowmal (why?)! hubert (a,b,c,d) = (a*d - b*c)/sqrt((a+b)*(c+d)*(a+c)*(b+d)) -- Gamma stat? skew (a,b,c,d) = (b-c)/sqrt(b+c)/sqrt d \end{code} \subsection{Entropy-based measures} The \texttt{iscore} function calculates the variation of information between the two clustrings. Given clusterings $C$ and $K$ matrix $H$, it is calculated as: \( \sum_C H_c\log H_c + \sum_K H_k\log H_k -2\sumH_{K,C} H_{k,c} \log H_{k,c} \) Todo: implement \[I/H = (H_c - H_k)/H_{c,k} - 1\] I.e. collect H_k, H_c,k, and H_c, and use it to calculate both measures. \begin{code} type Imx = (Double,Double,Double,Double) -- iscore calculates mutual information fraction and variation of information iscore :: Matrix -> (Double,Double) -- MIF and VI iscore mx = let (n,hc,hk,hck) = iscores mx vi = 1/n*(-2*hck+hc+hk) mif = (hc+hk-2*mlog n)/(hck-mlog n)-1 in (vi,mif) -- iscores - calculate n, H_c, H_k, H_c,k iscores :: Matrix -> Imx iscores mx' = let mx = mx_count mx' n = fromIntegral $ sum $ elems $ row mx entropy = sum . map (mlog . fromIntegral) . elems in (n, entropy $ row mx, entropy $ col mx, entropy $ cell mx) mlog 0 = 0 -- error "log 0!" mlog x = x*log x/log 2 \end{code}