Calculate the contingency matrix for two clusterings.
For clusterings $C = {c_1,c_2,...c_n} and $K = {k_1, k_2,..k_m}$
the matrix is defined as $ a_ij = c_i U k_j $.

The matrix is calculated lazily, and arbitrary functions can
be applied to it to derive various scores: edit distance,
jaccard index, exact count, etc.

\begin{code}

module Main where

import qualified Data.ByteString.Lazy.Char8 as B

import Data.Map hiding (map,filter,null)
import qualified Data.Map as M
import Data.List
import Maybe
import System
import Text.Printf

type ByteString = B.ByteString

main :: IO ()
main = do
   (a:args) <- getArgs
   a' <- B.readFile a
   let ref = mkC $ readC a'
       n   = maximum $ map length ("clustering":args)
   putStrLn (pad n "clustering" ++ "\tVI\tMIF\tJaccard\tRand\tFowMal\tSkew")
   mapM_ (comp n ref) args

pad i s = s ++ replicate (i-length s) ' '

comp :: Int -> Clustering -> FilePath -> IO ()
comp n k f = do
    f' <- B.readFile f
    let mx = matrix k (readC f')
        ct = mkPairs mx
        sh = printf "%.3f"
    putStrLn $ concat $ intersperse "\t" $ (pad n f) : map sh 
            [fst $ iscore mx -- VI
            ,snd $ iscore mx -- MIF
            ,jaccard ct, rand ct, fowmal ct, skew ct] --  ++"\tmatrix: "++show ct)

type CID = Int

type Sequence     = ByteString            -- hash for efficiency?
type Cluster = [Sequence]

type Clustering   = (Map CID Cluster
                    ,Map Sequence CID)  -- optimization option

cid :: Clustering -> Sequence -> Maybe CID
cid k x = M.lookup x (snd k)

seqs :: Clustering -> CID -> Maybe Cluster
seqs k c = M.lookup c (fst k)

-- read a clustering from file
readC :: ByteString -> [Cluster]
readC = filter (not.null) . map B.words . B.lines

-- build the Clustering from a set of clusters
mkC :: [Cluster] -> Clustering
mkC  = foldr ins (M.empty,M.empty) . zip [0..] 
  where ins (n,ss) (i2c,s2i) = (M.insert n ss i2c, foldr (\s -> M.insert s n) s2i ss)

\end{code}

Matrix construction.

\begin{code}

type Matrix = Map Sequence (Maybe CID,Maybe CID)

-- reads a clustering and produces the confusion (-tingency) matrix
-- the reference is in colums (snd index), the new clustering is in rows (fst idx)
matrix :: Clustering -> [[Sequence]] -> Matrix
matrix c = foldr addRow (fmap (\x -> (Nothing,Just x)) $ snd c) . zip [0..]

addRow :: (Int,[Sequence]) -> Matrix -> Matrix
addRow (n,(s:ss)) m = case M.lookup s m of 
                         Just (Nothing,x) -> addRow (n,ss) $ M.insert s (x,Just n) m
                         Nothing -> addRow (n,ss) $ M.insert s (Just n,Nothing) m
                         _ -> error ("sequence "++show s++" occurs multiple times!")
addRow (n,[]) m = m

data MxCount = MxC { cell :: Map (CID,CID) Int -- 
                   , row  :: Map CID Int
                   , col  :: Map CID Int
                   , ir_row, ir_col :: Int } deriving Show

emptyMxC = MxC M.empty M.empty M.empty 0 0

-- convert the matrix to (Maybe CID, Maybe CID) -> count
mx_count :: Matrix -> MxCount
mx_count mx = M.fold myins emptyMxC mx
    where myinx (Nothing, Just _) (MxC ce rw cl irr irc) = (MxC ce rw cl irr (irc+1))
          myinx (Just _, Nothing) (MxC ce rw cl irr irc) = (MxC ce rw cl (irr+1) irc)
          myins (Just x,Just y) (MxC ce rw cl irr irc) = 
              let c1 = (1+M.findWithDefault 0 (x,y) ce)
                  c2 = (1+M.findWithDefault 0 x rw)
                  c3 = (1+M.findWithDefault 0 y cl)
              in c1 `seq` c2 `seq` c3 `seq` MxC (M.insert (x,y) c1 ce) (M.insert x c2 rw) (M.insert y c3 cl) irr irc

\end{code}

\subsection{Jaccard index, and friends}

Indices based on pair counts.  This includes jaccard, rand, ...?
"skew" is based on MacNemar's test.

\begin{code}

-- | a b c, and d, and irrelevant sequences for each clustering
type Pairs = (Double,Double,Double,Double)

-- todo: one row at a time to assign score to each cluster
mkPairs :: Matrix -> Pairs
mkPairs mx = let mxc = mx_count mx
                 row' x = fromIntegral $ fromJust $ M.lookup x (row mxc)
                 col' y = fromIntegral $ fromJust $ M.lookup y (col mxc)
                 count ((x,y),ct') (a,b,c,t) = let ct = fromIntegral ct' in (a+ct*(ct-1), b+ct*(row' x-ct),c+ct*(col' y-ct),t+ct)
                 (a,b,c,tot) = foldr count (0,0,0,0) $ assocs $ cell mxc
             in (a/2,b/2,c/2,(tot*(tot-1)-a-b-c)/2)

jaccard, rand, fowmal, skew :: Pairs -> Double
jaccard (a,b,c,_) = a/(a+b+c)
rand    (a,b,c,d) = (a+d)/(a+b+c+d)
fowmal  (a,b,c,d) = a/sqrt((a+b)*(a+c))  -- Fowlkes-Mallows index
-- this is the same as fowmal (why?)!
hubert  (a,b,c,d) = (a*d - b*c)/sqrt((a+b)*(c+d)*(a+c)*(b+d)) -- Gamma stat?

skew     (a,b,c,d) = (b-c)/sqrt(b+c)/sqrt d

\end{code}

\subsection{Entropy-based measures}

The \texttt{iscore} function calculates the variation of information
between the two clustrings. Given clusterings $C$ and $K$ matrix $H$, it is
calculated as:

\(
  \sum_C H_c\log H_c + \sum_K H_k\log H_k -2\sumH_{K,C} H_{k,c} \log H_{k,c}
\)

Todo: implement \[I/H = (H_c - H_k)/H_{c,k} - 1\]

I.e. collect H_k, H_c,k, and H_c, and use it to calculate both measures.

\begin{code}

type Imx = (Double,Double,Double,Double)

-- iscore calculates mutual information fraction and variation of information
iscore :: Matrix -> (Double,Double) -- MIF and VI
iscore mx = let (n,hc,hk,hck) = iscores mx
                vi  = 1/n*(-2*hck+hc+hk)
                mif = (hc+hk-2*mlog n)/(hck-mlog n)-1
            in (vi,mif)

-- iscores - calculate n, H_c, H_k, H_c,k
iscores :: Matrix -> Imx
iscores mx' =
    let mx = mx_count mx' 
        n = fromIntegral $ sum $ elems $ row mx
        entropy = sum . map (mlog . fromIntegral) . elems 
    in (n, entropy $ row mx, entropy $ col mx, entropy $ cell mx)

mlog 0 = 0 -- error "log 0!"
mlog x = x*log x/log 2

\end{code}