Technical details MDL algorithm

CYCLE.sh

#!/bin/sh

#ulimit -v 100000
#ulimit -s 100000

rm greedyoutput
touch greedyoutput
cp $* sequences

size="`wc -l sequences | awk '{print $1}'`"

while [ "$size" -ge 1  ] ; 
do  
	RUNPRATT.sh      # pratt fasta sequences < s
	GREEDY.sh |tee >>greedyoutput
	rm -rf $size
	mkdir $size
	mv sequences.*.pat $size
	size="`wc -l sequences | awk '{print $1}'`"
done

GREEDY.sh

#!/bin/sh

size="`wc -l prattoutput | awk '{print $1}'`"

if [ "$size" -ge 1  ] ; then
	cat prattoutput |
	sort -nr |
	gawk '
	BEGIN { file="sequences"; newfile="newsequences"; OUTPUT=0; }
	NR==1 {
	   for( i=4 ; i<=NF ;i++ )
	   { 
	     REMOVE[$i] = 1;
	   }
	
	   SEQ= -1;
	   while( getline line < file > 0 )
	   {
		if ( line ~ /^>/ ) 
		{
			SEQ++;
			if( REMOVE[SEQ] ) printf line " " ;
		}
		if ( !REMOVE[SEQ] ) { print line >newfile; OUTPUT=1; }
	   }
	   print $1,$2,$3;	
	}
	END { 
		if(OUTPUT) system( "mv " newfile " " file ); 
		else { system( "rm " file ); printf "" > file ; }		
	}
	'
else
 	cat sequences 
 	rm sequences
 	touch sequences
fi

RUNPRATT.sh

#!/bin/sh

size="`grep '>' sequences | wc -l | awk '{print $1}'`"

rm prattoutput

gawk '
BEGIN { 

   K  = NOFSEQ = ARGV[1] ;  # number of sequences in 'sequences'
   step = int(K/10+0.5) ;

   for( ; K >= 4 ; K = K - ( int(K/5) ? int(K/5) : 1 )  )
   {
       print K, step,  int(K/10 + 0.5) ;


       print "m\n" K "\nw\n10\n\nz\n2.0\nh\n100\ni\n0.1\nx\n\n" >"s";
       close("s");

       print "Call Pratt on file with " NOFSEQ " sequences, covering at least " K " of them" >"prattoutput";

       system( "RUNPRATTONCE.tcsh" );

	while ( getline < "mdl.special" > 0 )
		print > "prattoutput";

	close ("mdl.special");

   }

   system( "cp prattoutput prattoutput." NOFSEQ );


}'  $size

RUNPRATTONCE.tcsh

#!/bin/tcsh

unlimit
limit coredumpsize 0

pratt fasta sequences < s

Pratt parameters:

Pratt was run several times for each set of sequences, each time with a different value for the ``Minimum number of sequences to match a pattern'' parameter. c1, c2, and c3 parameters were different for different test runs, but all the other parameters were constant.

M: Min. number of sequences             10
 
B: nr of symbols in Block structure     20
S: nr of symbols in first Search        20
 
R: Refinement                           on
U: fUll refinement                     off
 
I: minimum Info contents               0.1
 
N: max Number of flexibilities           2
F: max Flexibility                       2
P: max flex Product                     10
Y: restricted flexibilitY               on
 
W: max Wildcard length                  10
L: max Length                           50
C: max num of Components                50
 
H: max length Hit list                  50
A: max number Alignments                50
 
O: filename Output patterns           sequences.10.pat
 
0: Alignment input flag                off
2: Use short seq to guide search        on
4: Sloppyness in match:                  0
 
5: Print Motifs in Sequences            on
6: Ratio for printing                   10
 
D: Diagnosticity analysis              off
8: Quotient increase threshold           5
9: Cardinality quotient                  3
 
T: Input tree                          off
#: Input Dist                          off
 
Z: braZma values:
c1                                            8.00
c2                                            8.00
c3                                           50.00

Page compiled by: Inge Jonassen.

main page chromo MDL test case.