// lsq.mata 1.3 02.06.2006 // Mata libary for sq.pkg // Ulrich Kohler and Magdalena Luniak, WZB // Support: kohler@wz-berlin, luniak@wz-berlin.de // Disclaimer // ---------- // // In the code below, those parts headed by the term "Magda-Code" are // based on the source code of TDA, written by Goetz Rohwer and Ulrich Poetter. // TDA is a very powerful program for Transitory Data Analysis. It is programmed // in C, and distrubuted as FREEWARE under the terms of the General Public License. It is // downloadable from http://www.stat.ruhr-uni-bochum.de/tda.html. // // My understanding of the GPL is that you can freely change and redistribute the // respective parts of the programs, provided that you mention the authorship of // Goetz Rohwer and Ulrich Poetter, and that you publish the source code of the // code based on the parts headed with "Magda-Code". // // For Copyright issues pleae refer to the General Public License which can // can be found on ttp://www.gnu.org/copyleft/gpl.html version 9.1 mata: mata clear mata set matastrict on // --------------------------------------------------------------- // Caller for the Needleman-Wunsch Algorithm for reference-sequence void sqomref( string scalar varlist, real scalar indelcost, string scalar standard, real scalar k, real scalar subcost) { // Initialize variables real matrix X // Data-Matrix real colvector D // Distance-Variable real rowvector R // 1st Selected Sequence (Row of Levensthein) real rowvector C // Reference-Sequence (Col of Levensthein) real scalar i real scalar j real scalar mod // Hard coded modulo parameter for hashing mod = 6709 // View on sequence data in wide format st_view(X=.,.,tokens(varlist)) // Reference sequence C = X[rows(X),1..sqlength(X[rows(X),.])] // Initialize distance matrix D = J(rows(X),1,0) // Call NeedlemanWunsch with fixed subcosts (the default) if (subcost>0) { for (i=1;i<=rows(X)-1;i++) { R = X[i,1..sqlength(X[i,.])] if (k==0) { D[i,1] = needlemanwunschexactfixed( R,C,indelcost,standard,subcost) } else { D[i,1] = needlemanwunschapproxfixed( R,C,indelcost,k,standard,subcost) } } D = (standard=="longest" ? D:/(cols(X)) : D ) st_addvar("float","_SQdist") st_store(.,"_SQdist",D) } // Call NeedlemanWunsch with rawdistance else if (subcost==-1) { for (i=1;i<=rows(X)-1;i++) { R = X[i,1..sqlength(X[i,.])] if (k==0) { D[i,1] = needlemanwunschexactrawdist( R,C,indelcost,standard) } else { D[i,1] = needlemanwunschapproxrawdist( R,C,indelcost,k,standard) } } D = (standard=="longest" ? D:/(cols(X)) : D ) st_addvar("float","_SQdist") st_store(.,"_SQdist",D) } // Call NeedlemanWunsch with full subcost-matrix else if (subcost == 0) { // Initialize material for hashing real matrix SQsubcost real vector levels struct elemlist vector hashtable SQsubcost = st_matrix("SQsubcost") levels = st_matrix("levels") hashtable = hashing(levels,mod) for (i=1;i<=rows(X)-1;i++) { R = X[i,1..sqlength(X[i,.])] if (k==0) { D[i,1] = needlemanwunschexactmatrix( R,C,indelcost,mod,SQsubcost,standard,hashtable) } else { D[i,1] = needlemanwunschapproxmatrix( R,C,indelcost,k,standard,mod,SQsubcost,hashtable) } } D = (standard=="longest" ? D:/(cols(X)) : D ) st_addvar("float","_SQdist") st_store(.,"_SQdist",D) } } // --------------------------------------------------------------- // Caller for the Needleman-Wunsch Algorithm for full distance Matrix void sqomfull( string scalar varlist, real scalar indelcost, string scalar standard, real scalar k, real scalar subcost) { // Initialize variables real matrix X // Data-Matrix real matrix D // Distance-Matrix real rowvector R // 1st Selected Sequence (Row of Levensthein) real rowvector C // 2nd Selected Sequence (Col of Levensthein) real scalar i real scalar j real scalar mod // Hard coded modulo parameter for hashing mod = 6709 // View on sequence data in wide format st_view(X=.,.,tokens(varlist)) // Initialize distance matrix D = J(rows(X),rows(X),0) // Call NeedlemanWunsch with fixed subcosts (the default) if (subcost>0) { for (i=1;i<=rows(X)-1;i++) { for (j=i+1;j<=rows(X);j++) { R = X[i,1..sqlength(X[i,.])] C = X[j,1..sqlength(X[j,.])] if (k==0) { D[j,i] = needlemanwunschexactfixed( R,C,indelcost,standard,subcost) } else { D[j,i] = needlemanwunschapproxfixed( R,C,indelcost,k,standard,subcost) } } } D = (standard=="longest" ? D:/(cols(X)) : D ) st_matrix("SQdist",(makesymmetric(D))) } // Call NeedlemanWunsch with rawdistance else if (subcost==-1) { for (i=1;i<=rows(X)-1;i++) { for (j=i+1;j<=rows(X);j++) { R = X[i,1..sqlength(X[i,.])] C = X[j,1..sqlength(X[j,.])] if (k==0) { D[j,i] = needlemanwunschexactrawdist( R,C,indelcost,standard) } else { D[j,i] = needlemanwunschapproxrawdist( R,C,indelcost,k,standard) } } } st_matrix("SQdist",(makesymmetric(D))) } // Call NeedlemanWunsch with full subcost-matrix else if (subcost == 0) { // Initialize material for hashing real matrix SQsubcost real vector levels struct elemlist vector hashtable SQsubcost = st_matrix("SQsubcost") levels = st_matrix("levels") hashtable = hashing(levels,mod) for (i=1;i<=rows(X)-1;i++) { for (j=i+1;j<=rows(X);j++) { R = X[i,1..sqlength(X[i,.])] C = X[j,1..sqlength(X[j,.])] if (k==0) { D[j,i] = needlemanwunschexactmatrix( R,C,indelcost,mod,SQsubcost,standard,hashtable) } else { D[j,i] = needlemanwunschapproxmatrix( R,C,indelcost,k,standard,mod,SQsubcost,hashtable) } } } st_matrix("SQdist",(makesymmetric(D))) } } // --------------------------------------------------------------- // "Exact" Needleman-Wunsch Algorithm with fixed subcosts real scalar needlemanwunschexactfixed( real rowvector R, real rowvector C, real scalar indelcost, string scalar standard, real scalar subcost) { // Initializations real matrix L // Levensthein-Matrix real rowvector M // Vector of Values to be Minimized real scalar i real scalar j // Levensthein-Matrix L = J(cols(R)+1,cols(C)+1,0) // Initialize First Row/Col of Levensthein for (i=2;i<=cols(R)+1;i++) { L[i,1]=L[i-1,1]+indelcost } for (j=2;j<=cols(C)+1;j++) { L[1,j]=L[1,j-1]+indelcost } // Step thru the Levensthein for (i=2;i<=cols(R)+1;i++) { for (j=2;j<=cols(C)+1;j++) { M = L[i-1,j-1]+ (R[1,i-1]==C[1,j-1]? 0 : subcost), L[i-1,j]+indelcost, L[i,j-1]+indelcost L[i,j]=min(M) } } return((L[cols(R)+1,cols(C)+1])/(standard=="longer" ? max((cols(R),cols(C))): 1) ) } // --------------------------------------------------------------- // "Exact" Needleman-Wunsch Algorithm with rawdistance-subcosts real scalar needlemanwunschexactrawdist( real rowvector R, real rowvector C, real scalar indelcost, string scalar standard) { // Initializations real matrix L // Levensthein-Matrix real rowvector M // Vector of Values to be Minimized real scalar i real scalar j // Initialize Levensthein-Matrix L = J(cols(R)+1,cols(C)+1,0) // Initialize First Row/Col of Levensthein for (i=2;i<=cols(R)+1;i++) { L[i,1]=L[i-1,1]+indelcost } for (j=2;j<=cols(C)+1;j++) { L[1,j]=L[1,j-1]+indelcost } // Step thru the Levensthein for (i=2;i<=cols(R)+1;i++) { for (j=2;j<=cols(C)+1;j++) { M = L[i-1,j-1]+ abs(R[1,i-1]-C[1,j-1]), L[i-1,j]+indelcost, L[i,j-1]+indelcost L[i,j]=min(M) } } return((L[cols(R)+1,cols(C)+1])/(standard=="longer" ? max((cols(R),cols(C))): 1) ) } // --------------------------------------------------------------- // "Exact" Needleman-Wunsch Algorithm with Subcost Matrix real scalar needlemanwunschexactmatrix( real rowvector R, real rowvector C, real scalar indelcost, real scalar mod, real matrix SQsubcost, string scalar standard, struct elemlist vector hashtable) { // Initializations real matrix L // Levensthein-Matrix real rowvector M // Vector of Values to be Minimized real scalar i real scalar j // Initialize Levensthein-Matrix L = J(cols(R)+1,cols(C)+1,0) // Initialize First Row/Col of Levensthein for (i=2;i<=cols(R)+1;i++) { L[i,1]=L[i-1,1]+indelcost } for (j=2;j<=cols(C)+1;j++) { L[1,j]=L[1,j-1]+indelcost } // Step thru the Levensthein for (i=2;i<=cols(R)+1;i++) { for (j=2;j<=cols(C)+1;j++) { M = L[i-1,j-1]+subcosthash(R,C,i-1,j-1,mod,SQsubcost,hashtable), L[i-1,j]+indelcost, L[i,j-1]+indelcost L[i,j]=min(M) } } return((L[cols(R)+1,cols(C)+1])/(standard=="longer" ? max((cols(R),cols(C))): 1) ) } // --------------------------------------------------------------- // Approx-Needlman Wunsch with fixed subcost (Magda Code) real scalar needlemanwunschapproxfixed( real rowvector R, real rowvector C, real scalar indelcost, real scalar paramK, string scalar standard, real scalar subcost) { // Initializations real scalar distance // return value distance = 0 // Auxiliary Variables real scalar inser real scalar del real scalar help1 real scalar help2 real scalar help3 real scalar index1 real scalar index2 real scalar i real scalar j // Length of sequences real scalar iLength real scalar jLength iLength = sqlength(R) jLength = sqlength(C) // Dimension of Levensthein-matrix real scalar matrixDim real vector v v = iLength, jLength matrixDim = max(v)+1 // Boundary of Levensthein-matrix real scalar bound bound = (iLength/jLength + jLength/iLength)/sqrt(iLength*iLength+jLength*jLength) bound = paramK * bound / sqrt(2) // Initialisation of Levensthein-matrix // Note: Levensthein as vector and initialized with infinity real matrix L L = J(matrixDim*matrixDim, 1, .) L[1,1]=0 for(i=1; i=jLength+1) { j2 = jLength +1 } else { j2 = floor(help3) } // Coordinates for Levensthein-matrix(vector) index1 = (i-1)*matrixDim + j1 index2 = index1 + matrixDim // 2. loop: the second sequence for(j=j1; jj1 || j==1) { inser = indelcost + L[index2, 1] if(distance>inser) { distance= inser } } index2++ // Deletion if(jdel) { distance=del } L[index2,1]=distance } j3=j2 } } return(distance/(standard=="longer" ? max((cols(R),cols(C))):1)) } // --------------------------------------------------------------- // Approx-Needlman Wunsch with rawdistance (Magda Code) real scalar needlemanwunschapproxrawdist( real rowvector R, real rowvector C, real scalar indelcost, real scalar paramK, string scalar standard) { // Initializations real scalar distance //return value distance = 0 // Auxiliary variables real scalar inser real scalar del real scalar help1 real scalar help2 real scalar help3 real scalar index1 real scalar index2 real scalar i real scalar j // Length of sequences real scalar iLength real scalar jLength iLength = sqlength(R) jLength = sqlength(C) //Dimension of Levensthein-matrix real scalar matrixDim real vector v v = iLength, jLength matrixDim = max(v)+1 //Boundary of Levensthein-matrix real scalar bound bound = (iLength/jLength + jLength/iLength)/sqrt(iLength*iLength+jLength*jLength) bound = paramK * bound / sqrt(2) // Initialisation of Levensthein-matrix // Note: Levensthein as vector and initialized with infinity real matrix L L = J(matrixDim*matrixDim, 1, .) L[1,1]=0 for(i=1; i=jLength+1) { j2 = jLength +1 } else { j2 = floor(help3) } // Coordinates for Levensthein-matrix(vector) index1 = (i-1)*matrixDim + j1 index2 = index1 + matrixDim //2. loop: the second sequence for(j=j1; jj1 || j==1) { inser = indelcost + L[index2, 1] if(distance>inser) { distance= inser } } index2++ // Deletion if(jdel) { distance=del } L[index2,1]=distance } j3=j2 } } return(distance/(standard=="longer" ? max((cols(R),cols(C))):1)) } // --------------------------------------------------------------- // Approx-Needlman Wunsch with full subcost matrix (Magda Code) real scalar needlemanwunschapproxmatrix( real rowvector R, real rowvector C, real scalar indelcost, real scalar paramK, string scalar standard, real scalar mod, real matrix SQsubcost, struct elemlist vector hashtable) { // Initializations real scalar distance //return value distance = 0 // Auxiliary variables real scalar inser real scalar del real scalar help1 real scalar help2 real scalar help3 real scalar index1 real scalar index2 real scalar i real scalar j // Length of sequences real scalar iLength real scalar jLength iLength = sqlength(R) jLength = sqlength(C) //Dimension of Levensthein-matrix real scalar matrixDim real vector v v = iLength, jLength matrixDim = max(v)+1 //Boundary of Levensthein-matrix real scalar bound bound = (iLength/jLength + jLength/iLength)/sqrt(iLength*iLength+jLength*jLength) bound = paramK * bound / sqrt(2) // Initialisation of Levensthein-matrix // Note: Levensthein as vector and initialized with infinity real matrix L L = J(matrixDim*matrixDim, 1, .) L[1,1]=0 for(i=1; i=jLength+1) { j2 = jLength +1 } else { j2 = floor(help3) } // Coordinates for Levensthein-matrix(vector) index1 = (i-1)*matrixDim + j1 index2 = index1 + matrixDim // 2. loop: the second sequence for(j=j1; jj1 || j==1) { inser = indelcost + L[index2, 1] if(distance>inser) { distance= inser } } index2++ // Deletion if(jdel) { distance=del } L[index2,1]=distance } j3=j2 } } return(distance/(standard=="longer" ? max((cols(R),cols(C))):1)) } // --------------------------------------------------------------- // Extract the sequence-length real scalar sqlength(transmorphic rowvector X) { real scalar i real scalar col for (i=1;i<=cols(X);i++) { if (X[1,i] != missingof(X)) col=i } return(col) } // -----------Hashing-Table extraction of Subcosts--------------- // (Thanks to gould@stata.com, rgates@stata.com, and Phil Schumm) // Element of the list struct element { real scalar key // Element of sequence real scalar value // Place in distance matrix pointer(struct element scalar) scalar nextelem } // --------------------------------------------------------------- // Create an element of a list struct element scalar function newelem(real scalar key, real scalar value) { struct element scalar e e.key = key e.value = value return(e) } // --------------------------------------------------------------- // Defining of linked list of elements struct elemlist { struct element scalar first pointer(struct element scalar) scalar last } // --------------------------------------------------------------- // Linking of a new element at the end of the list void linkelement( struct elemlist scalar list, real scalar key, real scalar value) { // Set first element if list is empty if(list.first.key == .) { list.first.key = key list.first.value = value list.last = &(list.first) } // Link new element at the end of non empty list else { list.last->nextelem = &(newelem(key, value)) list.last = list.last->nextelem } } // --------------------------------------------------------------- // Create hash table of all possible elements of sequences (seq) struct elemlist vector function hashing( real vector seq, real scalar mod) { struct elemlist vector hashtable real scalar i real scalar hashvalue real scalar seqlength real scalar key hashtable = elemlist(mod) seqlength = length(seq) for(i=1; i<=seqlength; i++){ key = seq[i] hashvalue = mod(ceil(seq[i]), mod) + 1 linkelement(hashtable[hashvalue], key, i) } return(hashtable) } // --------------------------------------------------------------- // Extract subtitution cost real scalar subcosthash( real rowvector R, real rowvector C, real scalar a, real scalar b, real scalar mod, real matrix SQsubcost, struct elemlist vector hashtable) { real scalar key1 real scalar key2 real scalar hashvalue1 real scalar hashvalue2 real scalar addr1 real scalar addr2 struct elemlist scalar list pointer(struct element scalar) scalar el key1 = R[1,a] key2 = C[1,b] hashvalue1 = mod(ceil(key1),mod) +1 hashvalue2 = mod(ceil(key2),mod) +1 // First address list = hashtable[hashvalue1] el = &(list.first) if(el->key == key1) { // demanded element is 1st element list addr1 = el->value } else { // demanded element is ~1st element list while(el->nextelem != NULL) { el = el->nextelem if(el->key = key1) { addr1 = el->value break } } } // Second address list = hashtable[hashvalue2] el = &(list.first) if(el->key == key2) { //demanded element 1st element of list addr2 = el->value } else { //demanded element is ~1st element of list while(el->nextelem != NULL) { el = el->nextelem if(el->key = key2) { addr2 = el->value break } } } return(SQsubcost[addr1, addr2]) } // ----End of Hashing-Table extraction of Subcosts--------------- // Compile into a libary mata mlib create lsq, replace mata mlib add lsq /// sqomref() sqomfull() /// <- Callers needlemanwunschexactfixed() /// <- Exact NW with fixed subcost needlemanwunschexactrawdist() /// <- Exact NW with rawdist needlemanwunschexactmatrix() /// <- Exact NW with subcost matrix needlemanwunschapproxfixed() /// <- Approx. NW with fixed subcost needlemanwunschapproxrawdist() /// <- Approx. NW with rawdist needlemanwunschapproxmatrix() /// <- Approx. NW with subcost matrix sqlength() /// <- Extract Sequence Length subcosthash() /// <- Extract subtitution cost element() /// <- Structure elem elemlist() /// <- Structure elemlist newelem() /// <- New element of linked list linkelement() /// <- Link a new element at end of list hashing() // <- Hash table mata mlib index end exit Support: kohler@wz-berlin.de