*! version 1.6.0 November 13, 2006 @ 14:04:05 *! Perform an optimal matching with Needleman-Wunsch-Algorithm *! Thanks to Wikipedia.de for nice description of NW * Source: http://de.wikipedia.org/wiki/Needleman-Wunsch-Algorithmus *1.0 Initial version *1.1 undocumented changes *1.2 undocumented changes *1.3 undoucmented changes *1.4 Version distrubted on SSC *1.5 Bug fix. Sorting != sqclusterdat -> also needed a fix in sqclusterdat.ado *1.6 Bug fix. reshape wide not possible if cov. varies within id program sqom, rclass version 9.1 syntax [if] [in] [, /// k(int 0) INDELcost(real 1) name(string) /// SUBcost(string) STandard(string) REFseqid(string) full ] // SQ-Data // ------- if "`_dta[SQis]'" == "" { di as error "data not declared as SQ-data; use -sqset-" exit 9 } // Store definition of sqom-sample for sqclusterdat // ------------------------------------------------ char _dta[SQomsample] `if' `in' // Declarations // ------------ tempvar gap epiid epicount SQom SQid cutter N tempfile SQorig SQrefseq noshrink results // Defaults // -------- local full = cond("`full'"=="full",1,0) if !`full' capture drop _SQdist quietly { // Error Checks // ------------ if "`name'" != "" confirm new variable `name' // Negative Substitution Costs? local check = real("`subcost'") if `check' <= 0 { noi di as error "subcost(<=0) invalid" exit 198 } // Construct the subcost matrix // ---------------------------- if "`subcost'" == "" local subcost = `indelcost'*2 capture confirm number `subcost' if _rc { if "`subcost'" == "rawdistance" { capture confirm numeric variable `_dta[SQis]' if _rc { noi di as error /// "subcost(rawdistance) not allowed with string variables" exit 189 } local subcost -1 } else { capture matrix SQsubcost = `subcost' if _rc { noi di as error /// "subcost() invalid: specify number, -rawdistance- or matrix" exit 198 } levelsof `_dta[SQis]', local(Element) local levels: subinstr local Element `" "' `","' , all matrix levels = `levels' matrix rownames SQsubcost = `Element' matrix colnames SQsubcost = `Element' local subcost 0 } } // Shrink the Data -> Speed // ------------------------ preserve marksample touse // Drop Sequences with Gaps if "`gapinclude'" == "" { tempvar lcensor rcensor gap by `_dta[SQiis]' (`_dta[SQtis]'), sort: gen `lcensor' = sum(!mi(`_dta[SQis]')) by `_dta[SQiis]' (`_dta[SQtis]'): gen `rcensor' = sum(mi(`_dta[SQis]')) by `_dta[SQiis]' (`_dta[SQtis]'): /// replace `rcensor' = ((_N-_n) == (`rcensor'[_N]-`rcensor'[_n])) & mi(`_dta[SQis]') by `_dta[SQiis]' (`_dta[SQtis]'): /// gen `gap' = sum(mi(`_dta[SQis]') & `lcensor' & !`rcensor') by `_dta[SQiis]' (`_dta[SQtis]'): /// replace `touse' = 0 if `gap'[_N]>0 } keep if `touse' if _N == 0 { noi di as text "(No observations)" exit } drop `lcensor' `rcensor' // Handle Standardisation Option capture confirm integer number `standard' if !_rc { drop if `_dta[SQtis]' > `standard' local standard none } else if "`standard'" == "cut" { drop if `_dta[SQis]' >= . & !`gap' by `_dta[SQiis]' (`_dta[SQtis]'), sort: gen `cutter' = _N if !`gap' sum `cutter', meanonly drop if `_dta[SQtis]' > r(min) local standard none } else if "`standard'" == "" { local standard longest } else if "`standard'" != "none" & "`standard'" != "longer" { noi di as error "standard() invalid" exit 198 } // Reshape Wide keep `_dta[SQis]' `_dta[SQiis]' `_dta[SQtis]' reshape wide `_dta[SQis]', i(`_dta[SQiis]') j(`_dta[SQtis]') unab varlist: `_dta[SQis]'* // Store reference sequence if "`refseqid'" != "" { save `results' capture confirm numeric variable `_dta[SQiis]' if !_rc { count if `_dta[SQiis]' == `refseqid' capture assert r(N) > 0 if _rc { noi di as error "reference sequence does not exist" exit 198 } keep if `_dta[SQiis]' == `refseqid' } else { count if `_dta[SQiis]' == "`refseqid'" capture assert r(N) > 0 if _rc { noi di as error "reference sequence does not exist" exit 198 } keep if `_dta[SQiis]' == "`refseqid'" } save `SQrefseq' use `results', clear } // Store a copy by `varlist', sort: gen `SQid' = 1 if _n==1 replace `SQid' = sum(`SQid') sort `SQid' save `noshrink' // Keep only one Sequence of each type by `varlist', sort: gen `N' = _N by `SQid', sort: keep if _n==1 if "`refseqid'" != "" append using `SQrefseq' sort `SQid' // Call the Mata-Function from lsq.lib (Source-Code: lsq.mata) if !`full' { capture drop _SQdist mata: sqomref("`varlist'",`indelcost',"`standard'",`k',`subcost') drop if `SQid'==. sort `SQid' save `results', replace use `noshrink', clear merge `SQid' using `results' assert _merge ==3 drop _merge keep `_dta[SQiis]' _SQdist label var _SQdist /// `"sqom with k(`k') indel(`indelcost') subcost(`subcost') refseqid(`refseqid') "' if "`name'" != "" ren _SQdist `name' sort `_dta[SQiis]' save `results', replace restore sort `_dta[SQiis]' merge `_dta[SQiis]' using `results' assert _merge!=2 drop _merge noi di as text `"Distance Variable saved as"' /// as res `" `=cond("`name'"=="","_SQdist","`name'")' "' } if `full' { noi di as text "Perform " /// as res _N*(_N-1)/2 /// as text " Comparisons with Needleman-Wunsch Algorithm" mata: sqomfull("`varlist'",`indelcost',"`standard'",`k',`subcost') restore noi di as text "Distance matrix saved as " as res "SQdist" } } // Return // ------ return local name =cond("`name'"=="","_SQdist","`name'") end exit