*! version 1.2.7 PR/IRW 06dec2007. /* Recent history 1.2.7 04dec2007 Likelihood check for -logit-, -ologit- and -mlogit- from Ian White. 1.2.6 10nov2007 Updated version of -auglogit-, -augologit-, -augmlogit- from Ian White, changes # of augmented cases. 1.2.5 10oct2007 Updated version of -auglogit- from Ian White, also including -augologit-, -augmlogit-. nopp option added to suppress avoidance of perfect prediction bug. 1.2.4 31jul2007 Improved version of -auglogit- from Ian White. 1.2.3 30mar2007 Minor changes to -auglogit- and -pp_check- from Ian White. 1.2.2 27mar2007 Check for perfect prediction in logistic regression; new routine pp_check. Replace log(p/(1-p)) with logit(p), similarly for invlogit. 1.2.1 15dec2006 Allow -auglogit- to deal with perfect prediction in logistic regression. 1.2.0 30jun2006 Interval censoring implemented via interval() option. 1.1.0 03aug2005 Replace -draw- option with -match-. Default becomes draw. With prediction matching, randomly sort observations with identical predictions. Order variables in chained equations in order of increasing missingness. 1.0.4 21jun2005 Add sort, stable to enable reproducibility imputations with given seed */ program define uvis, rclass sortpreserve version 9 if `"`0'"'=="" { di as err "command required" exit 198 } gettoken cmd 0 : 0 if substr("`cmd'",1,3)=="reg" local cmd regress local normal=("`cmd'"=="regress")|("`cmd'"=="rreg")|("`cmd'"=="intreg") local binary=("`cmd'"=="logit")|("`cmd'"=="auglogit")|("`cmd'"=="logistic") local catcmd=("`cmd'"=="mlogit")|("`cmd'"=="augmlogit")|("`cmd'"=="ologit")|("`cmd'"=="augologit") if !`normal' & !`binary' & !`catcmd' { di as err "invalid or unrecognised command, `cmd'" exit 198 } syntax varlist(min=1 numeric) [if] [in] [aweight fweight pweight iweight] , Gen(string) /// [ noCONStant Delta(real 0) BOot INTerval(varlist min=1 max=2) MAtch nopp REPLACE /// SEed(int 0) * ] if "`cmd'"=="intreg" & "`match'"!="" { // interval censored response variables di as err "match not allowed with intreg" exit 198 } if "`replace'"=="" { confirm new var `gen' } if "`match'"=="match" { di as text "[imputing by prediction matching" _cont } else di as text "[imputing by drawing from conditional distribution" _cont if "`boot'"=="" { di as text " without bootstrap]" } else di as text " with bootstrap]" if "`constant'"=="noconstant" { local options "`options' nocons" } if "`cmd'"=="intreg" { gettoken ll rest : varlist gettoken ul xvars: rest local rest } else gettoken y xvars : varlist tempvar touse quietly { marksample touse, novarlist markout `touse' `xvars' /* note: does not include `y' */ * Deal with weights frac_wgt `"`exp'"' `touse' `"`weight'"' local wgt `r(wgt)' if `seed'!=0 { set seed `seed' } if "`cmd'"=="intreg" { tempvar y gen `y'=cond(missing(`ll') & missing(`ul'), ., 0) if `touse'==1 // 0 is arbitrary local yvarlist `ll' `ul' } else local yvarlist `y' * Code types of missings: 1=non-missing y, 2=missing y, 3=other missing tempvar obstype yimp gen byte `obstype'=1*(`touse'==1 & !missing(`y')) /* */ +2*(`touse'==1 & missing(`y')) /* */ +3*(`touse'==0) count if `obstype'==1 local nobs=r(N) count if `obstype'==2 local nmis=r(N) local type: type `y' gen `type' `yimp'=. * Fit imputation model (when cmd=intreg, yvar is ll ul, otherwise y) capture `cmd' `yvarlist' `xvars' `wgt', `options' local errno = c(rc) if `errno'>0 & !(`errno'==430 & "`cmd'"=="ologit") error `errno' // ignore non-convergence in ologit /* In the case of categorical regression, perform checks and use auglogit/augologit/augmlogit if perfect prediction detected. */ /* IRW: REORGANISED TO IMPROVE OUTPUT AND ENABLE LATER LIKCHECK - START*/ pp_check `cmd' local pp_cmd `s(cmd)' if "`pp_cmd'"!="" { noi di as txt "[perfect prediction detected: " _c if "`pp'"!="nopp" { tempvar augvar wtvar noi di as txt "using aug`pp_cmd' to impute " as res "`yvarlist'" as txt "]" aug `yvarlist' `xvars' `wgt', cmd(`pp_cmd') `options' augvar(`augvar') wtvar(`wtvar') } else noi di as text "no action taken]" } /* IRW: REORGANISED TO IMPROVE OUTPUT AND ENABLE LATER LIKCHECK - END */ tempname b e V chol bstar tempvar xb u matrix `b'=e(b) matrix `e'=e(b) matrix `V'=e(V) * intreg tags the lnsigma model onto the end of e(b) and e(V) - must be stripped if "`cmd'"=="intreg" { matrix `b'=`b'[1,"model:"] matrix `e'=`b' matrix `V'=`V'["model:","model:"] } local colsofb=colsof(`b') /* Check for zeroes on the diagonal of V and replace them with 1. Otherwise this makes the matrix non-positive definite. Can occur with mlogit or other commands when variables are dropped, giving zero variances. !! Is this safe to do? */ if diag0cnt(`V')>0 { forvalues j=1/`colsofb' { if `V'[`j',`j']==0 { matrix `V'[`j',`j']=1 noi di as err "[Warning: zero variance encountered for coefficient of variable `j']" } } } matrix `chol'=cholesky(`V') if `catcmd' { tempname cat local nclass=e(k_cat) /* number of classes in (ordered) categoric variable */ matrix `cat'=e(cat) /* row vector giving actual category values */ local cuts=`nclass'-1 } * Draw beta, and if necessary rmse, for proper imputation if `normal' { * draw rmse if "`cmd'"=="intreg" { local rmse=exp([lnsigma]_b[_cons]) local df=e(N)-`colsofb' } else { local rmse=e(rmse) local df=e(df_r) } local chi2=2*invgammap(`df'/2,uniform()) local rmsestar=`rmse'*sqrt(`df'/`chi2') matrix `chol'=`chol'*sqrt(`df'/`chi2') } * draw beta forvalues i=1/`colsofb' { matrix `e'[1,`i']=invnorm(uniform()) } matrix `bstar'=`b'+`e'*`chol'' /* START OF IRW CODE TO EVALUATE NORMAL APPROXIMATION IN CATEGORICAL CASE ONLY */ if "`cmd'"=="logit"|"`cmd'"=="logistic"|"`cmd'"=="mlogit"|"`cmd'"=="ologit" { tempname normapprox matrix `normapprox' = (`bstar'-`b')*syminv(`V')*(`bstar'-`b')' noi catlik, b(`b') local llhat = r(loglik) noi catlik, b(`bstar') local llstar = r(loglik) local diagnostic = `llstar'-`llhat'+`normapprox'[1,1]/2 noi di as text "[true/approx likelihood = " %5.3f as res exp(`llstar'-`llhat') /// as text "/" %5.3f as res exp(-`normapprox'[1,1]/2) /// as text " = " %5.3f as res exp(`diagnostic') as text "]" // Remove augmented observations, if perfect prediction has had to have been dealt with if "`augvar'"!="" { drop if `augvar' drop `augvar' `wtvar' } } /* END OF IRW CODE TO EVALUATE NORMAL APPROXIMATION */ if "`boot'"=="" { * Based on Ian White's code to implement van Buuren et al (1999). * draw y gen `u'=uniform() if `normal' | `binary' { * in normal or binary case, impute by sampling conditional distribution * or by prediction matching if "`match'"=="match" { * prediction matching tempvar etaobs etamis matrix score `etaobs'=`b' if `obstype'==1 matrix score `etamis'=`bstar' if `obstype'==2 * Include non-response location shift, delta. if `delta'!=0 { replace `etamis'=`etamis'+`delta' } match_normal `obstype' `nobs' `nmis' `etaobs' `etamis' `yimp' `y' } else { * sampling conditional distribution matrix score `xb'=`bstar' if `touse' if `normal' { if "`cmd'"=="intreg" { tempvar PhiA PhiB gen `PhiA'=cond(missing(`ll'), 0, norm((`ll'-`xb')/`rmsestar')) gen `PhiB'=cond(missing(`ul'), 1, norm((`ul'-`xb')/`rmsestar')) replace `yimp'=`xb'+`rmsestar'*invnorm(`u'*(`PhiB'-`PhiA')+`PhiA') } else replace `yimp'=`xb'+`rmsestar'*invnorm(`u') } else replace `yimp'=(`u'invlogit(`cutpt'-`xb') } } else { /* mlogit */ * care needed dealing with different possible base categories tempvar cusump sumexp local basecat=e(basecat) /* actual basecategory chosen by Stata */ gen `sumexp'=0 if `touse' forvalues i=1/`nclass' { tempvar xb`i' local thiscat=`cat'[1,`i'] if `thiscat'==`basecat' { gen `xb`i''=0 if `touse' } else matrix score `xb`i''=`bstar' if `touse', equation(`thiscat') replace `sumexp'=`sumexp' + exp(`xb`i'') } gen `cusump'=exp(`xb1')/`sumexp' forvalues i=2/`nclass' { replace `yimp'=`cat'[1,`i'] if `u'>`cusump' replace `cusump'=`cusump'+exp(`xb`i'')/`sumexp' replace `yimp'=. if missing(`xb`i'') } } } } } else { * Bootstrap method if "`match'"=="match" { /* match */ if `catcmd' { * predict class-specific probabilities and convert to logits forvalues k=1/`nclass' { local outk=`cat'[1,`k'] tempvar etaobs`k' etamis`k' predict `etaobs`k'' if `obstype'==1, outcome(`outk') /* probability */ replace `etaobs`k''=logit(`etaobs`k'') /* logit */ } } else { /* normal and binary cases */ tempvar etaobs etamis predict `etaobs' if `obstype'==1, xb } } * Bootstrap observed data tempvar wt gen double `wt'=. bsample if `obstype'==1, weight(`wt') if "`wgt'"!="" { replace `wt' `exp'*`wt' local w [`weight'=`wt'] } else local w [fweight=`wt'] cap `cmd' `yvarlist' `xvars' `w', `options' local errno = c(rc) if `errno'>0 & !(`errno'==430 & "`cmd'"=="ologit") error `errno' // ignore non-convergence in ologit pp_check `cmd' local pp_cmd `s(cmd)' if "`pp_cmd'"!="" { noi di as txt "[perfect prediction detected: " _c if "`pp'"!="nopp" { tempvar augvar wtvar noi di as txt "using aug`pp_cmd' to impute " as res "`yvarlist'" as txt "]" aug `yvarlist' `xvars' `w', cmd(`pp_cmd') `options' augvar(`augvar') wtvar(`wtvar') } else noi di as text "no action taken]" } if `catcmd' { if e(k_cat)<`nclass' { di as error "cannot predict outcome for all classes in bootstrap sample;" di as error "probably one or more classes has a low frequency in the original data:" di as error "try amalgamating small classes of `y' and rerunning" exit 303 } } if "`match'"=="match" { if `catcmd' { * predict class-specific probabilities and convert to logits forvalues k=1/`nclass' { local outk=`cat'[1,`k'] predict `etamis`k'' if `obstype'==2, outcome(`outk') /* probability */ replace `etamis`k''=logit(`etamis`k'') /* logit */ } * match sort `obstype', stable tempvar order distance gen `distance'=. gen long `order'=_n * For each missing obs j, find index of obs whose etaobs is closest to prediction [j]. forvalues i=1/`nmis' { local j=`i'+`nobs' * calc summed absolute distances between etamis* and etaobs* replace `distance'=0 in 1/`nobs' forvalues k=1/`nclass' { replace `distance'=`distance'+abs(`etamis`k''[`j']-`etaobs`k'') in 1/`nobs' } * Find index of smallest distance between etamis* and etaobs* sort `distance' local index=`order'[1] * restore correct order sort `order' replace `yimp'=`y'[`index'] in `j' } } else { /* normal and binary */ predict `etamis' if `obstype'==2, xb * Include non-response location shift, delta. if `delta'!=0 { replace `etamis'=`etamis'+`delta' } match_normal `obstype' `nobs' `nmis' `etaobs' `etamis' `yimp' `y' } } else { // draw matrix `bstar'=e(b) if "`cmd'"=="intreg" { matrix `bstar'=`bstar'[1, "model:"] local rmsestar=exp([lnsigma]_b[_cons]) // exp(ln RMSE) } else local rmsestar=e(rmse) gen `u'=uniform() if `normal' | `binary' { matrix score `xb'=`bstar' if `touse' if `normal' { if "`cmd'"=="intreg" { tempvar PhiA PhiB gen `PhiA'=cond(missing(`ll'), 0, norm((`ll'-`xb')/`rmsestar')) gen `PhiB'=cond(missing(`ul'), 1, norm((`ul'-`xb')/`rmsestar')) replace `yimp'=`xb'+`rmsestar'*invnorm(`u'*(`PhiB'-`PhiA')+`PhiA') } else replace `yimp'=`xb'+`rmsestar'*invnorm(`u') } else replace `yimp'=(`u'invlogit(`cutpt'-`xb') } } else { /* mlogit */ * care needed dealing with different possible base categories tempvar cusump sumexp local basecat=e(basecat) /* actual basecategory chosen by Stata */ gen `sumexp'=0 if `touse' forvalues i=1/`nclass' { tempvar xb`i' local thiscat=`cat'[1,`i'] if `thiscat'==`basecat' { gen `xb`i''=0 if `touse' } else matrix score `xb`i''=`bstar' if `touse', equation(`thiscat') replace `sumexp'=`sumexp' + exp(`xb`i'') } gen `cusump'=exp(`xb1')/`sumexp' forvalues i=2/`nclass' { replace `yimp'=`cat'[1,`i'] if `u'>`cusump' replace `cusump'=`cusump'+exp(`xb`i'')/`sumexp' replace `yimp'=. if missing(`xb`i'') } } } } if "`augvar'"!="" { drop if `augvar' drop `augvar' `wtvar' } } cap drop `gen' rename `yimp' `gen' if "`cmd'"=="intreg" replace `gen'=`ll' if !missing(`ll') & `ll'==`ul' // uncensored else replace `gen'=`y' if !missing(`y') lab var `gen' "imputed from `yvarlist'" } di _n as res `nmis' as txt " missing observations on " as res "`yvarlist'" as txt " imputed from " /// as res `nobs' as txt " complete observations." return local pp_cmd `pp_cmd' end program define match_normal * Prediction matching, normal or binary case. args obstype nobs nmis etaobs etamis yimp y quietly { * For each missing obs j, find index of observation * whose etaobs is closest to etamis[j]. tempvar sumgt tempname etamisi gen long `sumgt'=. * Sort etaobs within obstype sort `obstype' `etaobs', stable forvalues i=1/`nmis' { local j=`i'+`nobs' scalar `etamisi'=`etamis'[`j'] replace `sumgt'=sum((`etamisi'>`etaobs')) in 1/`nobs' sum `sumgt', meanonly local j1=r(max) if `j1'==0 { local index 1 local direction 1 } else if `j1'==`nobs' { local index `nobs' local direction -1 } else { local j2=`j1'+1 if (`etamisi'-`etaobs'[`j1'])<(`etaobs'[`j2']-`etamisi') { local index `j1' local direction -1 } else { local index `j2' local direction 1 } } * In case of tied etaobs values, add random offset to index in the appropriate direction count if `obstype'==1 & reldif(`etaobs', `etaobs'[`index'])<1e-7 // counts as equality scalar count`i'=r(N) if r(N)>1 { local index=`index'+`direction'*int(uniform()*r(N)) } replace `yimp'=`y'[`index'] in `j' } } end *! version 1.0.4 IW/PR 08oct2007 /* History version 0.4, 08oct2007: simplified version similar to 0.1, including ologit and mlogit also version 0.3, 12apr2007: augmentation follows X'X matrix (so that increase in X'X = old X'X / n) version 0.2, 29mar2007: weights y=0,1 in ratio 1-p:p version 0.1, 05dec2006: weights y=0,1 in ratio 1:1 */ *! 1.0 IRW 21sep07 program define _augment version 9 syntax varlist [if] [in] [fweight], [TOTALweight(real 0) TIMESweight(real 0) /// noPREServe list wtvar(string) augvar(string) noequal] if "`wtvar'"=="" local wtvar _weight if "`augvar'"=="" local augvar _augment confirm new var `wtvar' `augvar' marksample touse gettoken y xlist: varlist // Remove collinearities _rmcoll `xlist' if `touse' local xlist `r(varlist)' local nx: word count `xlist' tempvar augment wt if "`weight'"!="" gen `wt' `exp' else gen `wt' = 1 qui levelsof `y' if `touse', local(ylevels) qui tab `y' if `touse' [fweight=`wt'] local nylevels = r(r) local totw = r(N) local Nold = _N // Set total weight of added observations if `totalweight'>0 & `timesweight'>0 { di as error "Can't specify totalweight() and timesweight()" exit 498 } if `totalweight' == 0 { local totalweight = `nx' + 1 /*IRW: matches Clogg++91 and Firth93 */ if `timesweight'>0 local totalweight = `totalweight' * `timesweight' } /* global augmented `totalweight' */ // Number of added observations local Nadd = `nx'*2*`nylevels' local Nnew = `Nold'+`Nadd' di as text "Adding " as result `Nadd' as text " pseudo-observations with total weight " as result `totalweight' qui { set obs `Nnew' gen `augment' = _n>`Nold' local thisobs `Nold' foreach x of var `xlist' { sum `x' [fw=`wt'] if `touse' & !`augment' local mean = r(mean) local sd = r(sd)*sqrt((r(N)-1)/r(N)) replace `x' = `mean' if `augment' foreach yval in `ylevels' { sum `wt' if `y'==`yval' & `touse' & !`augment', meanonly local py = cond("`equal'"=="noequal", r(sum)/`totw', 1/`nylevels') foreach xnewstd in -1 1 { local ++thisobs replace `x' = `mean'+(`xnewstd')*`sd' in `thisobs' replace `y' = `yval' in `thisobs' replace `wt'= `totalweight' * `py' / (2*`nx') in `thisobs' } } } } rename `wt' `wtvar' rename `augment' `augvar' if "`list'"=="list" { di as txt _newline "Listing of added records:" list `xlist' `y' _weight if _augment, sep(4) } end program define aug syntax varlist [if] [in] [fweight], cmd(string) [TOTALweight(passthru) /// TIMESweight(passthru) wtvar(string) augvar(string) equal *] if "`wtvar'"=="" tempvar wtvar if "`augvar'"=="" tempvar augvar if "`weight'"!="" local weightexp [`weight'`exp'] _augment `varlist' `if' `in' `weightexp', `totalweight' `timesweight' `equal' wtvar(`wtvar') augvar(`augvar') `cmd' `varlist' [iw=`wtvar'], `options' // `cmd' is logit, ologit or mlogit - returned in `s(cmd)' by pp_check end program define pp_check, sclass version 9 /* Check for perfect prediction in logistic regression. If found, use aug to fit model. */ args cmd sreturn clear if "`cmd'"=="logit" | "`cmd'"=="logistic" { tempname erules // Check if rules invoked or perfect prediction found matrix `erules' = e(rules) local c = colsof(`erules') local nperfect = e(N_cdf)+e(N_cds) if missing(`nperfect') local nperfect 0 forvalues nc = 1/`c' { local nperfect = `nperfect'+(`erules'[1, `nc'] > 0) } if `nperfect'>0 { // rules or perfect prediction found - will invoke auglogit sreturn local cmd logit } } else if "`cmd'"=="ologit" { // Check if perfect prediction found if e(N_cd)>0 { // perfect prediction found - will invoke augologit sreturn local cmd ologit } } else if "`cmd'"=="mlogit" { // Check if perfect prediction found levelsof `e(depvar)', local(ylevels) tempvar p foreach level in `ylevels' { predict `p', outcome(`level') sum `p', meanonly if r(min)<1E-9 { // perfect prediction - will invoke augmlogit sreturn local cmd mlogit continue, break } drop `p' } } end program define catlik, rclass // Return log-likelihoods for logit/mlogit/ologit at a given parameter vector b syntax, b(string) local y `e(depvar)' local cmd `e(cmd)' tempvar wt if "`e(wtype)'"!="" gen double `wt' `e(wexp)' else gen double `wt'=1 if "`e(offset)'"!="" { di as error "catlik: offset not allowed" exit 498 } quietly { if "`cmd'"=="mlogit" | "`cmd'"=="ologit" { tempname ecat matrix `ecat' = e(cat) local nylevels = e(k_cat) tempvar ll xb psum gen double `ll' = . } if "`cmd'"=="mlogit" { gen double `psum' = 1 local eqno 1 forvalues i=1/`nylevels' { cap drop `xb' local yval = `ecat'[1,`i'] if `yval'!=e(basecat) { matrix score `xb' = `b', eq(#`eqno') replace `psum' = `psum'+exp(`xb') replace `ll' = `xb' if `y' == `yval' local ++eqno } else gen double `xb' = 0 replace `ll' = `xb' if `y'==`yval' } replace `ll' = `ll' - log(`psum') } else if "`cmd'"=="ologit" { tempvar p psumlag gen double `p' = . gen double `psumlag' = 0 matrix score `xb' = `b' forvalues i = 1/`nylevels' { local yval = `ecat'[1,`i'] if `i'<`nylevels' { local cut = `b'[1,colsof(`b')-`nylevels'+`i'+1] replace `p' = invlogit(`cut'-`xb')-`psumlag' replace `psumlag' = `psumlag'+`p' } else replace `p' = 1-`psumlag' replace `ll' = log(`p') if `y' == `yval' } } else if "`cmd'"=="logit" | "`cmd'"=="logistic" { tempvar ll xb matrix score `xb' = `b' gen double `ll' = (`y'!=0)*`xb'-log(1+exp(`xb')) } else { di as error "catlik doesn't work after regression command `cmd'" exit 498 } replace `ll' = `wt'*`ll' sum `ll' if e(sample), meanonly return scalar loglik = r(sum) } end