# file o:/NonTaxonic/MAXSLOPE.S MAXSLOPE <- function(data, adjust=T, print.flag=T, dbug=F) { #---------------------------------------------------------------------------# # Driver program for MAXSLOPE # # Generates all non-redundant pairs of variables in dataset, runs # # MAXSLOPEpair() for each pair, accumulates results into one big list, # # and returns list to caller # # Author: William M. Grove # # Language: R # # Last modified: July 22, 2003 # # # # Args: # # data = data matrix to be analyzed (columns correspond to # # indicator variables, rows to observations # # adjust = T means adjust for non-zero within-class regressions, # # F means do not adjust (pretend within-class regression # # slopes are all zero) # # print.flag = T if graphs on screen, error messages to console, etc., # # are wanted # # F if not (e.g., batch/simulation use) # # Note: Whether print.flag = T or F, all key results # # are saved and returned by routine. # # dbug = T means print out intermediate results, # # F means don't print them # # # # Returns list with 3 components. They are, in order: # # allresults = vector of lists, 1 element-list per pair of indicator # # variables. Each element-list has components: # # i = column # of one indicator, which was the one first # # used as input indicator in MAXSLOPEpair() # # j = column # of other indicator (note: j > i always) # # reg.yx.adj = regression of Y on X, with effect of (assumed linear, # # assumed common to both taxon & complement class) # # within-class regression adjusted out (of spline- # # smoothed regression) # # slope.yx = slope of regression of Y on X, at X=x, based on # # spline-smoothed regression # # reg.xy.adj = regression of X on Y, with effect of (assumed linear, # # assumed common to both taxon & complement class) # # within-class regression adjusted out (of spline- # # smoothed regression) # # slope.xy = slope of regression of X on Y, at Y,=y based on # # spline-smoothed regression # # maxslope.pt = value of indicator where conditional slope is # # is maximized (2-element vector, 1st is where slope.ji # # is maximized on indicator i, 2nd is where slope.ij is # # maximized on indicator j) # # max.slope = value of smoothed slope at maximum, NOT location of # # maximum (2-element vector, 1st is for slope.ji, 2nd # # for slope.ij) # # hitmax.cut = location of HITMAX cut (2-element vector, 1st is for # # indicator i, 2nd for indicator j) # # P = estimates of big P (mixing proportion)---1st element # # based on behavior of Y (indic. j), 2nd on X (indic i) # # mu.X = taxon & C.C. means for X (indicator i) # # mu.Y = taxon & C.C. means for Y (indicator j) # # var.X = taxon & C.C. variances for X (indic i) # # var.Y = taxon & C.C. variances for Y (indic j) # # b1.within = within-class regression slope as 2-element vector # # (1st for Y on X, 2nd for X on Y) # # bayeprob = Bayes posterior probability of taxon membership, # # given knowledge only of a single variable (4 col # # matrix, 1st col=X-value, 2nd col=post. prob. when # # only X is known, 3rd=Y-value, 4th=post. prob. when # # only Y is known) # # density.X = estimated latent frequency curves, 1st col= ordinate # # for i_th indicator, 2nd col= curve for taxon, 3rd # # col= for C.C. # # density.Y = est'd freq curves, 1st col= ordinate for j_th # # indicator, 2nd col= taxon curve, 3rd col= C.C. curve # # errmsg = text message, explaining any error encountered # # stats = list with components: # # P = average P estimate across indicator pairs (scalar) # # mu = avg est's of class means, 1st row=taxon, 2nd=C.C., # # col denotes which variable # # vars = avg est's of class variances, 1st row=taxon, 2nd=C.C., # # col denotes which vbl # # bayeprob = Bayes posterior probability of taxon membership, given # # knowledge of all variables # # density = avg est's of latent frequency curves---3*k col matrix, # # cols are in triplets---1st=ordinate, 2nd=taxon curve, # # 3rd=C.C. curve # # errmsg = list with one component: # # errmsg = text message, explaining any error encountered # # # # WARNING TO THE USER!!! # # Note: If one relies on assumption that MAXSLOPE and HITMAX coincide, # # parameter estimates can be off by a whole lot, and systematically too. # # This equality holds for Gaussian admixture (with common within- # # population variance), but not necessarily for other mixtures. # # This is why one should, instead of relying on the dubious assump- # # tion that MAXCOV=HITMAX, use the parameter estimates the procedure # # delivers, which estimate HITMAX in a different way, and then use that # # HITMAX estimate (plus conditional variance information and overall # # covariance information) to estimate P and taxon mean-complement class # # mean for each indicator variable in pair. # # For proof of difference between MAXSLOPE point and HITMAX cut, for # # general mixtures, see Grove, W.M. (submitted), "A More Rigorous # # Derivation of the MAXSLOPE Taxometric Method: Relationship of # # MAXSLOPE Point to HITMAX Cut." # # # # On the other hand, the general behavior of the local regression slope as # # a function of X (input variable) (i.e., "sombrero shape" with # # conditional slope increasing from low values of input variable, # # reaching maximum somewhere in interior of its range, then declining # # again to asymptote at high values of X, does appear to be a reasonable # # graphical indication of taxonicity, even for non-Gaussian admixed # # distributions, as long as they're unimodal and likelihood monotonicity # # holds. # # # #---------------------------------------------------------------------------# nada <- as.numeric(NA) # handy constant k <- ncol(data) if(k < 2) { stop("\n\nNot enough columns in data matrix: Need k >= 2.\n") } # construct 2-col matrix whose rows each contain a distinct # pair of integers (standing for variables' column numbers # in data[]), in the range 1:k n.pairs <- k * (k - 1) / 2 vbl.pairs <- matrix(nada, nrow=n.pairs, ncol=2, byrow=T) rownum <- 0 for(i in 1:(k-1)) { # for k=5, i=1, 2, 3, 4 for(j in (i+1):k) { # j=2,3,4,5 3,4,5 4,5 5 rownum <- rownum + 1 vbl.pairs[rownum, 1] <- i vbl.pairs[rownum, 2] <- j } } # delete rows of data matrix with missing values (pairwise deletion would # keep more data, but would also make it very hard to aggregate parameter # estimates for curves, etc. later on) N <- nrow(data) slct <- rep(T, N) for(i in 1:k) { slct <- slct & !is.na(data[, i]) } data <- data[slct, ] N <- nrow(data) # do MAXSLOPE once on each pair of variables (X and Y swap roles in # MAXSLOPEpair(), so no need to do swapping here) allresults <- vector(mode="list", length=n.pairs) errmsg <- "OK" for(pair in 1:n.pairs) { j <- vbl.pairs[pair, 2] i <- vbl.pairs[pair, 1] allresults[[pair]] <- MAXSLOPEpair(data, j, i, adjust=adjust, print.flag=print.flag, dbug=dbug) if(allresults[[pair]]$errmsg != "OK") {# there was an error encountered errmsg <- allresults[[pair]]$errmsg } } # average parameter estimates, curves across indicator variables (for some # parameters & curves) or indicator pairs (for other parameters) P.sum <- n.P <- rep(0, 2) mu.sum.t <- n.mu.t <- mu.sum.c <- n.mu.c <- var.sum.t <- n.var.t <- var.sum.c <- n.var.c <- cnt <- rep(0, k) llr <- matrix(0, nrow=N, ncol=k) # log(likelihood ratio) density <- matrix(0, nrow=N, ncol=3 * k) # each triple of col's is for 1 # vbl---1st is ordinate, 2nd # is taxon curve, 3rd is C.C. # curve n.density <- rep(0, k) for(pair in 1:n.pairs) { j <- allresults[[pair]]$j # output variable i <- allresults[[pair]]$i # input variable # aggregate taxon base rate estimates if(!is.na(allresults[[pair]]$P[1])) { tmp <- allresults[[pair]]$P[1] tmp <- min(max(tmp, 0), 1) # force to lie in (0, 1) if(tmp == 0) { # keep from taking bad logs tmp <- 1e-6 } else if (tmp == 1) { tmp <- 1 - 1e-6 } P.sum[1] <- P.sum[1] + log(tmp / (1 - tmp)) # log(odds ratio) better metric for # averaging than P, because # errors of estimation have more # symmetric distribution n.P[1] <- n.P[1] + 1 } if(!is.na(allresults[[pair]]$P[2])) { tmp <- allresults[[pair]]$P[2] tmp <- min(max(tmp, 0), 1) if(tmp == 0) { tmp <- 1e-6 } else if (tmp == 1) { tmp <- 1 - 1e-6 } P.sum[2] <- P.sum[2] + log(tmp / (1 - tmp)) n.P[2] <- n.P[2] + 1 } # aggregate taxon & C.C. mean estimates if(!is.na(allresults[[pair]]$mu.X[1])) { mu.sum.t[i] <- mu.sum.t[i] + allresults[[pair]]$mu.X[1] n.mu.t[i] <- n.mu.t[i] + 1 } if(!is.na(allresults[[pair]]$mu.X[2])) { mu.sum.c[i] <- mu.sum.c[i] + allresults[[pair]]$mu.X[2] n.mu.c[i] <- n.mu.c[i] + 1 } if(!is.na(allresults[[pair]]$mu.Y[1])) { mu.sum.t[j] <- mu.sum.t[j] + allresults[[pair]]$mu.Y[1] n.mu.t[j] <- n.mu.t[j] + 1 } if(!is.na(allresults[[pair]]$mu.Y[2])) { mu.sum.c[j] <- mu.sum.c[j] + allresults[[pair]]$mu.Y[2] n.mu.c[j] <- n.mu.c[j] + 1 } # aggregate taxon & C.C. within-class variance estimates tmp <- allresults[[pair]]$var.X[1] if(!is.na(tmp)) { # log(sigma) is better metric for # averaging than sigma^2, be- # cause errors of estimate are # more symmetrically distributed tmp <- max(tmp, 0) # avoid taking log of zero or neg var.sum.t[i] <- var.sum.t[i] + log(sqrt(tmp)) n.var.t[i] <- n.var.t[i] + 1 } tmp <- allresults[[pair]]$var.X[2] tmp <- max(tmp, 0) if(!is.na(tmp)) { var.sum.c[i] <- var.sum.c[i] + log(sqrt(tmp)) n.var.c[i] <- n.var.c[i] + 1 } tmp <- allresults[[pair]]$var.Y[1] tmp <- max(tmp, 0) if(!is.na(tmp)) { var.sum.t[j] <- var.sum.t[j] + log(sqrt(tmp)) n.var.t[j] <- n.var.t[j] + 1 } tmp <- allresults[[pair]]$var.Y[2] tmp <- max(tmp, 0) if(!is.na(tmp)) { var.sum.c[j] <- var.sum.c[j] + log(sqrt(tmp)) n.var.c[j] <- n.var.c[j] + 1 } # aggregation pertinent to later calculation of Bayesian posterior # probabilities, given knowledge of all indicator values for an # individual # first, we have to change from posteriors returned by MAXSLOPEpair() # to log-likelihood ratios (log of ratio is better metric for # averaging than raw likelihood, because estimation errors more # symmetrically distributed) bayeprob <- allresults[[pair]]$bayeprob if(any(is.na(bayeprob))) { cat("line 251: grabbed bayeprob from all.results[[pair]]\n") cat(" pair=", pair, ", i=", i, ", j=", j, ", bayeprob=\n", sep="") print(bayeprob) stop("I quit") } if(any(is.na(bayeprob))) { for(ii in 1:N) { if(is.na(bayeprob[ii, 1])) { bayeprob[ii, 1] <- data[ii, i] } if(is.na(bayeprob[ii, 2])) { bayeprob[ii, 2] <- 0.5 } if(is.na(bayeprob[ii, 3])) { bayeprob[ii, 3] <- data[ii, j] } if(is.na(bayeprob[ii, 4])) { bayeprob[ii, 4] <- 0.5 } } } # post. prob. given X effectively has P (taxon base rate) already # multiplied in, so we need to factor that back out, to get only # that part of post. prob. related to X, excluding anything related # to base rate. Making bayeprob[] sum to 1 across col's 2 & 4 # accomplishes this. tmp <- sum(bayeprob[, 2]) if(tmp != 0) { bayeprob[, 2] <- bayeprob[, 2] / tmp } else if(is.na(tmp)) { cat("line 283: sum(bayeprob[, 2]=", sum(bayeprob[, 2]), "\n", sep="") stop("I quit") } tmp <- sum(bayeprob[, 4]) if(tmp != 0) { bayeprob[, 4] <- bayeprob[, 4] / tmp } else if(is.na(tmp)) { cat("line 290: sum(bayeprob[, 4]=", sum(bayeprob[, 4]), "\n", sep="") stop("I quit") } if(dbug & any(is.na(bayeprob))) { cat("line 294: pair=", pair, ", i=", i, " , j=", j, " bayeprob=\n") print(bayeprob) stop("I quit") } llr[, i] <- llr[, i] + log(bayeprob[, 2] / (1 - bayeprob[, 2])) cnt[i] <- cnt[i] + 1 llr[, j] <- llr[, j] + log(bayeprob[, 4] / (1 - bayeprob[, 4])) # here we aggregate latent frequency curves, whose ordinates are already # "in sync" due to previous linear interpolation at same ordinate # values for a given input variable), no matter what output variable # was tmp <- allresults[[pair]]$density.X # N \times 3 matrix # copy cols of input density matrices into appropriate cols in density # matrix if(all(!is.na(tmp[, 2]))) { density[, (i - 1) * 3 + 1:3] <- density[, (i - 1) * 3 + 1:3] + tmp n.density[i] <- n.density[i] + 1 } tmp <- allresults[[pair]]$density.Y if(all(!is.na(tmp[, 2]))) { density[, (j - 1) * 3 + 1:3] <- density[, (j - 1) * 3 + 1:3] + tmp n.density[j] <- n.density[j] + 1 } } # now do the averaging tmp <- exp(sum(P.sum) / sum(n.P)) # average of odds ratios P <- tmp / (tmp + 1) # convert to probability mu <- vars <- matrix(nada, nrow=2, ncol=k) for(i in 1:k) { mu[1, i] <- mu.sum.t[i] / n.mu.t[i] mu[2, i] <- mu.sum.c[i] / n.mu.c[i] vars[1, i] <- (exp(var.sum.t[i] / n.var.t[i]))^2 vars[2, i] <- (exp(var.sum.c[i] / n.var.c[i]))^2 } for(i in 1:k) { if(cnt[i] > 0) { llr[i] <- llr[i] / cnt[i] # average of log likelihood ratios } } for(i in 1:k) { # average latent freq curves density[, (i - 1) * 3 + 1:3] <- density[, (i - 1) * 3 + 1:3] / n.density[i] } # calculate Bayes posterior probabilities, assuming local independence of # indicators. This isn't actually correct when b1.yx.within != 0. # Within MAXSLOPEpair(), pairwise non-independence of indicators is # adjusted for. However, when indicators (say) 1 through k are all # correlated, adjusting indicator 1 for indicator 2 doesn't make # calculated posteriors based on indicator 1 alone independent of # indicators 3, ..., k. However, the correct form of adjustment depends # not only on k-fold correlation pattern, but also form of density. Since # MAXSLOPEpair() doesn't make strong assumptions about form of density # (only requires monotonicity of likelihood ratio L(taxon)/L(C.C.) as you # move from low to high on each indicator), we don't know the correct # form of the adjustment. The independence formula is used as an # heuristic substitute, which won't be too far off if residual # intercorrelations (between, e.g., indicator1 [adjusted for indicator2] # and indicators3, ..., k) are not too large. P <- max(min(P, 0), 1) if(P == 0) { P <- 1e-6 } else if (P == 1) { P <- 1 - 1e-6 } log.prior.OR <- log(P / (1 - P)) # log(prior odds ratio) log.post.OR <- rep(log.prior.OR, N) for(i in 1:k) { log.post.OR <- log.post.OR + llr[, i] } tmp <- exp(log.post.OR) # convert to odds ratio bayeprob <- tmp / (tmp + 1) # convert to probability # pass back cumulated results as vector of lists tmp <- vector(mode="list", length=3) tmp$allresults <- allresults tmp$stats <- list(P=P, mu=mu, vars=vars, bayeprob=bayeprob, density=density) tmp$errmsg <- errmsg return( tmp ) } MAXSLOPEpair <- function(data, indx2, indx1, adjust=T, print.flag=T, dbug=F) { #===========================================================================# # MAXSLOPEpair # # Simple graphical taxometric method for taxometrics based (in part) on # # finding X-value where maximum slope of Y-on-X regression occurs. # # Author: William M. Grove # # Language: R # # Last modified: July 22, 2003 # # # # arguments # # data = data matrix # # indx2 = column number of second (first output) variable in pair # # indx1 = column number of first (first input) variable in pair # # adjust = T means adjust for (possibly) non-zero within-latent # # class slopes, # # T means assume within-class slopes are zero (the same # # way MAMBAC does) # # print.flag = T (user wants info printed) # # F (user doesn't want info printed) # # dbug = T means print intermediate output # # F means don't print it # # # # Upon completion, program returns list with components: # # i = column # of one ("first") indicator # # j = column # of other indicator (note: j > i always) # # reg.yx.adj = regression of Y on X, with effect of (assumed linear, # # assumed common to both taxon & complement class) within- # # class regression adjusted out (of spline-smoothed # # regression) # # slope.yx = slope of regression of Y on X, at X=x, based on spline- # # smoothed regression # # reg.xy.adj = regression of X on Y, with effect of (assumed linear, # # assumed common to both taxon & complement class) within- # # class regression adjusted out (of spline-smoothed # # regression) # # slope.xy = slope of regression of X on Y, at Y,=y based on spline- # # smoothed regression # # max.slope = value of smoothed slope at maximum, NOT location of # # maximum (2-element vector, 1st is for slope.ji, 2nd for # # slope.ij) # # maxslope.pt = value of indicator where conditional slope is # # is maximized (2-element vector, 1st is where slope.ji # # is maximized on indicator i, 2nd is where slope.ij is # # maximized on indicator j) # # hitmax.cut = location of HITMAX cut (2-element vector, 1st is for # # indicator i, 2nd for indicator j) # # P = estimates of big P (taxon base rate), first based on # # behavior of Y (indic. j), second based on that of X # # (indic. i) # # mu.X = taxon & C.C. means on X (indicator i) # # mu.Y = taxon & C.C. means on Y (indicator j) # # var.X = taxon & C.C. variances on X (indicator i) # # var.Y = taxon & C.C. variances on Y (indicator j) # # b1.within = within-class regression slope as 2-element vector (1st # # for Y on X, 2nd for X on Y) # # bayeprob = Bayes posterior probability of taxon membership, given # # knowledge of single variable (4 col matrix, 1st col is # # X-value, 2nd col is post. prob. conditional on knowing # # observation's X-value, 3rd col is Y-value, 4th col is # # post. prob. conditional on knowing observation's # # Y-value # # density.X = estimated latent frequency curves---col 1= X-values # # (indicator i), col 2= curve for taxon, col 3= for C.C. # # density.Y = est'd latent freq curves---col1=Y-values (indic. j), # # col 2= curve for taxon, col3= for C.C. # # errmsg = text message, explaining any error encountered # # # #===========================================================================# nada <- as.numeric(NA) # handy constant N <- nrow(data) if(dbug) { cat("at top of MAXSLOPEpair() routine\n") } # Documentary note added 1/15/2004: # All references to step numbers in code below are based on original version # of MAXSLOPE paper. Final version has substantially changed steps and # step numbers for the algorithm! Code, however, does in fact carry out # exactly the algorithm described in final version of paper, except for # carrying out calculation for last consistency test mentioned (the one # that uses a Pearson correlation between pairs of slope plots, when # there are k >= 3 variables). # Steps 1--7 of procedure for obtaining MAXSLOPE curve + parameter estimates # are in MAXSLOPExy() result.yx <- MAXSLOPEyx(data, indx2, indx1, adjust=adjust, print.flag=print.flag, dbug=dbug) # regress Y on X if(result.yx$errmsg == "OK") { regr.yx.adj <- result.yx$regr.yx.adj slope.yx <- result.yx$slope.yx b1.yx.within <- result.yx$b1.yx.within maxslope.pt.X <- result.yx$maxslope.pt.X max.slope.X <- result.yx$max.slope.X hitmax.cut.X <- result.yx$hitmax.cut.X mu.cY.biased <- result.yx$mu.cY.biased mu.tY.biased <- result.yx$mu.tY.biased # the bias in above equals $b_1 \mu_{cX}$ ($b_1 \mu_{tX}$) } else { regr.yx.adj <- slope.yx <- cbind(data[, indx2], rep(NA, N)) } # Step 9 --- \item Reverse roles of X and Y; re-do steps 1 through 8, # and thereby estimate $\delta_X$ (and related stuff) result.xy <- MAXSLOPEyx(data, indx1, indx2, adjust=adjust, print.flag=print.flag, dbug=dbug) # regress X on Y if(result.xy$errmsg == "OK") { regr.xy.adj <- result.xy$regr.yx.adj slope.xy <- result.xy$slope.yx b1.xy.within <- result.yx$b1.yx.within maxslope.pt.Y <- result.xy$maxslope.pt.X max.slope.Y <- result.xy$max.slope.X hitmax.cut.Y <- result.xy$hitmax.cut.X mu.cX.biased <- result.xy$mu.cY.biased mu.tX.biased <- result.xy$mu.tY.biased # the bias in above equals $b_1 \mu_{cY}$ ($b_1 \mu_{tY}$) } else { regr.xy.adj <- slope.xy <- cbind(data[, indx1], rep(NA, N)) } if(result.yx$errmsg != "OK" | result.xy$errmsg != "OK") { if(result.yx$errmsg != "OK") { errmsg <- result.yx$errmsg } else { errmsg <- result.xy$errmsg } # construct bayeprob[] such that missing info from a MAXSLOPE pair # (i, j) won't screw up calculation of overall Bayes posterior # prob's of taxon membership, in MAXSLOPE(): x <- data[, indx1] y <- data[, indx2] pp.x <- pp.y <- rep(0.5, N) bayeprob <- cbind(x, pp.x, y, pp.y) if(dbug) { cat("about to do ERROR return from MAXSLOPEpair():\n") cat("result.yx$errmsg=\n") cat(" ", result.yx$errmsg, "\n", sep="") cat(" i=", indx1, ", j=", indx2, ", bayeprob=\n", sep="") cat("result.xy$errmsg=\n") cat(" ", result.xy$errmsg, "\n", sep="") print(bayeprob) } # we can't proceed with parameter estimation, so return with error return( list(i=indx1, j=indx2, reg.yx.adj=regr.yx.adj, slope.yx=slope.yx, reg.xy.adj=regr.yx.adj, slope.xy=slope.xy, maxslope.pt=rep(nada, 2), max.slope=rep(nada, 2), hitmax.cut=rep(nada, 2), P=rep(nada, 2), mu.X=rep(nada, 2), mu.Y=rep(nada, 2), var.X=rep(nada, 2), var.Y=rep(nada, 2), b1.within=rep(nada, 2), bayeprob=bayeprob, density.X=matrix(nada, nrow=N, ncol=3), density.Y=matrix(nada, nrow=N, ncol=3), errmsg=errmsg) ) } # Step 8' ---\item Remove asymptotic biases in $\mu$ estimates, which # involves summing even vs. odd powers in infinite series in $b_1$ (see # brief derivation in file # '~/wp/MSs/MAXSLOPE-vs-HITMAX-cut/bias_correction.tex' for some more info # on this. The derivation of the formulas, and this algorithmic step, are # not discussed in submitted version of MAXSLOPE paper). The series # converges iff abs(b1) < 1, so if this condition not satisfied, we use a # one-step bias correction to remove much, but not all, of bias. (Note: # if data are standardized before being sent to this routine, the # condition will be satisfied.) if(dbug) { cat("line 533: removing biases in mu estimates\n") } if(dbug) { cat("mu.cY.biased=", mu.cY.biased, "\n", sep="") cat("mu.tY.biased=", mu.tY.biased, "\n", sep="") cat("mu.cX.biased=", mu.cX.biased, "\n", sep="") cat("mu.tX.biased=", mu.cX.biased, "\n", sep="") } flag.Y <- flag.X <- F r <- b1.yx.within # "r" to keep formulas compact if(abs(r) < 1) { if(r != 0) { c.o <- (1 / (1 / r^2 - 1)) * (1 / (1 - r) - 1) # odd power terms in geom. series c.e <- c.o / r # even power terms in geom. series mu.cY <- mu.cY.biased + c.o * mu.cX.biased + c.e * mu.cY.biased mu.tY <- mu.tY.biased + c.o * mu.tX.biased + c.e * mu.tY.biased } else { # no correction needed mu.cY <- mu.cY.biased mu.tY <- mu.tY.biased } flag.Y <- F } else { # one-step correction mu.cY <- mu.cY.biased + r * mu.cX.biased mu.tY <- mu.tY.biased + r * mu.tX.biased flag.Y <- T } r <- b1.xy.within if(abs(r) < 1) { if(r != 0) { c.o <- (1 / (1 / r^2 - 1)) * (1 / (1 - r) - 1) c.e <- c.o / r mu.cX <- mu.cX.biased + c.o * mu.cY.biased + c.e * mu.cX.biased mu.tX <- mu.tX.biased + c.o * mu.tY.biased + c.e * mu.tX.biased } else { mu.cX <- mu.cX.biased mu.tX <- mu.tX.biased } flag.X <- F } else { mu.cX <- mu.cX.biased + b1.xy.within * mu.cY.biased mu.tX <- mu.tX.biased + b1.xy.within * mu.tY.biased flag.X <- T } if(dbug) { cat("mu.cX=", mu.cX, "\n", sep="") cat("mu.tX=", mu.tX, "\n", sep="") } if(flag.Y & !flag.X) { errmsg <- "mu.cY, mu.tY biased" } else if(!flag.Y & flag.X) { errmsg <- "mu.cX, mu.tX biased" } else if(flag.Y & flag.X) { errmsg <- "mu.cY, mu.tY, mu.cX, mu.tX biased" } else { errmsg <- "OK" } if(!is.na(mu.tY) & !is.na(mu.cY)) { delta.Y <- mu.tY - mu.cY } else { delta.Y <- 0 } if(!is.na(mu.tX) & !is.na(mu.cX)) { delta.X <- mu.tX - mu.cX } else { delta.X <- 0 } # re-estimate HITMAX cuts, now that we have better estimates of mu's if(dbug) { cat("line 593: re-estimating HITMAX cuts\n") } ixy <- cbind(1:N, data[, indx1], data[, indx2]) # i=original row number, # x=x, # y=y ord <- order(ixy[, 2]) # sort on X ixy <- ixy[ord, ] # extra 1st col is to keep track # of original order of rows if(dbug & any(is.na(ixy[, 2]))) { cat("line 603: y=\n") print(ixy[, 2]) stop("I quit") } Y.hitmax <- (mu.cY + mu.tY) / 2 options(warn=-1) y <- ixy[, 3] x <- ixy[, 2] reg.yx.gam <- gam(y ~ s(x, k=8, bs="cr")) options(warn=0) y.adj <- predict(reg.yx.gam) - b1.yx.within * x # this will get used for finding # asymptote, adjusting HITMAX # cut estimate, and estimating # Bayes posteriors ixya <- cbind(ixy, y.adj) # i=original row number, # x=x, # y=y, # a=y.adj rm(x, y, y.adj) # linearly interpolate to find HITMAX cut on X ord <- order(ixya[, 4]) # sort on adjusted Y ixya <- ixya[ord, ] if(Y.hitmax < ixya[1, 4]) { # BUG here: Y.hitmax missing (VERY RARELY) i0 <- 1 } else { i0 <- max((1:N)[ixya[, 4] < Y.hitmax]) } if(Y.hitmax > ixya[N, 4]) { i1 <- N } else { i1 <- min((1:N)[ixya[, 4] > Y.hitmax]) } if(i0 != i1) { hitmax.cut.X <- ixya[i0, 2] + (Y.hitmax - ixya[i0, 4]) / (ixya[i1, 4] - ixya[i0, 4]) * (ixya[i1, 2] - ixya[i0, 2]) } else { hitmax.cut.X <- ixya[i0, 2] } X.hitmax <- (mu.cX + mu.tX) / 2 options(warn=-1) x <- ixya[, 2] ; y <- ixya[, 3] reg.xy.gam <- gam(x ~ s(y, k=8, bs="cr")) options(warn=0) x.adj <- predict(reg.xy.gam) - b1.xy.within * y # will get used for finding # asymptote, adjusting HITMAX # cut estimate, and estimating # Bayes posteriors ixyaa <- cbind(ixya, x.adj) # i=original row number, # x=x, # y=y, # a=y.adj, # a=x.adj rm(x, y, x.adj) # linearly interpolate to find HITMAX cut on Y ord <- order(ixyaa[, 5]) # sort on adjusted X ixyaa <- ixyaa[ord, ] if(X.hitmax < ixyaa[1, 5]) { i0 <- 1 } else { i0 <- max((1:N)[ixyaa[, 5] < X.hitmax]) } if(X.hitmax > ixyaa[N, 5]) { i1 <- N } else { i1 <- min((1:N)[ixyaa[, 5] > X.hitmax]) } if(i0 != i1) { hitmax.cut.Y <- ixyaa[i0, 3] + (X.hitmax - ixyaa[i0, 5]) / (ixyaa[i1, 5] - ixyaa[i0, 5]) * (ixyaa[i1, 3] - ixyaa[i0, 3]) } else { hitmax.cut.Y <- ixyaa[i0, 2] } # Step 10 of original algorithm replaced with _much_ simpler approach: # Step 10' ---\item solve linear equations for $P$: # $\mu_Y = Q \mu_{cY} + P \mu_{tY}$ and # $\mu_X = Q \mu_{cX} + P \mu_{tX}$ mu.X <- mean(ixyaa[, 2]) mu.Y <- mean(ixyaa[, 3]) if(!is.na(mu.cY) & delta.Y != 0) { P.y <- (mu.Y - mu.cY) / delta.Y } else { P.y <- 0 } if(!is.na(mu.cX) & delta.X != 0) { P.x <- (mu.X - mu.cX) / delta.X } else { P.x <- 0 } P <- (P.y + P.x) / 2 Q <- 1 - P # Step 11 is new (i.e., not discussed in submitted version of MAXSLOPE # paper): # Step 11 ---\item estimate $\sigma^2_{cY}, \sigma^2_{tY}, \sigma^2_{cY}, # \sigma^2_{tY}$ by solving equations given below at *** # First, we have to find variances of Y at HITMAX cut on X, and vice # versa. This is done in old algorithm's Step 8. # Step 8 --- \item Using constant-width narrow interval(s) on X, examine # smoothed conditional variance of Y as function of X, # You can either (a) focus only on interval around # hitmax.cut just determined, or (b) look at whole # conditional variance function and find its peak. # If you do both, and they agree (to a certain # tolerance), this is evidence you have good estimate of # hitmax.cut. # In any case, estimate value of conditional variance of # $Y$, at (one or two estimated) HITMAX cut(s). If you have # two cuts, and the conditional variances at these cuts # differ (by more than a certain tolerance), you have a # problem---essentially, you just failed a consistency # test. I haven't decided whether that should kill the # procedure, let alone what the tolerances should be. # Just to get this up and running, we program (a) above; # i.e., just look at interval around previously re-estimated # hitmax cut. # start by locating location of X-value nearest to HITMAX cut on X ord <- order(ixyaa[, 2]) # sort on X ixyaa <- ixyaa[ord, ] if(hitmax.cut.X < ixyaa[1, 2]) { i0 <- 1 } else { i0 <- max((1:N)[ixyaa[, 2] <= hitmax.cut.X]) } if(hitmax.cut.X > ixyaa[N, 2]) { i1 <- N } else { i1 <- max((1:N)[ixyaa[, 2] >= hitmax.cut.X]) } x.indx <- round((i0 + i1) / 2) # calculate raw conditional variance function (i.e., variance of Y as a # function of X), in as narrow an interval as possible, i.e.: bin.n <- 3 # arbitrary choice, would be better # if we used cross-validated # estimate of bin width bin.n2 <- (bin.n - 1) / 2 var.cond <- rep(nada, N - 2) for(i in (bin.n2 + 1):(N - bin.n2)) { y.in.bin <- ixyaa[(i - bin.n2):(i + bin.n2), 3] var.cond[i - 1] <- var(y.in.bin) } # smooth it, then find smoothed value at X-value nearest hitmax.cut on X var.cond.smu <- supsmu((2:(N - 1)), var.cond) var.Y.hitmax <- var.cond.smu$y[x.indx - 1] # Now, find location of Y-value nearest to hitmax.cut on Y ord <- order(ixyaa[, 3]) # sort on Y ixyaa <- ixyaa[ord, ] if(hitmax.cut.Y > ixyaa[N, 3]) { i0 <- N } else { i0 <- max((1:N)[ixyaa[, 3] <= hitmax.cut.Y]) } if(hitmax.cut.Y < ixyaa[1, 3]) { i1 <- 1 } else { i1 <- min((1:N)[ixyaa[, 3] >= hitmax.cut.Y]) } y.indx <- round((i0 + i1) / 2) # calculate raw conditional variance function (i.e., variance of X as a # function of Y), in as narrow an interval as possible, i.e.: var.cond <- rep(nada, N - 2) for(i in (bin.n2 + 1):(N - bin.n2)) { x.in.bin <- ixyaa[(i - bin.n2):(i + bin.n2), 2] var.cond[i - 1] <- var(x.in.bin) } # smooth it, then find smoothed value at Y-value nearest hitmax.cut on Y var.cond.smu <- supsmu(2:(N - 1), var.cond) var.X.hitmax <- var.cond.smu$y[y.indx - 1] # now, solve the following linear equations for the within-taxon & # within-C.C. variances (alas, only enough equations here to fit # constants, not enough for consistency tests or even goodness-of-fit # tests, at this point). The following calculation assumes that # P != 1/2, else equations are indeterminate. x <- ixyaa[, 2] if(P != 1/2) { # solve linear equations # Ú ¿ Ú ¿ Ú ¿ # ³ z ³ ³ 1/2 1/2 ³ ³ x1 ³ # ³ ³ = ³ ³ ³ ³ # ³ w ³ ³ P Q ³ ³ x2 ³ # À Ù À Ù À Ù # by substitution: z <- var.X.hitmax - 1/4 * delta.X^2 w <- var(x) - P * Q * delta.X^2 x1 <- (w - 2 * P * z) / (Q - P) x2 <- 2 * z - x1 # assign the solution values to names we want: var.tX <- x1 var.cX <- x2 } else { var.cY <- var.tY <- nada } rm(x) y <- ixyaa[, 3] if(P != 1/2) { z <- var.Y.hitmax - 1/4 * delta.Y^2 w <- var(y) - P * Q * delta.Y^2 x1 <- (w - 2 * P * z) / (Q - P) x2 <- 2 * z - x1 var.tY <- x1 var.cY <- x2 } else { var.cY <- var.tY <- nada } rm(y) # now we estimate Bayes posterior probabilities of taxon membership, needed # for estimating latent frequency curves. # "pp.x" is Bayes posterior probability of taxon membership, given knowledge # only of X (and base rate, which is implied in the regression formula # used to compute it) # here is formula for posterior probability of taxon membership, as a # function of X, based on unsmoothed, adjusted Y-values: # pp.x <- (y.adj2 - mu.cY) / (mu.tY - mu.cY) # where y.adj2 <- y - b1.yx.within * x # p.p.'s based on smoothed, adjusted Y's are better behaved if(dbug) { cat("line 830: estimating Bayes posterior Pr | X\n") } ord <- order(ixyaa[, 2]) # sort on X ixyaa <- ixyaa[ord, ] options(warn=-1) y.adj <- ixyaa[, 4] ; x <- ixyaa[, 2] reg.yadjx.gam <- gam(y.adj ~ s(x, k=8, bs="cr")) options(warn=0) pp.x <- (predict(reg.yadjx.gam) - mu.cY) / (mu.tY - mu.cY) if(dbug & any(is.na(pp.x))) { cat("line 840: predict(reg.yadjx.gam)=\n") print(predict(reg.yadjx.gam)) cat("mu.cY=", mu.cY, ", mu.tY=", mu.tY, "\n", sep="") cat("pp.x=\n") print(pp.x) stop("I quit") } pp.x[pp.x < 0] <- 0 pp.x[pp.x > 1] <- 1 # the generation of pp.x is done by procedure that eliminates duplicate # values of the input vector X, of which pp.x is a function. Hence, # we have to put in duplicated pp.x[i] values to match eliminated # duplicate x[i] values. Ordinarily we don't go through floating # point vectors testing for exact equality, but in this case that's # exactly what we want. if(length(pp.x) < length(x)) { # duplicates were eliminated if(dbug) { cat("line 858: dealing with duplicate X values, w.r.t. Bayes posterior\n") } tmp <- rep(nada, N) tmp[1] <- pp.x[1] i <- 2 j <- 1 repeat { if(x[i] != x[i - 1]) { j <- j + 1 } tmp[i] <- pp.x[j] i <- i + 1 if(i > N) { break } } pp.x <- tmp } # now estimate latent freq curve for X if(dbug) { cat("line 879: estimating latent frequency curve for X\n") } ixyaap <- cbind(ixyaa, pp.x) # i=original row number, # x=x, # y=y, # a=y.adj, # a=x.adj, # p=pp.x rm(y.adj, x, pp.x) if(is.R()) { density.X <- density(ixyaa[, 2], kernel="optcosine") } else { density.X <- density(ixyaa[, 2], N=N, window="cosine", width="sj") } # desired density is product of density.X$y and pp.x[]---have to get their # corresponding ordinate values in sync by interpolation at matching # points on ordinate: min.x <- max(min(ixyaap[, 2]), min(density.X$x)) # outer max() ensures we don't # extrapolate max.x <- min(max(ixyaap[, 2]), max(density.X$x)) # as does outer min() here xout.X <- seq(min.x, max.x, length=N) options(warn=-1) ap.X <- approx(x=ixyaap[, 2], y=ixyaap[, 6], xout=xout.X) # interpolated posterior prob's ad.X <- approx(x=density.X$x, y=density.X$y, xout=xout.X) # interpolated density ad.X$y <- ad.X$y / sum(ad.X$y) options(warn=0) density.tX <- N * ap.X$y * ad.X$y density.cX <- N * (1 - ap.X$y) * ad.X$y # now calculate Bayes posterior probability of taxon membership, given Y, # used to estimate Y's latent frequency curve if(dbug) { cat("line 914: estimating Bayes posterior Pr | Y\n") } ord <- order(ixyaap[, 3]) # sort on Y ixyaap <- ixyaap[ord, ] options(warn=-1) x.adj <- ixyaap[, 5] ; y <- ixyaap[, 3] reg.xadjy.gam <- gam(x.adj ~ s(y, k=8, bs="cr")) options(warn=0) pp.y <- (predict(reg.xadjy.gam) - mu.cX) / (mu.tX - mu.cX) if(dbug & any(is.na(pp.y))) { cat("line 928: predict(reg.xadjy.gam)=\n") print(predict(reg.xadjy.gam)) cat("mu.cX=", mu.cX, ", mu.tX=", mu.tX, "\n", sep="") cat("pp.y=\n") print(pp.y) stop("I quit") } pp.y[pp.y < 0] <- 0 pp.y[pp.y > 1] <- 1 # the generation of pp.y is done by procedure that eliminates duplicate # values of the input vector Y, of which pp.y is a function. Hence, # we have to put in duplicated pp.y[i] values to match eliminated # duplicate y[i] values. Ordinarily we don't go through floating # point vectors testing for exact equality, but in this case that's # exactly what we want. if(length(pp.y) < length(y)) { # duplicates were eliminated if(dbug) { cat("line 944: dealing with duplicate Y values, w.r.t. Bayes posterior\n") } tmp <- rep(nada, N) tmp[1] <- pp.y[1] i <- 2 j <- 1 repeat { if(y[i] != y[i - 1]) { j <- j + 1 } tmp[i] <- pp.y[j] i <- i + 1 if(i > N) { break } } pp.y <- tmp } # now estimate latent freq curve for Y if(dbug) { cat("line 965: estimating latent frequency curve for Y\n") } ixyaapp <- cbind(ixyaap, pp.y) # i=original row number, # x=x, # y=y, # a=y.adj, # a=x.adj, # p=pp.x, # p=pp.y rm(x.adj, y, pp.y) if(is.R()) { density.Y <- density(ixyaap[, 3], kernel="optcosine") } else { density.Y <- density(ixyaap[, 3], N=N, window="cosine", width="sj") } # desired density is product of density.Y$y and pp.y[]---have to get their # corresponding ordinate values in sync by interpolation at matching # points on ordinate: min.y <- max(min(ixyaapp[, 3]), min(density.Y$x)) max.y <- min(max(ixyaapp[, 3]), max(density.Y$x)) xout.Y <- seq(min.y, max.y, length=N) options(warn=-1) ap.Y <- approx(x=ixyaapp[, 3], y=ixyaapp[, 7], xout=xout.Y) ad.Y <- approx(x=density.Y$x, y=density.Y$y, xout=xout.Y) ad.Y$y <- ad.Y$y / sum(ad.Y$y) options(warn=0) density.tY <- N * ap.Y$y * ad.Y$y density.cY <- N * (1 - ap.Y$y) * ad.Y$y # now we need to put posterior probability vectors back into row order # corresponding to order of rows of X (and Y) when they were input, at # top of routine. (The calculation of pp.x[] and pp.y[] was done from # output of supsmu(), which sorts on input variable and eliminates # exact duplicates on independent variable. Since there may have been # duplicate X (or Y) values, we can't just "unsort" pp.x[] and pp.y[], # alas; instead, we have to have kept track of the changing position of # each element of our array[original row, x, y, ...], as it kept # getting sorted and resorted. if(dbug) { cat("line 1007: re-ordering rows of pp.x, pp.y\n") } ord <- order(ixyaapp[, 1]) ixyaapp <- ixyaapp[ord, ] # back into original order bayeprob <- cbind(ixyaapp[, 2], ixyaapp[, 6], ixyaapp[, 3], ixyaapp[, 7]) # x, pp.x, y, pp.y columns if(dbug & any(is.na(bayeprob))) { cat("line 1015: ") if(any(is.na(bayeprob[, 1]))) { cat("x=\n") print(bayeprob[, 1]) } else { cat("none of bayeprob[, 1] is NA\n") } if(F & any(is.na(bayeprob[, 2]))) { cat("pp.x=\n") print(bayeprob[, 2]) } else { cat("none of bayeprob[, 2] is NA\n") } if(any(is.na(bayeprob[, 3]))) { cat("y=\n") print(bayeprob[, 3]) } else { cat("none of bayeprob[, 3] is NA\n") } if(any(is.na(bayeprob[, 4]))) { cat("pp.y=\n") print(bayeprob[, 4]) } else { cat("none of bayeprob[, 4] is NA\n") } stop("I quit") } if(dbug) { cat("about to return from MAXSLOPEpair():\n") cat(" i=", indx1, ", j=", indx2, ", bayeprob=\n", sep="") print(bayeprob) } return( list(i=indx1, j=indx2, reg.yx.adj=regr.yx.adj, slope.yx=slope.yx, reg.xy.adj=regr.yx.adj, slope.xy=slope.xy, maxslope.pt=c(maxslope.pt.X, maxslope.pt.Y), max.slope=c(max.slope.X, max.slope.Y), hitmax.cut=c(hitmax.cut.X, hitmax.cut.Y), P=c(P.y, P.x), # order is reversed because we're # basing P estimate on behavior # of output variable mu.X=c(mu.tX, mu.cX), mu.Y=c(mu.tY, mu.cY), var.X=c(var.tX, var.cX), var.Y=c(var.tY, var.cY), b1.within=c(b1.yx.within, b1.xy.within), bayeprob=bayeprob, density.X=cbind(xout.X, density.tX, density.cX), density.Y=cbind(xout.Y, density.tY, density.cY), errmsg=errmsg) ) } MAXSLOPEyx <- function(data, indx.y, indx.x, adjust=T, print.flag=T, dbug=F) { #---------------------------------------------------------------------------# # MAXSLOPEyx()---- MAXSLOPE analysis "one way" only, for $Y$ regressed on # # $X$ # # Author: William M. Grove # # Language: R # # Last modified: July 22, 2003 # # # # args: # # data = matrix (N X k) of data # # indx.y = number of col in data[, ] to be used as output variable # # indx.x = number of col for input variable # # adjust = T means adjust for within-class regression slopes, # # F means do not adjust (pretend within-class slopes are # # all zero) # # print.flag = T (interactive use) prints graphs, messages to screen # # F (batch/simulation use) doesn't print that stuff # # Note: in either case, all main results are returned by # # routine. # # dbug = T means print intermediate ouptut # # F means do not print it # # # # returns list with components: # # j = number of col in data[, ] used as output variable # # i = ditto, for input variable # # regr.yx.adj = regression of Y on X, with effect of (assumed linear, # # assumed common to both taxon & complement class) # # within-class regression adjusted out # # slope.yx = slope of regression of Y on X, at X, using smoothed # # regression # # maxslope.pt.X = estimate of MAXSLOPE point on X # # max.slope = max of slope values across entire range of X # # hitmax.cut.X = estimate of HITMAX cut on X # # b1.yx.within = estimate of (assumed linear, assumed common to both # # both taxon & complement class) within-class re- # # gression slope # # mu.cY.biased = (biased) estimate of complement class mean on Y # # mu.tY.biased = (biased) estimate of complement class mean on Y # # errmsg = test message, explaining any error encountered # #---------------------------------------------------------------------------# errmsg <- "OK" nada <- as.numeric(NA) # handy constant x <- data[, indx.y] y <- data[, indx.x] N <- nrow(data) ord <- order(x) x <- x[ord] y <- y[ord] # sort columns by $X$-values # robust nonlinear estimate of local regression. We use generalized additive # model based on smoothing (by default, 4th-degree) splines, because this # gives smooth (at least) 1st 2 derivatives---which is handy for easily # finding MAXSLOPE point options(warn=-1) reg.gam <- gam(y ~ s(x, k=8, bs="cr")) # k=8 derived from experimentation # with numerous Gaussian mixture # regressions; we make sure # cubic spline basis is used, # because thin plate reg spline # gets _very_ slow if N is # real large options(warn=0) lower <- predict(reg.gam, newdata=data.frame(x=x-0.0005)) upper <- predict(reg.gam, newdata=data.frame(x=x+0.0005)) slope <- (upper - lower) / 0.001 if(print.flag) { win.graph() plot(x, y, xlab=paste("Indicator ", indx.x, sep=""), ylab=paste("Indicator ", indx.y, sep=""), type="n") lines(x, predict(reg.gam), lty=1) title(paste("Regression of Indicator ", indx.y, " on Indicator ", indx.x, sep="")) xmin <- min(x) xmax <- max(x) ymin <- min(y) ymax <- max(y) xcorner <- (xmax - xmin) * (3/5 + 2/3) / 2 + xmin ycorner <- (ymax - ymin) * 1/10 + ymin legend(xcorner, ycorner, "Smoothed Regression", cex=2/3, lty=1) # plot slope, computed halfway between adjacent $X$-points win.graph() plot(x, slope, xlab=paste("Indicator ", indx.x, sep=""), ylab="Raw Slope", type="n", # ylim=c(min(min(slope), min(y)), max(max(slope), max(y))) ) lines(x, slope, lty=1) title(paste("Slope Plot: Indicator ", indx.y, " on Indicator ", indx.x, sep="")) } slope.yx <- cbind(x, slope) # find MAXSLOPE point, between 30th %ile and 97.5th %ile of X (for Gaussian # mixture regressions and P between 1/10 and 1/2, true MAXSLOPE point is # always well within this interval) intvl <- round(0.3 * N):round(0.975 * N) n.intvl <- length(intvl) max.of.slope <- max(slope.yx[intvl, 2]) # i.e., max within stated bounds maxslope.pt.X <- (slope.yx[intvl, 1])[slope.yx[intvl, 2]==max.of.slope] # xxxxx need good statistic to quantify degree to which slope plot, and # MAXSLOPE point compared to other points on that plot (in particular), # resemble shape one would expect if there's a taxon---i.e., 'sombrero' # shape (at least to extent that there's 'big' central peak and # [relatively] flat ends, or at least one relatively flat end [lower end, # if P < .5]), vs. shapes you'd expect if no taxon---i.e., anything else, # but especially: (a) monotone slope of slope plot, or (b) approximately # zero slope of slope plot if(length(maxslope.pt.X) > 1) { # MAXSLOPE point ain't unique indices <- (1:n.intvl)[slope.yx[intvl, 2] == max.of.slope] dip <- F for(i in min(indices):max(indices)) { if(slope.yx[i, 2] < max.of.slope) { dip <- T # there's a saddle point break } } if(!dip) { maxslope.pt.X <- median(maxslope.pt.X) } else { if(F & dbug) { cat("about to do error return because NO UNIQUE MAXSLOPE pt\n") cat("length(maxslope.pt.X)=", length(maxslope.pt.X), "\n", sep="") cat("MAXSLOPE pt indices=\n") print(indices) cat("\n") } return( list(j=indx.y, i=indx.x, regr.adj=matrix(nada, nrow=N, ncol=2), max.slope=max(slope.yx), maxslope.pt.X=nada, hitmax.cut.X=nada, b1.yx.within=nada, mu.cY.biased=nada, mu.tY.biased=nada, errmsg="NO UNIQUE MAXSLOPE POINT") ) } } ms.indx.lo <- max((1:n.intvl)[x[intvl] <= maxslope.pt.X]) ms.indx.hi <- min((1:n.intvl)[x[intvl] >= maxslope.pt.X]) maxslope.indx <- round((ms.indx.lo + ms.indx.hi) / 2) + intvl[1] - 1 max.slope <- max(slope.yx) # max of slopes, across range of X # ESTIMATING HITMAX CUT and main mixture model parameters (see Grove # (submitted) MAXSLOPE paper for explanation \& equations) # Step 1 --- \item Plot the conditional regression slope of $Y$ on $X$, viz. # $b_{yx}$, as a function of $X$. [already done, above] if(adjust) { # Step 2 --- \item Determine asymptotes of conditional slope, which by # hypothesis is the same at each end of the plot. # # estimate within-class linear reg slope at low end of X-distribution, # using part of data likely to have nearly linear regression of Y on X # (i.e., part of data that is likely to be [almost all] complement # class): n.lo <- round(.1 * N) if(dbug) { cat("in MAXSLOPEyx(), n.lo=", n.lo, "\n", sep="") cat("y[1:n.lo]=\n") print(y[1:n.lo]) cat("\n") cat("x[1:n.lo]=\n") print(x[1:n.lo]) cat("\n") } b1 <- coef(lm(y[1:n.lo] ~ x[1:n.lo]))[2] if(is.na(b1)) { # this happens once in a blue moon b1 <- 0 # with this r.n.g., when there # are massively tied (to 5 # decimal places) x-values, # in which case zero is indeed # the correct value } # sometimes this will deliver wacky values, so make sure it's within a # plausible interval, namely between zero and the overall linear # regression slope b1.yx.within <- min(max(b1, 0), coef(lm(y ~ x))[2]) # Step 3 --- \item Subtract asymptote value, obtained in step 2, from # every point on the conditional slope plot. Omitted as # unnecessary, in revised version of algorithm. # slope.adj <- slope.yx[, 2] - b1.yx.within # Step 4 --- \item Numerically integrate slope function, to produce # transformed mixed-population regression function # $\text{E}'_m[Y\vert X]$, taking constant of integration # to be a convenient value, viz., zero. This function # will have approximately the shape of an ogive, and # will be essentially horizontal at both left and right # ends (if within-taxon & within-CC regression slopes # are identical) # Y in regression model, adjusted for within-population linear regression # of Y on X (slopes assumed equal in both populations). Just subtract # off linear component---gives result equivalent to finding derivative # of nonlinear regression, subtracting linear within-pop. slope, and # then integrating result. This is not quite correct adjustment (esp. # for taxon observations), yielding a bias in resulting estimates of # mu.tY and, to lesser extent, mu.cY. This bias will be taken care of # in the calling routine. y.adj <- y - b1.yx.within * x } else { n.lo <- round(.1 * N) b1.yx.within <- 0 y.adj <- y } # Steps 5 to 8 of the paper are replaced with the following steps (the steps # as originally given failed to deal with asymptotic bias of $\delta_X, # $delta_Y$), and were computationally clumsy as well: # Step 5' ---\item By inspection of transformed (integrated) regression # function, determine value # $\text{E}'_m[y\vert X=x_{\text{min}}]$, i.e., mean value # of $Y$ corresponding to the minimum value(s) of $X$, # _assumed to be at low end of $X$_. This estimates # $\mu_{0Y}$, the complement class mean on $Y$, with # known asymptotic bias, $-b_1 \mu_{0X}$. The bias will # be removed in the calling routine. # Note: we could fit a suitable mixture regression model to estimate P, # sigma^2, mu.cY, mu.tY, mu.cX, mu.tX, and b1.yx.within, but (a) this # would require us to assume a specific distributional form for X- and # Y-distributions. Then instead of testing a general (within-class # unimodal) taxonic model, we'd be testing a specific distributional # admixture model, which would mix up our main conjecture with # auxiliaries---which is bad from theory-testing point of view. So we # don't want to do that. Hence, we instead estimate just some mixture # parameters here---in particular, the Y-asymptote at low end of # X-distribution, and Y-asymptote at high end of X-distribution. Both are # estimated by taking local means in these regions of X. Experimentation # with Gaussian and gamma mixtures with various taxon base rates from 1/3 # down to 1/10, and separations in the 2-3 sigma range, suggests the # following heuristic size of region for taking local mean: # n.lo <- max(10, N * .05) # which is why this number was used, above. # the local mean at low end of X is an asymptotically biased estimator of # the true C.C. mean, due to contamination from taxon, but mostly due to # inexactitude of adjustment of Y for within-class regression. This bias # will be (asymptotically) removed in calling routine. mu.cY.biased <- mean(y.adj[1:n.lo]) n.hi <- N - n.lo + 1 mu.tY.biased <- mean(y.adj[n.hi:N]) # Step 6 --- \item This step was eliminated, in revised version of algorithm # Step 7' --- \item Find value of $X$, on de-tilted mixed-population # regression function, corresponding to # $Y=(1/2)\text{E}'_m[y\vert X=x_{\text{max}}]$. This is # (estimated) HITMAX cut. Because it's based on biased # estimates of mu.tY and mu.cY, it also is biased. This # bias will be removed in the calling routine. hitmax.y <- (mu.tY.biased + mu.cY.biased) / 2 # later to be adjusted, when better # estimates of mu's available ord <- order(y.adj) yx <- cbind(y.adj[ord], x[ord]) lower <- max((1:N)[yx[, 1] <= hitmax.y]) upper <- min((1:N)[yx[, 1] > hitmax.y]) # linearly interpolate to get hitmax cut on X hitmax.cut.X <- yx[lower, 2] + (yx[upper, 2] - yx[lower, 2]) * ((hitmax.y - yx[lower, 1]) / (yx[upper, 1] - yx[lower, 1])) return( list(j=indx.y, i=indx.x, regr.yx.adj=cbind(x, y.adj), slope.yx=slope.yx, max.slope.X=max.slope, maxslope.pt.X=maxslope.pt.X, hitmax.cut.X=hitmax.cut.X, b1.yx.within=b1.yx.within, mu.cY.biased=mu.cY.biased, mu.tY.biased=mu.tY.biased, errmsg=errmsg) ) }