##! # Functions to carry out analysis of variance based on a data file in the format for ANOVA4 (ANV data) # ANOVA4形式のデータファイルをもとに分散分析を行うための関数群 # # read.anv # write.anv # as.anv4 # anv4.fit # print.anv4.fit # anova.anv4.fit # # # @varsion 0.3 # @date 2011-02-06 # @author Masashi Nakanishi. # @licence GPL2 # # @history # 0.3 2011-02-06 # changed to use readLines() and read.csv(), instead of s_c_a_n_(). # replaced e_v_a_l_() with alternatives. # fixed bugs of write.anv(). # 0.2 2010-08-05 # rewrote overall. # 0.1-5 2006-11-24 # changed the encoding of this script file to UTF-8. # 0.1-4 2006-08-26 # fixed a bug of error check in read.anv() # 0.1-3 2006-05-23 # modified 'read.anv()' to make it efficient # changed the error details of 'as.anv4()' # 0.1-2 2006-05-22 # modified 'read.anv()' slightly # 0.1-1 2006-05-06 # the original # Reads a file in ANV format and creates a data frame from it. read.anv <- function(anvfile, print.input=FALSE, outcome.label="Outcome", block.label="Subject") { print.input <- if (is.null(print.input) || is.na(print.input)) FALSE else as.logical(print.input) outcome.label <- make.names(as.character(outcome.label)) block.label <- make.names(as.character(block.label)) if (anvfile == "") anvfile <- stdin() else if (is.character(anvfile)) { anvfile <- file(anvfile, "r") on.exit(close(anvfile)) } if (!inherits(anvfile, "connection")) stop("'anvfile' must be a character string or connection") descript <- readLines(con=anvfile, n=1L, ok=FALSE, warn=FALSE) var_names <- readLines(con=anvfile, n=4L, ok=FALSE, warn=FALSE) n.variable <- as.vector(as.matrix(read.csv(file=anvfile, header=FALSE, nrows=1L, strip.white=TRUE, blank.lines.skip=FALSE))) n.level <- as.vector(as.matrix(read.csv(file=anvfile, header=FALSE, nrows=1L, sep=",", strip.white=TRUE, blank.lines.skip=FALSE))) balanced <- as.logical(as.integer(readLines(con=anvfile, n=1L, ok=FALSE, warn=FALSE))) sizes <- as.vector(as.matrix(read.csv(file=anvfile, header=FALSE, nrows=1L, strip.white=TRUE, blank.lines.skip=FALSE))) data <- read.csv(file=anvfile, header=FALSE) # looser than the original format specification (?) if (length(n.variable) != 3 || any(n.variable < 0) || n.variable[1] != (n.variable[2] + n.variable[3]) || length(n.level) != n.variable[1] || any(n.level < 1) ) stop("invalid input headers") if (balanced != all(sizes==sizes[1])) { if (balanced) stop("balanced but inconsistent cell sizes") else stop("unbalanced but consistent cell sizes") } # not check length(var_names) == n.variable[1] but trim var_names var_names <- var_names[1:n.variable[1]] names(n.variable) <- c("Total", "Between", "Within") names(n.level) <- var_names if (print.input) { print( list(description = descript, n.variable = n.variable, n.level = n.level, balanced = balanced, cell.size = sizes, data = data) ) } # make a data frame data_anv4 <- data.frame(as.vector(t(as.matrix(data)))) colnames(data_anv4)[1] <- outcome.label n.subj <- sum(sizes) is.between <- c(rep(TRUE, n.variable["Between"]), rep(FALSE, n.variable["Within"])) within_lv_prod <- prod(n.level[!is.between]) if (prod(n.level[is.between]) != length(sizes)) stop("number of cells != number of between-subject conditions") if (within_lv_prod * n.subj != nrow(data_anv4)) stop("expected data size != number of observations") cross <- function(levs, r=1) sapply(1:length(levs), function(i)rep(rep(1:levs[i], each=prod(levs[-(1:i)]) ), prod(levs[0:(i-1)])*r)) # between-factors design if (n.variable["Between"]) { des_mat <- matrix(unlist(mapply(rep, as.data.frame(t(cross(n.level[is.between]))), sizes*within_lv_prod)), ncol=n.variable["Between"], byrow=TRUE) colnames(des_mat) <- var_names[is.between] data_anv4 <- cbind(data_anv4, lapply(as.data.frame(des_mat), as.factor)) } # within-factors design if (n.variable["Within"]) { des_mat <- cross(n.level[!is.between], n.subj) colnames(des_mat) <- var_names[!is.between] data_anv4 <- cbind(data_anv4, lapply(as.data.frame(des_mat), as.factor)) data_anv4[block.label] <- gl(n.subj, within_lv_prod) } # check the number of variables if (ncol(data_anv4) < 2 || ncol(data_anv4) != (n.variable["Total"] + ifelse(n.variable["Within"], 2, 1))) stop("unexpected number of variables") # set class and attributes class(data_anv4) <- c("anv4", class(data_anv4)) attr(data_anv4, "description") <- descript attr(data_anv4[[1]], "variable") <- "outcome" if (n.variable["Within"]) attr(data_anv4[[ncol(data_anv4)]], "variable") <- "block" for (i in 1:n.variable["Total"]) attr(data_anv4[[i+1]], "variable") <- if (i <= n.variable["Between"]) "between" else "within" if (print.input) return(invisible(data_anv4)) return(data_anv4) } # Creates an 'anv4' data frame. as.anv4 <- function(x, outcome=NULL, between.factors=NULL, block.factor=NULL, within.factors=NULL, description=NULL) { match.variable <- function(datf, attrib.value) { (1:ncol(datf))[sapply(datf, function(i)!is.null(attr(i, "variable"))&&attr(i, "variable")==attrib.value)] } sub.1n <- function(x, f, vtype) { i <- match.variable(x, vtype) if (!is.null(f) || length(i) == 0) { v <- NULL if (is.list(f)) { if (length(f) == 1 && is.numeric(f[[1]])) { v <- f } } else if (is.vector(f)) { if (is.character(f)) { if (length(f) == 1) v <- x[f] } else if (is.numeric(f)) { if (length(f) == 1) { if (f[1] > 0) v <- x[f] } else { v <- list(f) } } else if (is.logical(f)) { f <- (1:ncol(x))[rep(f, ncol(x))] if (length(f) == 1) v <- x[f] } } if (!is.null(v)) { attr(v[[1]], "variable") <- vtype return(v) } } else if (length(i) == 1) { return(x[i]) } return(NULL) } sub.1f <- function(x, f, vtype) { i <- match.variable(x, vtype) if (!is.null(f) || length(i) == 0) { v <- NULL if (is.list(f)) { if (length(f) == 1 && is.factor(f[[1]])) v <- f } else if (is.factor(f)) { v <- list(f) } else if (is.vector(f)) { if (is.character(f)) { if (length(f) == 1) v <- x[f] } else if (is.numeric(f)) { if (length(f) == 1) if (f[1] > 0) v <- x[f] } else if (is.logical(f)) { f <- (1:ncol(x))[rep(f, ncol(x))] if (length(f) == 1) v <- x[f] } } if (!is.null(v)) { attr(v[[1]], "variable") <- vtype return(v) } } else if (length(i) == 1) { return(x[i]) } return(NULL) } sub.m <- function(x, f, vtype) { i <- match.variable(x, vtype) if (!is.null(f) || length(i) == 0) { v <- NULL if (is.list(f)) { if (all(lapply(f, is.factor))) v <- f } else if (is.factor(f)) { v <- list(f) } else if (is.vector(f)) { if (ncol(x[f]) >= length(f)) v <- lapply(x[f], as.factor) } if (!is.null(v)) { v <- lapply(v, function(i){attr(i, "variable") <- vtype;i}) return(v) } } else { return(x[i]) } return(NULL) } if (!is.data.frame(x)) { x <- as.data.frame(x) } outcome <- sub.1n(x, outcome, "outcome") if (is.null(outcome)) stop("no outcome variable is specified.") if (is.null(names(outcome))) names(outcome) <- "Outcome" between.factors <- sub.m(x, between.factors, "between") within.factors <- sub.m(x, within.factors, "within") block.factor <- sub.1f(x, block.factor, "block") if (!is.null(within.factors) && is.null(block.factor)) stop("within-block variables require a block variable.") if (!is.null(block.factor) && is.null(names(block.factor))) names(block.factor) <- "Block" anv <- data.frame(outcome) if (!is.null(between.factors)) anv <- cbind(anv, between.factors) if (!is.null(within.factors)) anv <- cbind(anv, within.factors) if (!is.null(block.factor)) anv <- cbind(anv, block.factor) # set class and attributes class(anv) <- c("anv4", class(anv)) attr(anv, "description") <- ifelse(!is.null(description), as.character(description), "") return (anv) } # Prints its required argument x (an 'anv4' object or a data frame) to a file or connection in ANV format. write.anv <- function(x, file, description=NULL) { if (!inherits(x, "anv4")) { stop("invalid class of argument 'x'") } match.variable <- function(datf, attrib.value) { (1:ncol(datf))[sapply(datf, function(i)!is.null(attr(i, "variable"))&&attr(i, "variable")==attrib.value)] } outcome.index <- match.variable(x, "outcome") block.index <- match.variable(x, "block") between.index <- match.variable(x, "between") within.index <- match.variable(x, "within") if (length(outcome.index) != 1) stop("not found just 1 outcome variable") if (length(within.index) && length(block.index) != 1) stop("not found just 1 block variable") if (length(block.index) > 1) stop("found needless block variables") if (length(c(between.index, within.index)) == 0) stop("no substantive factor variable") if (length(c(between.index, within.index)) > 4) stop("limited to 4 explanatory variables or less") if (is.null(description)) description <- attr(x, "description") description <- as.character(description) varnames <- variable.names(x)[c(between.index, within.index)] varnames <- c(varnames, rep("", 4-length(varnames))) n.variable <- c(length(c(between.index, within.index)), length(between.index), length(within.index)) n.level <- sapply(x[c(between.index, within.index)], nlevels) if (length(within.index)) { # check that any block is assigned to an unique combination of levels of between-block factors. if (!all(tapply(rep(1, nrow(x)), x[c(within.index, block.index)], length) == 1)) stop("invalid between-block design") } cell.sizes <- as.vector(tapply(rep(1, nrow(x)), x[rev(between.index)], length)) repeated <- prod(n.level[c(rep(FALSE, length(between.index)), rep(TRUE, length(within.index)))]) cell.sizes <- cell.sizes / repeated if (repeated > 1) { # check that all of the combinations of within-block levels has the same times of between-block levels. if (!all( rep(cell.sizes, each=repeated) == as.vector(tapply(rep(1, nrow(x)), x[rev(c(between.index, within.index))], length)) )) stop("invalid within-block design") } balanced <- ifelse(all(cell.sizes == cell.sizes[1]), "1", "0") if (file == "") file <- stdout() else if (is.character(file)) { file <- file(file, "w") on.exit(close(file)) } if (!inherits(file, "connection")) stop("'file' must be a character string or connection") writeLines(strtrim(description, 126), file) # 126 chars ANV format limitation for (v in varnames) writeLines(strtrim(v, 20), file) # 20 chars ANV format limitation writeLines(paste(as.character(n.variable), collapse=","), file) writeLines(paste(as.character(n.level), collapse=","), file) writeLines(balanced, file) writeLines(paste(as.character(cell.sizes), collapse=","), file) # to avoid using e_v_a_l_() idx <- c(between.index, block.index, within.index) ord <- switch(length(idx), order(x[[idx[1]]]), order(x[[idx[1]]], x[[idx[2]]]), order(x[[idx[1]]], x[[idx[2]]], x[[idx[3]]]), order(x[[idx[1]]], x[[idx[2]]], x[[idx[3]]], x[[idx[4]]]), order(x[[idx[1]]], x[[idx[2]]], x[[idx[3]]], x[[idx[4]]], x[[idx[5]]]) ) x <- x[ord,] y <- matrix(x[[outcome.index]], ncol=repeated, byrow=TRUE) mode(y) <- "character" write.table(y, file=file, append=FALSE, quote=FALSE, sep=",", na="", row.names=FALSE, col.names=FALSE) } # Fits an ANOVA model by some ways to the data included in the 'anv4' object required. anv4.fit <- function(data) { if (!inherits(data, "anv4")) stop("invalid class of argument 'data'") match.variable <- function(datf, attrib.value) { (1:ncol(datf))[sapply(datf, function(i){ !is.null(attr(i, "variable")) && attr(i, "variable")==attrib.value })] } outcome.index <- match.variable(data, "outcome") block.index <- match.variable(data, "block") between.index <- match.variable(data, "between") within.index <- match.variable(data, "within") if (length(outcome.index) != 1) stop("not found just 1 outcome variable") if (length(within.index) && length(block.index) != 1) stop("not found just 1 block variable") if (length(c(between.index, within.index)) == 0) stop("no substantive factor variable") varnames <- variable.names(data) # make a formula model <- paste(varnames[outcome.index], "~", paste(varnames[c(between.index, within.index)], collapse="*")) if (length(within.index)) { model.within.block <- paste( paste(c(varnames[block.index], varnames[between.index]), collapse=":"), "/(", paste(varnames[within.index], collapse="*"), ")") model.error <- paste("+ Error(", model.within.block, ")") model.fixed.block <- paste( "+", model.within.block, "-", paste(varnames[c(block.index, between.index, within.index)], collapse=":")) } else { model.error <- "" model.fixed.block <- "" } # execute old.opt <- options(contrasts=c("contr.helmert", "contr.poly")) on.exit(options(old.opt)) ret_list <- list() suppressWarnings( ret_list$aov <- aov(formula=as.formula(paste(model, model.error)), data=data) ) # singular due to no replication ret_list$lm <- lm(formula=as.formula(paste(model, model.fixed.block)), data=data) if (length(within.index)) { if (library("nlme", logical.return=TRUE)) { ret_list$lme <- lme(fixed=as.formula(model), random=as.formula(paste("~ 1 |", varnames[block.index])), data=data) } } class(ret_list) <- c("anv4.fit", class(ret_list)) return(ret_list) # Then # summary(aov.result) # Type I SS # model.tables(aov.result, "means") # TukeyHSD(aov.result, ...) # library(car) # Anova(lm.result, type="II") # Anova(lm.result, type="III") # anova(lme.result) # etc. } print.anv4.fit <- function(x) { print(x$aov) } anova.anv4.fit <- function(x) { s <- list(aov = summary(x$aov)) if (!is.null(x$lm)) s <- c(s, list(lm = anova(x$lm))) if (!is.null(x$lme)) s <- c(s, list(lme = anova(x$lme))) s }