# linmod.R: Example S3 linear model.
#
# See www.milbo.org/doc/modguide.pdf.
# This software may be freely used.

linmod <- function(...) UseMethod("linmod")

linmod.default <- function(x = stop("no 'x' argument"),
                           y = stop("no 'y' argument"),
                           keep = FALSE,
                           ...)
{
    stop.if.dot.arg.used(...)
    xmat <- as.matrix(x)
    # use name "(Intercept)" here so coef names match linmod.formula
    x <- cbind("(Intercept)" = 1, xmat)
    fit <- linmod.fit(x, y)
    fit$call <- match.call()
    if(keep) {
        fit$x <- xmat
        # save y as a one-column matrix, so can use colname to save response name
        colname <- deparse(substitute(y))[1]
        colname <- gsub(" ", "", substr(colname, 1, 100)) # strip spaces, truncate
        fit$y <- as.matrix(y, ncol = 1)
        colnames(fit$y) <- colname
    }
    fit
}
linmod.formula <- function(formula = stop("no 'formula' argument"),
                           data = parent.frame(),
                           keep = FALSE,
                           ...)
{
    stop.if.dot.arg.used(...)
    if(is.matrix(data))             # allow data to be a matrix
        data <- as.data.frame(data) # will create colnames V1 V2 V3 if necessary
    # note that na.action=na.pass because we will catch NAs later
    # in linmod.fit, for uniformity with linmod.default
    mf <- model.frame(formula = formula, data = data, na.action = na.pass)
    terms <- attr(mf, "terms")
    x <- model.matrix(terms, mf)
    y <- model.response(mf)
    fit <- linmod.fit(x, y)
    fit$call <- match.call()
    fit$terms <- terms
    fit$xlevels <- .getXlevels(terms, mf) # for use by predict.linmod
    if(keep)
        fit$data <- data
    fit
}
linmod.fit <- function(x = stop("no 'x' argument"),
                       y = stop("no 'y' argument"),
                       ...)
{
    # internal function, not for the casual user
    # if model has an intercept, the first col of x must be intercept (all 1s)

    stop.if.dot.arg.used(...)
    x <- check.linmod.x(x)
    y <- check.linmod.y(x, y)
    fit <- do.linmod.fit(x, y)
    class(fit) <- "linmod"
    fit
}
check.linmod.x <- function(x)
{
    if(!is.matrix(x))
        stop("'x' is not a matrix or could not be converted to a matrix")
    if(NROW(x) == 0 || NCOL(x) == 0)
        stop("'x' is empty")
    if(anyNA(x))
        stop("NA in 'x'")
    # checking just the first column of x suffices because all columns
    # of a matrix have the same type
    # we allow is.logical because qr etc. treat logical vars as numeric
    if(!is.numeric(x[,1]) && !is.logical(x[,1]))
        stop("non-numeric column in 'x'")
    # ensure all columns in x are named (needed for names in vcov etc.)
    # use the same naming convention as lm (prefix for unnamed cols is "V")
    missing.colnames <-
        if(is.null(colnames(x))) 1:NCOL(x)
        else                     nchar(colnames(x)) == 0
    colnames(x)[missing.colnames] <-
        c("(Intercept)",
          paste("V", seq_len(NCOL(x) - 1), sep = ""))[missing.colnames]
    duplicated <- which(duplicated(colnames(x)))
    if(length(duplicated))
        stop("column name \"", colnames(x)[duplicated[1]],
             "\" in 'x' is duplicated")
    x
}
check.linmod.y <- function(x, y)
{
    # as.vector(as.matrix(y)) is necessary when y is a data.frame
    # (because as.vector alone on a data.frame returns a data.frame)
    y <- as.vector(as.matrix(y))
    if(length(y) == 0)
        stop("'y' is empty")
    if(anyNA(y))
        stop("NA in 'y'")
    if(!is.numeric(y) && !is.logical(y))
        stop("'y' is not numeric or logical")
    if(length(y) != nrow(x))
        stop("nrow(x) is ", nrow(x), " but length(y) is ", length(y))
    y
}
do.linmod.fit <- function(x, y)
{
    # workhorse function for fitting linear models
    # essential processing and sanity checks on x and y are already completed
    # x is a numeric matrix, y is a numeric vector

    qx <- qr(x)                         # QR-decomposition of x
    if(qx$rank < ncol(x))
        stop("'x' is singular (it has ", ncol(x),
             " columns but its rank is ", qx$rank, ")\n  colnames(x): ",
             paste0(colnames(x), collapse=' '))
    coef <- solve.qr(qx, y)             # compute (x'x)^(-1) x'y
    stopifnot(!anyNA(coef))             # NA impossible after rank check above
    df.residual <- max(0, nrow(x) - ncol(x)) # degrees of freedom
    sigma2 <- sum((y - x %*% coef)^2) / df.residual # variance of residuals
    vcov <- sigma2 * chol2inv(qx$qr)    # covar mat is sigma^2 * (x'x)^(-1)
    fitted.values <- qr.fitted(qx, y)

    colnames(vcov) <- rownames(vcov) <- colnames(x)
    names(fitted.values) <- rownames(x)
    colnames(coef) <- colnames(y)

    # returned fields match lm's fields
    list(coefficients  = coef,
         residuals     = y - fitted.values,
         rank          = qx$rank,
         fitted.values = fitted.values,
         vcov          = vcov,
         sigma         = sqrt(sigma2),
         df.residual   = df.residual)
}
predict.linmod <- function(object = stop("no 'object' argument"),
                           newdata = NULL,
                           type = "response",
                           ...)
{
    stopifnot(inherits(object, "linmod"))
    stop.if.dot.arg.used(...)
    match.arg(type, "response") # the type argument is not yet supported
    if(is.null(newdata))
        yhat <- fitted(object)
    else {
        if(NROW(newdata) == 0)
            stop("'newdata' is empty")      # preempt obscure message later

        x <- if(is.null(object$terms))      # model built with linmod.default?
                process.newdata(object, newdata)
            else                            # model built with linmod.formula
                process.newdata.formula(object, newdata)

        # The following tests suffice to catch all illegal input.  However
        # they aren't ideal in that they don't always direct you to the root
        # cause of the problem (i.e. the error messages aren't always optimal).
        nvar <- length(object$coefficients) - 1 # nbr vars, -1 for intercept
        if(ncol(x) - 1 != nvar)
            stop("ncol(newdata) is ", ncol(x) - 1, " but should be ", nvar)
        if(anyNA(x))
            stop("NA in 'newdata'")
        if(!is.numeric(x[,1]) && !is.logical(x[,1]))
            stop("non-numeric column in 'newdata' (after processing)")

        yhat <- as.vector(do.predict.linmod(object, x))
        names(yhat) <- rownames(x)
    }
    yhat
}
process.newdata <- function(object, newdata)
{
    # process newdata for models built with linmod.default

    x <- if(is.vector(newdata))     # allow newdata to be a vector
            matrix(newdata, ncol = length(object$coefficients) - 1)
         else
            as.matrix(newdata)      # allow newdata to be a data.frame

    cbind(1, x)                     # return data with an intercept column
}
process.newdata.formula <- function(object, newdata)
{
    # process newdata for models built with linmod.formula

    newdata <- as.data.frame(newdata) # allows newdata to be a matrix
    terms <- object$terms
    dataClasses <- attr(terms, "dataClasses")
    iresp <- attr(terms, "response")
    terms <- delete.response(terms)
    # na.action=na.pass because we will catch NAs after (for clearer error msg)
    # xlevels is needed to convert strings to factor levels, for example:
    #    a <- linmod(Sepal.Length~Species,data=iris)
    #    predict(a,newdata=data.frame(Species="setosa"))
    mf <- model.frame(terms, newdata, na.action = na.pass, xlev = object$xlevels)
    if(anyNA(mf))
        stop("NA in 'newdata'")
    if(NROW(mf) != NROW(newdata)) {
        # Get here when model.frame() issues
        #   Warning: 'newdata' had M rows but variables found have N rows
        # Must stop, else the call to model.matrix() below would silently return bad data.

        # If a variable is missing, print its name to help the user.
        # TODO This will erroneously identify "sqrt(x)" as a missing var in the
        #      formula "y ~ sqrt(x)" (because the var is wrapped in a func call).
        varnames <- names(dataClasses)
        varnames <- varnames[-iresp]
        missing <- which(!(varnames %in% colnames(newdata)))
        missing.msg <- ""
        if(length(missing))
            missing.msg <- paste0(" (variable '", varnames[missing[1]],
                                  "' may be missing from newdata)")
        stop("newdata has ", NROW(newdata),
             " rows but model.frame returned ", NROW(mf), " rows", missing.msg)
    }
    .checkMFClasses(dataClasses, mf) # check types in newdata match original data
    model.matrix(terms, mf)
}
do.predict.linmod <- function(object, x)
{
    # workhorse function for linear model predictions
    # processing by model.matrix etc. and sanity checks on x already completed
    # x is a numeric matrix (if model has intercept, first col of x is all 1s)

    x %*% coef(object)
}
summary.linmod <- function(object = stop("no 'object' argument"), ...)
{
    stop.if.dot.arg.used(...)
    se <- sqrt(diag(object$vcov))
    t.value <- coef(object) / se
    p.value <-
        if(object$df.residual == 0) # avoid warning from pt()
            rep_len(0, length.out=length(t.value))
        else
            2 * pt(-abs(t.value), df = object$df.residual)

    coefficients <- cbind(Estimate = coef(object),
                          StdErr   = se,
                          t.value  = t.value,
                          p.value  = p.value)

    retval <- list(call         = object$call,
                   coefficients = coefficients)

    class(retval) <- "summary.linmod"

    retval
}
print.linmod <- function(x = stop("no 'x' argument"), ...)
{
    stop.if.dot.arg.used(...)
    print.model.call(x)
    print(x$coefficients)
    invisible(x)
}
print.summary.linmod <- function(x = stop("no 'x' argument"), ...)
{
    stop.if.dot.arg.used(...)
    print.model.call(x)
    print(x$coefficients)
    invisible(x)
}
print.model.call <- function(x)
{
    cat("Call: ") # print.lm has a newline here, but a space is more compact
    # use paste0 to convert vector of strings to single string if necessary
    cat(strwrap(paste0(deparse(x$call, control = NULL, nlines = 5),
                       sep = " ", collapse = " "), exdent = 6), sep = "\n")
    cat("\n")
}
# stop.if.dot.arg.used will cause an error message if any args are passed to it.
# We use it to test if any dots arg of the calling function was used, for
# functions that must have a dots arg (to match the generic method) but don't
# actually use the dots.  This helps the user catch mistyped or illegal args.
# R version 3.3-0 or higher has a function chkDots which could be used instead.

stop.if.dot.arg.used <- function()
{
    NULL
}