diffdf coverage - 97.03%

Files
Source

#' Generate unique key name
#'
#' Function to generate a name for the keys if not provided
#'
#' @param BASE base dataset
#' @param COMP comparison dataset
#' @param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity
#'
#' @keywords internal
generate_keyname <- function(
    BASE,
    COMP,
    replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
) {
    assertthat::assert_that(
        is(replace_names, "character"),
        msg = "replace_names is not a character vector"
    )

    assertthat::assert_that(
        length(replace_names) != 0,
        msg = "All default row names are in use in BASE/COMPARE. Please provide a KEY argument"
    )


    key_name <- replace_names[1]

    if (!is.null(BASE[[key_name]]) || !is.null(COMP[[key_name]])) {
        key_name <- generate_keyname(BASE, COMP, replace_names[-1])
    }
    return(key_name)
}

#' diffdf
#' @description
#' Compares 2 dataframes and outputs any differences.
#' @param base input dataframe
#' @param compare comparison dataframe
#' @param keys vector of variables (as strings) that defines a unique row in
#' the base and compare dataframes
#' @param strict_numeric Flag for strict numeric to numeric comparisons
#' (default = TRUE). If False diffdf will cast integer to double where
#' required for comparisons. Note that variables specified in the keys
#' will never be casted.
#' @param strict_factor Flag for strict factor to character comparisons
#' (default = TRUE). If False diffdf will cast factors to characters where
#' required for comparisons. Note that variables specified in the keys will
#' never be casted.
#' @param suppress_warnings Do you want to suppress warnings? (logical)
#' @param file Location and name of a text file to output the results to.
#' Setting to NULL will cause no file to be produced.
#' @param tolerance Set tolerance for numeric comparisons. Note that
#' comparisons fail if (x-y)/scale > tolerance.
#' @param scale Set scale for numeric comparisons. Note that comparisons fail
#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient
#' version of scale = 1.
#' @param check_column_order Should the column ordering be checked? (logical)
#' @param check_df_class Do you want to check for differences in the class
#' between `base` and `compare`? (logical)
#' @examples
#' x <- subset(iris, -Species)
#' x[1, 2] <- 5
#' COMPARE <- diffdf(iris, x)
#' print(COMPARE)
#'
#' #### Sample data frames
#'
#' DF1 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(NA, NA, 1, 2, 3, NA)
#' )
#'
#' DF2 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 7),
#'     v1 = letters[1:6],
#'     v2 = c(NA, NA, 1, 2, NA, NA),
#'     v3 = c(NA, NA, 1, 2, NA, 4)
#' )
#'
#' diffdf(DF1, DF1, keys = "id")
#'
#' # We can control matching with scale/location for example:
#'
#' DF1 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(1, 2, 3, 4, 5, 6)
#' )
#' DF2 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(1.1, 2, 3, 4, 5, 6)
#' )
#'
#' diffdf(DF1, DF2, keys = "id")
#' diffdf(DF1, DF2, keys = "id", tolerance = 0.2)
#' diffdf(DF1, DF2, keys = "id", scale = 10, tolerance = 0.2)
#'
#' # We can use strict_factor to compare factors with characters for example:
#'
#' DF1 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(NA, NA, 1, 2, 3, NA),
#'     stringsAsFactors = FALSE
#' )
#'
#' DF2 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(NA, NA, 1, 2, 3, NA)
#' )
#'
#' diffdf(DF1, DF2, keys = "id", strict_factor = TRUE)
#' diffdf(DF1, DF2, keys = "id", strict_factor = FALSE)
#'
#' @export
diffdf <- function(
    base,
    compare,
    keys = NULL,
    suppress_warnings = FALSE,
    strict_numeric = TRUE,
    strict_factor = TRUE,
    file = NULL,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL,
    check_column_order = FALSE,
    check_df_class = FALSE
) {

    assertthat::assert_that(
        assertthat::is.flag(check_df_class),
        !is.na(check_df_class),
        msg = "`check_df_class` must be a length 1 logical"
    )

    BASE <- base
    COMP <- compare
    KEYS <- keys
    SUPWARN <- suppress_warnings

    ### Initatiate output object
    COMPARE <- list()
    class(COMPARE) <- c("diffdf", "list")


    BASE_NAME <- deparse(substitute(base), width.cutoff = 30L)
    if (length(BASE_NAME) > 1) {
        BASE_NAME <- "<BASE>"
    }
    COMP_NAME <- deparse(substitute(compare), width.cutoff = 30L)
    if (length(COMP_NAME) > 1) {
        COMP_NAME <- "<COMPARE>"
    }

    COMPARE[["DataSummary"]] <- construct_issue(
        value = describe_dataframe(BASE, COMP, BASE_NAME, COMP_NAME),
        message = "Summary of BASE and COMPARE"
    )


    is_derived <- FALSE

    ### If no key is suplied match values based upon row number
    if (is.null(KEYS)) {
        is_derived <- TRUE
        keyname <- generate_keyname(BASE, COMP)
        BASE[[keyname]] <- seq_len(nrow(BASE))
        COMP[[keyname]] <- seq_len(nrow(COMP))
        KEYS <- keyname
    }
    attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived)

    assertthat::assert_that(
        is.numeric(tolerance),
        is.numeric(scale) || is.null(scale)
    )

    missing_keys_base <- KEYS[!KEYS %in% names(BASE)]
    assertthat::assert_that(
        length(missing_keys_base) == 0,
        msg = sprintf(
            "The following KEYS are not available in BASE:\n   %s",
            paste(missing_keys_base, collapse = "\n   ")
        )
    )

    missing_keys_comp <- KEYS[!KEYS %in% names(COMP)]
    assertthat::assert_that(
        length(missing_keys_comp) == 0,
        msg = sprintf(
            "The following KEYS are not available in COMPARE:\n   %s",
            paste(missing_keys_comp, collapse = "\n   ")
        )
    )

    assertthat::assert_that(
        has_unique_rows(BASE, KEYS),
        msg = "BY variables in BASE do not result in unique observations"
    )

    assertthat::assert_that(
        has_unique_rows(COMP, KEYS),
        msg = "BY variables in COMPARE do not result in unique observations"
    )



    #### Check essential variable properties (class & mode)

    COMPARE[["UnsupportedColsBase"]] <- construct_issue(
        value = identify_unsupported_cols(BASE),
        message = "There are columns in BASE with unsupported modes !!"
    )


    COMPARE[["UnsupportedColsComp"]] <- construct_issue(
        value = identify_unsupported_cols(COMP),
        message = "There are columns in COMPARE with unsupported modes !!"
    )


    # cast variables if strict is off
    if (!strict_factor || !strict_numeric) {
        casted_df <- cast_variables(
            BASE = BASE,
            COMPARE = COMP,
            ignore_vars = KEYS,
            cast_integers = !strict_numeric,
            cast_factors = !strict_factor
        )

        BASE <- casted_df$BASE
        COMP <- casted_df$COMP
    }


    COMPARE[["VarModeDiffs"]] <- construct_issue(
        value = identify_mode_differences(BASE, COMP),
        message = "There are columns in BASE and COMPARE with different modes !!"
    )


    COMPARE[["VarClassDiffs"]] <- construct_issue(
        value = identify_class_differences(BASE, COMP),
        message = "There are columns in BASE and COMPARE with different classes !!"
    )




    ##### Check Validity of Keys

    BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
    COMP_keys <- names(COMP)[names(COMP) %in% KEYS]

    assertthat::assert_that(
        length(BASE_keys) == length(KEYS),
        msg = "BASE is missing variables specified in KEYS"
    )

    assertthat::assert_that(
        length(COMP_keys) == length(KEYS),
        msg = "COMP is missing variables specified in KEYS"
    )


    assert_valid_keys(
        COMPARE, KEYS, "UnsupportedColsBase",
        "The following KEYS in BASE have an unsupported mode (see `?mode()`)"
    )
    assert_valid_keys(
        COMPARE, KEYS, "UnsupportedColsComp",
        "The following KEYS in COMPARE have an unsupported mode (see `?mode()`)"
    )
    assert_valid_keys(
        COMPARE, KEYS, "VarModeDiffs",
        "The following KEYS have different modes between BASE and COMPARE"
    )
    assert_valid_keys(
        COMPARE, KEYS, "VarClassDiffs",
        "The following KEYS have different classes between BASE and COMPARE"
    )


    exclude_cols <- c(
        COMPARE[["UnsupportedColsBase"]]$VARIABLE,
        COMPARE[["UnsupportedColsComp"]]$VARIABLE,
        COMPARE[["VarClassDiffs"]]$VARIABLE,
        COMPARE[["VarModeDiffs"]]$VARIABLE
    )

    if (check_column_order) {
        if (attr(COMPARE, "keys")$is_derived) {
            keep_vars_base <- !(names(BASE) %in% attr(COMPARE, "keys")$value)
            keep_vars_comp <- !(names(COMP) %in% attr(COMPARE, "keys")$value)
        } else {
            keep_vars_base <- TRUE
            keep_vars_comp <- TRUE
        }
        COMPARE[["ColumnOrder"]] <- construct_issue(
            value = identify_column_order_differences(
                BASE[, keep_vars_base, drop = FALSE],
                COMP[, keep_vars_comp, drop = FALSE]
            ),
            message = "There are differences in the column ordering between BASE and COMPARE !!"
        )
    }


    ##### Check Attributes
    COMPARE[["AttribDiffs"]] <- construct_issue(
        value = identify_att_differences(BASE, COMP, exclude_cols),
        message = "There are columns in BASE and COMPARE with differing attributes !!"
    )


    ##### Check data

    BASE <- factor_to_character(BASE, KEYS)
    COMP <- factor_to_character(COMP, KEYS)


    COMPARE[["ExtRowsBase"]] <- construct_issue(
        value = identify_extra_rows(BASE, COMP, KEYS),
        message = "There are rows in BASE that are not in COMPARE !!"
    )


    COMPARE[["ExtRowsComp"]] <- construct_issue(
        value = identify_extra_rows(COMP, BASE, KEYS),
        message = "There are rows in COMPARE that are not in BASE !!"
    )


    COMPARE[["ExtColsBase"]] <- construct_issue(
        value = identify_extra_cols(BASE, COMP),
        message = "There are columns in BASE that are not in COMPARE !!"
    )


    COMPARE[["ExtColsComp"]] <- construct_issue(
        value = identify_extra_cols(COMP, BASE),
        message = "There are columns in COMPARE that are not in BASE !!"
    )


    VALUE_DIFFERENCES <- identify_differences(
        BASE, COMP, KEYS, exclude_cols,
        tolerance = tolerance,
        scale = scale
    )



    ## Summarise the number of mismatching rows per variable

    if (length(VALUE_DIFFERENCES)) {
        NDIFF <- sapply(VALUE_DIFFERENCES, nrow)
        COMPARE[["NumDiff"]] <- construct_issue(
            value = convert_to_issue(NDIFF),
            message = "Not all Values Compared Equal"
        )
    }


    for (i in names(VALUE_DIFFERENCES)) {
        COMPARE[[paste0("VarDiff_", i)]] <- construct_issue(
            value = VALUE_DIFFERENCES[[i]],
            message = NULL
        )
    }


    # suppress warning message of data summary if user didn't request to check it
    # we leave the issue in the main compare object though for printing purposes
    COMPARE_WARNINGS <- COMPARE
    attr(COMPARE_WARNINGS[["DataSummary"]], "message") <- c(
        "There are differences between the class of BASE and COMPARE"
    )
    if (!check_df_class || identical(class(base), class(compare))) {
        COMPARE_WARNINGS["DataSummary"] <- NULL
    }

    # Get all issue messages, remove blank message, and collapse into single string
    ISSUE_MSGS <- sapply(COMPARE_WARNINGS, function(x) get_issue_message(x))
    ISSUE_MSGS <- Filter(function(x) !is.null(x), ISSUE_MSGS)
    ISSUE_MSGS <- Filter(function(x) x != "", ISSUE_MSGS)

    if (length(ISSUE_MSGS) != 0) {
        if (!SUPWARN) {
            ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n")
            warning(c("\n", ISSUE_MSGS))
        }
    }

    # If the classes are the same and it is the only entry in the compare
    # object then remove it in order to trigger "no issues found"
    if (identical(class(base), class(compare)) && length(COMPARE) == 1) {
        COMPARE["DataSummary"] <- NULL
    }

    # If the summary is the only item and the user didn't want to check classes
    # then remove the object to trigger the "no issues found"
    if (!check_df_class && length(COMPARE) == 1) {
        COMPARE["DataSummary"] <- NULL
    }


    if (!is.null(file)) {
        print(COMPARE, file = file)
    }

    return(COMPARE)
}




#' diffdf_has_issues
#'
#' Utility function which returns TRUE if an diffdf
#' object has issues or FALSE if an diffdf object does not have issues
#' @param x diffdf object
#' @examples
#'
#' # Example with no issues
#' x <- diffdf(iris, iris)
#' diffdf_has_issues(x)
#'
#' # Example with issues
#' iris2 <- iris
#' iris2[2, 2] <- NA
#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
#' diffdf_has_issues(x)
#' @export
diffdf_has_issues <- function(x) {
    if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object")
    length(x) != 0
}


#' Assert that keys are valid
#'
#' Utility function to check that user provided "keys" aren't listed as a problem
#' variable of the current list of issues.
#' @param COMPARE (`list`)\cr A named list of which each element is a `data.frame` with the
#' column `VARIABLE`
#' @param KEYS (`character`)\cr name of key variables to check to make sure they don't contain
#' any issues
#' @param component (`character`)\cr name of the component within `COMPARE` to check against
#' @param msg (`character`)\cr error message to print if any of `KEYS` are found within
#' `COMPARE[component]$VARIABLE`
#' @keywords internal
assert_valid_keys <- function(COMPARE, KEYS, component, msg) {
    keys_reduced <- KEYS[KEYS %in% COMPARE[[component]]$VARIABLE]
    assertthat::assert_that(
        length(keys_reduced) == 0,
        msg = sprintf(
            "%s:\n%s",
            msg,
            paste0("`", paste0(keys_reduced, collapse = "`, `"), "`")
        )
    )
}

#' Identify Issue Rows
#'
#' This function takes a `diffdf` object and a dataframe and subsets
#' the `data.frame` for problem rows as identified in the comparison object.
#' If \code{vars} has been specified only issue rows associated with those
#' variable(s) will be returned.
#' @param df dataframe to be subsetted
#' @param diff diffdf object
#' @param vars (optional) character vector containing names of issue variables to subset dataframe
#' on. A value of NULL (default) will be taken to mean available issue variables.
#' @examples
#' iris2 <- iris
#' for (i in 1:3) iris2[i, i] <- 99
#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
#' diffdf_issuerows(iris, x)
#' diffdf_issuerows(iris2, x)
#' diffdf_issuerows(iris2, x, vars = "Sepal.Length")
#' diffdf_issuerows(iris2, x, vars = c("Sepal.Length", "Sepal.Width"))
#' @details
#' Note that `diffdf_issuerows` can be used to subset against any dataframe. The only
#' requirement is that the original variables specified in the keys argument to diffdf
#' are present on the dataframe you are subsetting against. However please note that if
#' no keys were specified in diffdf then the row number is used. This means using
#' `diffdf_issuerows` without a keys against an arbitrary dataset can easily result in
#' nonsense rows being returned. It is always recommended to supply keys to diffdf.
#' @export
diffdf_issuerows <- function(df, diff, vars = NULL) {

    assertthat::assert_that(
        class(diff)[[1]] == "diffdf"
    )


    KEYS_ATT <- attr(diff, "keys")

    assertthat::assert_that(
        !is.null(KEYS_ATT),
        msg = "diff is missing the keys attribute"
    )


    issue_vars <- names(diff)[grep("^VarDiff_", names(diff))]

    if (is.null(vars)) {
        vars <- issue_vars
    } else {
        vars <- paste0("VarDiff_", vars)
    }

    if (length(issue_vars) == 0 || sum(vars %in% issue_vars) == 0) {
        return(df[FALSE, ])
    }

    KEEP <- mapply(
        FUN      = get_issue_dataset,
        issue    = vars,
        diff     = list(diff),
        SIMPLIFY = FALSE
    )

    KEEP <- recursive_reduce(KEEP, rbind)
    KEEP <- KEEP[!duplicated(KEEP), ]

    if (KEYS_ATT$is_derived) {
        df[[KEYS_ATT$value]] <- seq_len(nrow(df))
    }

    keys <- KEYS_ATT$value

    if (any(!keys %in% names(df))) {
        stop("df does not contain all variables specified as keys in diff")
    }

    RET <- merge(
        x = df,
        y = KEEP,
        sort = TRUE
    )

    RET <- RET[do.call("order", RET[keys]), ]

    if (KEYS_ATT$is_derived) {
        keep_vars <- !names(RET) %in% KEYS_ATT$value
        RET <- RET[, keep_vars, drop = FALSE]
    }

    return(RET)
}




#' get_issue_dataset
#'
#' Internal function used by `diffdf_issuerows` to extract the dataframe
#' from each a target issue. In particular it also strips off any
#' non-key variables
#' @param issue name of issue to extract the dataset from diff
#' @param diff diffdf object which contains issues
#' @keywords internal
get_issue_dataset <- function(issue, diff) {
    issue_df <- diff[[issue]]
    keep <- names(issue_df)[!(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
    issue_df[, keep, drop = FALSE]
}

#' Pad String
#'
#' Utility function used to replicate `str_pad`. Adds white space to either end
#' of a string to get it to equal the desired length
#' @param x string
#' @param width desired length
#' @keywords internal
string_pad <- function(x, width) {
    if (nchar(x) >= width) {
        return(x)
    }
    width <- width - nchar(x)
    left <- paste0(rep(" ", floor(width / 2)), collapse = "")
    right <- paste0(rep(" ", ceiling(width / 2)), collapse = "")
    paste0(left, x, right, collapse = "")
}


#' recursive_reduce
#'
#' Utility function used to replicated `purrr::reduce`. Recursively applies a
#' function to a list of elements until only 1 element remains
#' @param .l list of values to apply a function to
#' @param .f function to apply to each each element of the list in turn. See details.
#' @details
#' This function is essentially performing the following operation:
#' ```
#' .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]])
#' ```
#' @keywords internal
recursive_reduce <- function(.l, .f) {
    if (length(.l) != 1) {
        .l[[2]] <- .f(.l[[1]], .l[[2]])
        return(recursive_reduce(.l[-1], .f))
    }
    .l[[1]]
}

#' invert
#'
#' Utility function used to replicated `purrr::transpose`. Turns a list inside
#' out.
#' @param x list
#' @keywords internal
invert <- function(x) {
    x2 <- list()
    cnames <- names(x)
    tnames <- names(x[[1]])
    for (i in tnames) {
        x2[[i]] <- list()
        for (j in cnames) {
            x2[[i]][[j]] <- x[[j]][[i]]
        }
    }
    x2
}




#' as_ascii_table
#'
#' This function takes a `data.frame` and attempts to convert it into
#' a simple ascii format suitable for printing to the screen
#' It is assumed all variable values have a `as.character()` method
#' in order to cast them to character.
#' @param dat Input dataset to convert into a ascii table
#' @param line_prefix Symbols to prefix in front of every line of the table
#' @keywords internal
as_ascii_table <- function(dat, line_prefix = "  ") {
    n_col <- ncol(dat)
    n_row <- nrow(dat)

    ## Convert every value to character and crop to a suitable length
    dat_char <- lapply(dat, as_fmt_char)


    hold <- list()
    COLS <- colnames(dat)

    ### For each column extract core elements (width, values , title) and pad out
    ### each string to be a suitable length
    for (i in seq_len(n_col)) {
        COL <- COLS[i]
        VALUES <- dat_char[[i]]

        JOINT <- c(COL, VALUES)
        WIDTH <- max(sapply(JOINT, nchar)) + 2

        hold[[COL]] <- list()
        hold[[COL]]$WIDTH <- WIDTH
        hold[[COL]]$VALUES <- sapply(VALUES, string_pad, width = WIDTH)
        hold[[COL]]$HEADER <- sapply(COL, string_pad, width = WIDTH)
    }

    ### Collapse into a single value per component ( title , values, width )
    thold <- invert(hold)
    tvals <- recursive_reduce(thold$VALUES, paste0)
    thead <- recursive_reduce(thold$HEADER, paste0)
    twidth <- recursive_reduce(thold$WIDTH, sum)

    ### Create header and footer lines
    TLINE <- paste0(rep("=", twidth), collapse = "")
    LINE <- paste0(rep("-", twidth), collapse = "")
    FVALS <- paste0(line_prefix, tvals, collapse = "\n")

    ### Output table
    paste0(
        line_prefix, TLINE, "\n",
        line_prefix, thead, "\n",
        line_prefix, LINE, "\n",
        FVALS, "\n",
        line_prefix, LINE
    )
}


#' as_character
#'
#' Stub function to enable mocking in unit tests
as_character <- as.character

#' Format vector to printable string
#'
#' Coerces a vector of any type into a printable string. The most
#' significant transformation is performed on existing character
#' vectors which will be truncated, have newlines converted
#' to explicit symbols and will be wrapped in quotes if they
#' contain white space.
#'
#' @param x (`vector`) \cr vector to be converted to character
#' @param add_quotes (`logical`) \cr if true will wrap strings that contain
#' whitespace with quotes
#' @param crop_at (`numeric`) \cr specifies the limit at which strings should
#' be truncated to
#' @param ... additional arguments (not currently used)
#'
#' @name as_fmt_char
#' @keywords internal
as_fmt_char <- function(x, ...) {
    UseMethod("as_fmt_char")
}

#' @rdname as_fmt_char
#' @export
as_fmt_char.numeric <- function(x, ...) {
    format(x, digits = 7, justify = "right")
}

#' @rdname as_fmt_char
#' @export
as_fmt_char.NULL <- function(x, ...) {
    "<NULL>"
}

#' @importFrom utils capture.output
#' @rdname as_fmt_char
#' @export
as_fmt_char.list <- function(x, ...) {
    vapply(
        x,
        function(x) {
            if (is.numeric(x)) {
                return(as_fmt_char(x))
            }
            if (is.character(x) & length(x) == 1) {
                return(as_fmt_char(x))
            }
            as_fmt_char(
                paste(capture.output(dput(x)), collapse = " "),
                add_quotes = FALSE
            )
        },
        character(1)
    )
}

#' @rdname as_fmt_char
#' @export
as_fmt_char.factor <- function(x, ...) {
    as_fmt_char(as.character(x))
}

#' @rdname as_fmt_char
#' @export
as_fmt_char.character <- function(x, add_quotes = TRUE, crop_at = 30, ...) {
    needs_quotes <- grepl("\\s", x) & add_quotes

    x[is.na(x)] <- "<NA>"

    # Replace \nl \cr with tags to stop print message splitting over
    # multiple lines
    x <- gsub("\x0D", "<cr>", x)
    x <- gsub("\x0A", "<nl>", x)

    charlength <- vapply(x, nchar, numeric(1))
    x <- substr(x, 1, crop_at)
    x[charlength > crop_at] <- paste0(x[charlength > crop_at], "...")

    # Add enclosing " " around strings with white space so that it can be
    # clearly identified in the printed output
    x[needs_quotes] <- paste0('"', x[needs_quotes], '"')

    x
}


#' @rdname as_fmt_char
#' @export
as_fmt_char.default <- function(x, ...) {
    x_char <- as_character(x)
    assertthat::assert_that(
        is.character(x_char),
        msg = sprintf(
            "Unable to convert class `'%s'` to character for printing purposes",
            paste(class(x), collapse = "', '")
        )
    )
    as_fmt_char.character(x_char, add_quotes = FALSE)
}


#' @rdname as_fmt_char
#' @export
as_fmt_char.POSIXt <- function(x, ...) {
    x <- format(x, "%Y-%m-%d %H:%M:%S %Z")
    x[is.na(x)] <- "<NA>"
    x
}


#' get_table
#'
#' Generate nice looking table from a data frame
#' @param dsin dataset
#' @inheritParams print.diffdf
#' @keywords internal
get_table <- function(dsin, row_limit = 10) {
    if (nrow(dsin) == 0) {
        return("")
    }
    if (!is.null(row_limit)) {
        assertthat::assert_that(
            assertthat::is.number(row_limit),
            row_limit > 0,
            msg = "row_limit must be a positive integer"
        )
    }
    if (is.null(row_limit)) {
        display_table <- dsin
    } else {
        display_table <- subset(dsin, seq_len(nrow(dsin)) < (row_limit + 1))
    }

    add_message <- if (!is.null(row_limit) && nrow(dsin) > row_limit) {
        paste0(
            "First ",
            row_limit,
            " of ",
            nrow(dsin),
            " rows are shown in table below"
        )
    } else {
        NULL
    }

    msg <- paste(
        c(
            add_message,
            as_ascii_table(display_table)
        ),
        collapse = "\n"
    )
    msg
}

#' Print diffdf objects
#'
#' Print a nicely formatted version of a diffdf object.
#'
#' @param x A comparison object created by \code{diffdf()}.
#' @param ... Additional arguments (not used).
#' @param row_limit Maximum number of rows to display in difference tables.
#' Use \code{NULL} to show all rows. Default is 10.
#' @param as_string Logical. If \code{TRUE}, returns the printed message as an R
#' character vector instead of printing to the console. Default is \code{FALSE}.
#' @param file A connection or a character string naming the file to print to. If
#' \code{NULL} (the default), output is printed to the console.
#'
#' @examples
#' x <- subset(iris, -Species)
#' x[1, 2] <- 5
#' COMPARE <- diffdf(iris, x)
#' print(COMPARE)
#' print(COMPARE, row_limit = 5)
#' \dontrun{
#' print(COMPARE, file = "output.txt")
#' }
#'
#' @export
print.diffdf <- function(x, row_limit = 10, as_string = FALSE, file = NULL, ...) {
    if (!is.null(row_limit)) {
        assertthat::assert_that(
            assertthat::is.number(row_limit),
            row_limit > 0,
            msg = "row_limit must be a positive integer"
        )
    }
    assertthat::assert_that(
        assertthat::is.flag(as_string)
    )
    COMPARE <- x

    if (length(COMPARE) == 0) {
        outtext <- "No issues were found!\n"
    } else {
        start_text <- paste0("Differences found between the objects!\n\n")
        end_text <- lapply(COMPARE, function(x) get_print_message(x, row_limit))
        end_text <- paste0(unlist(end_text), collapse = "")
        outtext <- paste0(start_text, end_text)
    }

    string_content <- strsplit(outtext, "\n")[[1]]
    if (!is.null(file)) {
        tryCatch(
            {
                sink(file)
                cat(string_content, sep = "\n")
                sink()
            },
            warning = function(w) {
                sink()
                warning(w)
            },
            error = function(e) {
                sink()
                stop(e)
            }
        )
        return(invisible(COMPARE))
    }

    if (as_string) {
        return(string_content)
    } else {
        cat(outtext)
        return(invisible(COMPARE))
    }
}

#' sort_then_join
#'
#' Convenience function to sort two strings and paste them together
#' @param string1 first string
#' @param string2 second string
#' @keywords internal
sort_then_join <- function(string1, string2) {
    paste0(sort(c(string1, string2)), collapse = "")
}


#' class_merge
#'
#' Convenience function to put all classes an object has into one string
#' @param x an object
#' @keywords internal
class_merge <- function(x) {
    paste(class(x), collapse = "_")
}


get_message <- function(colname, whichdat, totype) {
    message(paste0(
        "NOTE: Variable ", colname, " in ", tolower(whichdat), " was casted to ", totype
    ))
}


#' get_casted_vector
#'
#' casts a vector depending on its type and input
#' @param colin column to cast
#' @param colname name of vector
#' @param whichdat whether base or compare is being casted (used for messages)
#' @importFrom methods is
#' @keywords internal
get_casted_vector <- function(colin, colname, whichdat) {
    if (is(colin, "factor")) {
        get_message(colname, whichdat, "character")
        return(as.character(colin))
    }

    if (is(colin, "integer")) {
        get_message(colname, whichdat, "numeric")
        return(as.numeric(colin))
    }

    colin
}



#' get_casted_dataset
#'
#' Internal utility function to loop across a dataset casting all target
#' variables
#' @param df dataset to be casted
#' @param columns columns to be casted
#' @param whichdat whether base or compare is being casted (used for messages)
#' @keywords internal
get_casted_dataset <- function(df, columns, whichdat) {
    for (col in columns) {
        df[[col]] <- get_casted_vector(df[[col]], col, whichdat)
    }
    return(df)
}






#' cast_variables
#'
#' Function to cast datasets columns if they have differing types
#' Restricted to specific cases, currently integer and double, and character and factor
#' @param BASE base dataset
#' @param COMPARE comparison dataset
#' @param ignore_vars Variables not to be considered for casting
#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles
#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters
#' @keywords internal
cast_variables <- function(
    BASE,
    COMPARE,
    ignore_vars = NULL,
    cast_integers = FALSE,
    cast_factors = FALSE
) {

    allowed_class_casts <- c("integernumeric", "characterfactor")[c(cast_integers, cast_factors)]

    BASE_class <- data.frame(
        class_BASE = sapply(BASE, class_merge),
        colname = names(BASE),
        stringsAsFactors = FALSE
    )
    BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars, , drop = FALSE]


    COMPARE_class <- data.frame(
        class_COMPARE = sapply(COMPARE, class_merge),
        colname = names(COMPARE),
        stringsAsFactors = FALSE
    )
    COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars, , drop = FALSE]

    common_class <- merge(
        x = BASE_class,
        y = COMPARE_class,
        by = "colname"
    )


    diff_class <- common_class[common_class[["class_BASE"]] != common_class[["class_COMPARE"]], , drop = FALSE]


    diff_class$classmerge <- mapply(
        sort_then_join,
        diff_class$class_COMPARE,
        diff_class$class_BASE
    )


    cast_columns <- diff_class[diff_class[["classmerge"]] %in% allowed_class_casts, , drop = FALSE]


    DATASETS <- list(
        "BASE" = BASE,
        "COMPARE" = COMPARE
    )

    if (nrow(cast_columns) == 0) {
        return(DATASETS)
    }


    for (i in names(DATASETS)) {
        DATASETS[[i]] <- get_casted_dataset(DATASETS[[i]], cast_columns$colname, i)
    }

    return(DATASETS)
}

#' is_variable_different
#'
#' This subsets the data set on the variable name, picks out differences and returns a `tibble`
#' of differences for the given variable
#' @importFrom tibble as_tibble
#' @param variablename name of variable being compared
#' @param keynames name of keys
#' @param datain Inputted dataset with base and compare vectors
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @return A boolean vector which is T if target and current are different
#' @keywords internal
is_variable_different <- function(variablename, keynames, datain, ...) {
    xvar <- paste0(variablename, ".x")
    yvar <- paste0(variablename, ".y")

    assertthat::assert_that(
        xvar %in% names(datain) && yvar %in% names(datain),
        msg = "Variable does not exist within input dataset"
    )

    target <- datain[[xvar]]
    current <- datain[[yvar]]
    outvect <- find_difference(target, current, ...)

    datain[["VARIABLE"]] <- variablename

    names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")

    x <- as_tibble(
        subset(
            datain,
            outvect,
            select = c("VARIABLE", keynames, "BASE", "COMPARE")
        )
    )

    return(x)
}

#' compare_vectors
#'
#' Compare two vectors looking for differences
#'
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @keywords internal
compare_vectors <- function(target, current, ...) {
    UseMethod("compare_vectors")
}


#' find_difference
#'
#' This determines if two vectors are different. It expects vectors of the same
#' length and type, and is intended to be used after checks have already been done
#' Initially picks out any `NA`'s (matching `NA`'s count as a match)
#' Then compares remaining vector
#'
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @keywords internal
find_difference <- function(target, current, ...) {
    if (length(target) != length(current)) {
        warning("Inputs are not of the same length")
        return(NULL)
    }

    if (is.null(target) || is.null(current)) {
        return(is.null(target) != is.null(current))
    }

    ### Initalise output, assume problem unless evidence otherwise
    return_vector <- rep(TRUE, length(target))

    nas_t <- is.na(target)
    nas_c <- is.na(current)

    ## compare missing values
    nacompare <- nas_t != nas_c
    naselect <- nas_t | nas_c
    return_vector[naselect] <- nacompare[naselect]

    ## compare non-missing values
    selectvector <- as.logical((!nas_t) * (!nas_c))

    comparevect <- compare_vectors(
        target[selectvector],
        current[selectvector],
        ...
    )

    return_vector[selectvector] <- comparevect

    return(return_vector)
}







#' compare_vectors.default
#'
#' Default method, if the vector is not numeric or factor. Basic comparison
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @keywords internal
compare_vectors.default <- function(target, current, ...) {
    target != current
}




#' compare_vectors.factor
#'
#' Compares factors. Sets them as character and then compares
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @keywords internal
compare_vectors.factor <- function(target, current, ...) {
    as.character(target) != as.character(current)
}





#' compare_vectors.numeric
#'
#' This is a modified version of the all.equal function
#' which returns a vector rather than a message
#' @param target the base vector
#' @param current a vector to compare target to
#' @param tolerance Level of tolerance for differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
#' @param ... Not used
#' @keywords internal
compare_vectors.numeric <- function(
    target,
    current,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL,
    ...
) {

    out <- target == current

    if (all(out)) {
        return(!out)
    }

    if (is.integer(target) || is.integer(current)) {
        target <- as.double(target)
        current <- as.double(current)
    }

    xy <- abs(target - current)

    if (!is.null(scale)) {
        xy <- xy / scale
    }

    return(xy > tolerance)
}

#' compare_vectors.int64
#'
#' Handle int64 vectors. Uses numeric comparison
#' @param target the base vector
#' @param current a vector to compare target to
#' @param tolerance Level of tolerance for differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
#' @param ... Not used
#' @keywords internal
compare_vectors.integer64 <- function(
    target,
    current,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL,
    ...
) {
    compare_vectors.numeric(target, current, tolerance, scale)
}

#' identify_extra_rows
#'
#' Identifies rows that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
#' @keywords internal
identify_extra_rows <- function(DS1, DS2, KEYS) {
    if (nrow(DS2) == 0 || nrow(DS1) == 0) {
        return(DS1[, KEYS, drop = FALSE])
    }
    DS2[["..FLAG.."]] <- "Y"
    dat <- merge(
        subset(DS1, select = KEYS),
        subset(DS2, select = c(KEYS, "..FLAG..")),
        by = KEYS, all.x = TRUE,
        sort = TRUE
    )
    dat <- dat[do.call("order", dat[KEYS]), ]

    dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
}



#' identify_extra_cols
#'
#' Identifies columns that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @importFrom tibble tibble
#' @keywords internal
identify_extra_cols <- function(DS1, DS2) {
    match.cols <- sapply(names(DS1), "%in%", names(DS2))
    assertthat::assert_that(
        all(is.logical(match.cols)),
        msg = "Assumption of logical return type is not true"
    )
    tibble(
        COLUMNS = names(DS1)[!match.cols]
    )
}







#' identify_matching_cols
#'
#' Identifies columns with the same name in two data frames
#' @param DS1 Input dataset 1 (data frame)
#' @param DS2 Input dataset 2 (data frame)
#' @param EXCLUDE Columns to ignore
#' @keywords internal
identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") {
    match_cols <- sapply(names(DS1), "%in%", names(DS2))
    exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE)
    names(DS1)[match_cols & !exclude_cols]
}





#' identify_unsupported_cols
#'
#' Identifies any columns for which the package is not setup to handle
#' @param dsin input dataset
#' @keywords internal
identify_unsupported_cols <- function(dsin) {
    dat <- subset(
        identify_properties(dsin),
        select = c("VARIABLE", "MODE")
    )

    dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE]
}



#' identify_mode_differences
#'
#' Identifies any mode differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @keywords internal
identify_mode_differences <- function(BASE, COMP) {
    matching_cols <- identify_matching_cols(BASE, COMP)

    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        suffixes = c(".BASE", ".COMP"),
        sort = TRUE
    )
    dat <- subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP"))

    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]

    dat[KEEP1 & KEEP2, , drop = FALSE]
}



#' identify_class_differences
#'
#' Identifies any class differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @keywords internal
identify_class_differences <- function(BASE, COMP) {
    matching_cols <- identify_matching_cols(BASE, COMP)

    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        sort = TRUE,
        suffixes = c(".BASE", ".COMP")
    )

    dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP"))

    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- !mapply(
        identical,
        dat[["CLASS.BASE"]],
        dat[["CLASS.COMP"]]
    )

    dat[KEEP1 & KEEP2, , drop = FALSE]
}



#' Identify differences in attributes
#'
#' Identifies any attribute differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param exclude_cols Columns to exclude from comparison
#' @importFrom tibble tibble
#' @keywords internal
identify_att_differences <- function(BASE, COMP, exclude_cols = "") {
    matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols)

    PROPS <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        sort = TRUE,
        suffixes = c(".BASE", ".COMP")
    )

    PROPS <- subset(PROPS, select = c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP"))

    PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE]


    ### Setup dummy return value
    RETURN <- tibble(
        VARIABLE = character(),
        ATTR_NAME = character(),
        VALUES.BASE = list(),
        VALUES.COMP = list()
    )

    for (i in PROPS[["VARIABLE"]]) {
        PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE]

        ### Get a vector of all available attributes across both variables
        ATTRIB_NAMES <- unique(c(
            names(PROPS_filt[["ATTRIBS.BASE"]][[1]]),
            names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
        ))

        ### If variable has no attributes move onto the next variable
        if (is.null(ATTRIB_NAMES)) next()

        ### Loop over each attribute checking if they are identical and outputing
        ### anyones that arn't
        for (j in ATTRIB_NAMES) {
            ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
            ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j]

            if (!identical(ATTRIB_BASE, ATTRIB_COMP)) {
                ATT_DIFFS <- tibble(
                    VARIABLE = i,
                    ATTR_NAME = j,
                    VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE),
                    VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP)
                )

                RETURN <- rbind(RETURN, ATT_DIFFS)
            }
        }
    }
    return(RETURN)
}





#' identify_differences
#'
#' Compares each column within 2 datasets to identify any values which they
#' mismatch on.
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
#' @param exclude_cols Columns to exclude from comparison
#' @param tolerance Level of tolerance for numeric differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
#' @keywords internal
identify_differences <- function(
    BASE,
    COMP,
    KEYS,
    exclude_cols,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
) {

    matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))

    if (length(matching_cols) == 0) {
        return(tibble())
    }

    DAT <- merge(
        x = BASE,
        y = COMP,
        by = KEYS,
        suffixes = c(".x", ".y"),
        sort = TRUE
    )
    if (nrow(DAT) == 0) {
        return(tibble())
    }
    DAT <- DAT[do.call("order", DAT[KEYS]), ]

    matching_list <- mapply(
        is_variable_different,
        matching_cols,
        MoreArgs = list(
            keynames = KEYS,
            datain = DAT,
            tolerance = tolerance,
            scale = scale
        ),
        SIMPLIFY = FALSE
    )

    matching_list
}









#' identify_properties
#'
#' Returns a dataframe of metadata for a given dataset.
#' Returned values include variable names , class , mode , type & attributes
#' @param dsin input dataframe that you want to get the metadata from
#' @importFrom tibble tibble
#' @keywords internal
identify_properties <- function(dsin) {
    ### If missing or null return empty dataset
    if (is.null(dsin)) {
        x <- tibble(
            VARIABLE = character(),
            CLASS = list(),
            MODE = character(),
            TYPE = character(),
            ATTRIBS = list()
        )
        return(x)
    }

    tibble(
        VARIABLE = names(dsin),
        CLASS = lapply(dsin, class),
        MODE = sapply(dsin, mode),
        TYPE = sapply(dsin, typeof),
        ATTRIBS = lapply(dsin, attributes)
    )
}


#' Find column ordering differences
#'
#' Compares two datasets and outputs a table listing any differences in the column
#' orders between the two datasets. Columns that are not contained within both
#' are ignored however column ordering is derived prior to removing these columns.
#'
#' @param BASE (`data.frame`)\cr Base dataset for comparison
#' @param COMP (`data.frame`)\cr Comparator dataset to compare base against
#' @keywords internal
identify_column_order_differences <- function(BASE, COMP) {
    base_cols <- tibble(
        COLUMN = names(BASE),
        "BASE-INDEX" = seq_along(names(BASE))
    )
    comp_cols <- tibble(
        COLUMN = names(COMP),
        "COMPARE-INDEX" = seq_along(names(COMP))
    )
    col_index <- merge(
        base_cols,
        comp_cols,
        by = c("COLUMN"),
        all = TRUE,
        sort = FALSE
    )
    keep_rows <- col_index[["BASE-INDEX"]] != col_index[["COMPARE-INDEX"]]
    keep_rows[is.na(keep_rows)] <- FALSE
    col_index[keep_rows, , drop = FALSE]
}

#' factor_to_character
#'
#' Takes a dataframe and converts any factor variables to character
#' @param dsin input dataframe
#' @param vars variables to consider for conversion. Default NULL will consider
#' every variable within the dataset
#' @keywords internal
factor_to_character <- function(dsin, vars = NULL) {
    if (is.null(vars)) vars <- names(dsin)

    for (var in vars) {
        if (is.factor(dsin[[var]])) {
            dsin[[var]] <- as.character(dsin[[var]])
        }
    }
    return(dsin)
}




#' has_unique_rows
#'
#' Check if a data sets rows are unique
#' @param DAT input data set (data frame)
#' @param KEYS Set of keys which should be unique
#' @keywords internal
has_unique_rows <- function(DAT, KEYS) {
    DUPS <- duplicated(subset(DAT, select = KEYS))
    NDUPS <- sum(DUPS)
    return(NDUPS == 0)
}

#' convert_to_issue
#'
#' converts the count value into the correct issue format
#' @param datin data inputted
#' @importFrom tibble rownames_to_column
#' @keywords internal
convert_to_issue <- function(datin) {
    datin_tibble <- tibble(
        `Variable` = names(datin),
        `No of Differences` = datin
    )

    datin_tibble_reduced <- datin_tibble[datin_tibble[["No of Differences"]] > 0, , drop = FALSE]
    return(datin_tibble_reduced)
}

#' Describe the datasets being compared
#'
#' This function is used to produce a basic summary table of the core
#' features of the two `data.frame`'s being compared.
#' @param base (`data.frame`)\cr base dataset to be described
#' @param comp (`data.frame`)\cr comparison dataset to be described
#' @param base_name (`character`)\cr name of the base dataset
#' @param comp_name (`character`)\cr name of the comparison dataset
#' @keywords internal
describe_dataframe <- function(base, comp, base_name, comp_name) {
    tibble(
        PROPERTY = list(
            "Name",
            "Class",
            "Rows(#)",
            "Columns(#)"
        ),
        BASE = c(
            base_name,
            paste(class(base), collapse = ", "),
            as.character(nrow(base)),
            as.character(ncol(base))
        ),
        COMP = c(
            comp_name,
            paste(class(comp), collapse = ", "),
            as.character(nrow(comp)),
            as.character(ncol(comp))
        )
    )
}

#' construct_issue
#'
#' Make an s3 object with class issue and possible additional class,
#' and assign other arguments to attributes
#' @param value the value of the object
#' @param message the value of the message attribute
#' @param add_class additional class to add
#' @keywords internal
construct_issue <- function(value, message, add_class = NULL) {
    x <- value

    ### If nothing has been provided return nothing !
    if (nrow(x) == 0) {
        return(NULL)
    }

    class(x) <- c(add_class, "issue", class(x))
    attributes(x)[["message"]] <- message
    return(x)
}


#' get_issue_message
#'
#' Simple function to grab the issue message
#' @param object inputted object of class issue
#' @param ... other arguments
#' @keywords internal
get_issue_message <- function(object, ...) {
    return(attr(object, "message"))
}


#' get_print_message
#'
#' Get the required text depending on type of issue
#' @param object inputted object of class issue
#' @param ... other arguments
#' @keywords internal
get_print_message <- function(object, ...) {
    UseMethod("get_print_message", object)
}


#' get_print_message.default
#'
#' Errors, as this should only ever be given an issue
#' @param object issue
#' @param ... Not used
#' @keywords internal
get_print_message.default <- function(object, ...) {
    stop("Error: An issue has not been provided to this function!")
}


#' get_print_message.issue
#'
#' Get text from a basic issue, based on the class of the value of the issue
#'
#' @param object an object of class issue_basic
#' @inheritParams print.diffdf
#' @keywords internal
get_print_message.issue <- function(object, row_limit, ...) {
    paste(
        c(
            attr(object, "message"),
            get_table(object, row_limit = row_limit),
            "\n\n"
        ),
        collapse = "\n"
    )
}

1		#' Generate unique key name
2		#'
3		#' Function to generate a name for the keys if not provided
4		#'
5		#' @param BASE base dataset
6		#' @param COMP comparison dataset
7		#' @param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity
8		#'
9		#' @keywords internal
10		generate_keyname <- function(
11		BASE,
12		COMP,
13		replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
14		) {
15	199x	assertthat::assert_that(
16	199x	is(replace_names, "character"),
17	199x	msg = "replace_names is not a character vector"
18		)
19
20	199x	assertthat::assert_that(
21	199x	length(replace_names) != 0,
22	199x	msg = "All default row names are in use in BASE/COMPARE. Please provide a KEY argument"
23		)
24
25
26	191x	key_name <- replace_names[1]
27
28	191x	if (!is.null(BASE[[key_name]]) \|\| !is.null(COMP[[key_name]])) {
29	66x	key_name <- generate_keyname(BASE, COMP, replace_names[-1])
30		}
31	159x	return(key_name)
32		}

1		#' Identify Issue Rows
2		#'
3		#' This function takes a `diffdf` object and a dataframe and subsets
4		#' the `data.frame` for problem rows as identified in the comparison object.
5		#' If \code{vars} has been specified only issue rows associated with those
6		#' variable(s) will be returned.
7		#' @param df dataframe to be subsetted
8		#' @param diff diffdf object
9		#' @param vars (optional) character vector containing names of issue variables to subset dataframe
10		#' on. A value of NULL (default) will be taken to mean available issue variables.
11		#' @examples
12		#' iris2 <- iris
13		#' for (i in 1:3) iris2[i, i] <- 99
14		#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
15		#' diffdf_issuerows(iris, x)
16		#' diffdf_issuerows(iris2, x)
17		#' diffdf_issuerows(iris2, x, vars = "Sepal.Length")
18		#' diffdf_issuerows(iris2, x, vars = c("Sepal.Length", "Sepal.Width"))
19		#' @details
20		#' Note that `diffdf_issuerows` can be used to subset against any dataframe. The only
21		#' requirement is that the original variables specified in the keys argument to diffdf
22		#' are present on the dataframe you are subsetting against. However please note that if
23		#' no keys were specified in diffdf then the row number is used. This means using
24		#' `diffdf_issuerows` without a keys against an arbitrary dataset can easily result in
25		#' nonsense rows being returned. It is always recommended to supply keys to diffdf.
26		#' @export
27		diffdf_issuerows <- function(df, diff, vars = NULL) {
28
29	19x	assertthat::assert_that(
30	19x	class(diff)[[1]] == "diffdf"
31		)
32
33
34	19x	KEYS_ATT <- attr(diff, "keys")
35
36	19x	assertthat::assert_that(
37	19x	!is.null(KEYS_ATT),
38	19x	msg = "diff is missing the keys attribute"
39		)
40
41
42	19x	issue_vars <- names(diff)[grep("^VarDiff_", names(diff))]
43
44	19x	if (is.null(vars)) {
45	15x	vars <- issue_vars
46		} else {
47	4x	vars <- paste0("VarDiff_", vars)
48		}
49
50	19x	if (length(issue_vars) == 0 \|\| sum(vars %in% issue_vars) == 0) {
51	7x	return(df[FALSE, ])
52		}
53
54	12x	KEEP <- mapply(
55	12x	FUN = get_issue_dataset,
56	12x	issue = vars,
57	12x	diff = list(diff),
58	12x	SIMPLIFY = FALSE
59		)
60
61	12x	KEEP <- recursive_reduce(KEEP, rbind)
62	12x	KEEP <- KEEP[!duplicated(KEEP), ]
63
64	12x	if (KEYS_ATT$is_derived) {
65	2x	df[[KEYS_ATT$value]] <- seq_len(nrow(df))
66		}
67
68	12x	keys <- KEYS_ATT$value
69
70	12x	if (any(!keys %in% names(df))) {
71	1x	stop("df does not contain all variables specified as keys in diff")
72		}
73
74	11x	RET <- merge(
75	11x	x = df,
76	11x	y = KEEP,
77	11x	sort = TRUE
78		)
79
80	11x	RET <- RET[do.call("order", RET[keys]), ]
81
82	11x	if (KEYS_ATT$is_derived) {
83	2x	keep_vars <- !names(RET) %in% KEYS_ATT$value
84	2x	RET <- RET[, keep_vars, drop = FALSE]
85		}
86
87	11x	return(RET)
88		}
89
90
91
92
93		#' get_issue_dataset
94		#'
95		#' Internal function used by `diffdf_issuerows` to extract the dataframe
96		#' from each a target issue. In particular it also strips off any
97		#' non-key variables
98		#' @param issue name of issue to extract the dataset from diff
99		#' @param diff diffdf object which contains issues
100		#' @keywords internal
101		get_issue_dataset <- function(issue, diff) {
102	20x	issue_df <- diff[[issue]]
103	20x	keep <- names(issue_df)[!(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
104	20x	issue_df[, keep, drop = FALSE]
105		}

1		#' Print diffdf objects
2		#'
3		#' Print a nicely formatted version of a diffdf object.
4		#'
5		#' @param x A comparison object created by \code{diffdf()}.
6		#' @param ... Additional arguments (not used).
7		#' @param row_limit Maximum number of rows to display in difference tables.
8		#' Use \code{NULL} to show all rows. Default is 10.
9		#' @param as_string Logical. If \code{TRUE}, returns the printed message as an R
10		#' character vector instead of printing to the console. Default is \code{FALSE}.
11		#' @param file A connection or a character string naming the file to print to. If
12		#' \code{NULL} (the default), output is printed to the console.
13		#'
14		#' @examples
15		#' x <- subset(iris, -Species)
16		#' x[1, 2] <- 5
17		#' COMPARE <- diffdf(iris, x)
18		#' print(COMPARE)
19		#' print(COMPARE, row_limit = 5)
20		#' \dontrun{
21		#' print(COMPARE, file = "output.txt")
22		#' }
23		#'
24		#' @export
25		print.diffdf <- function(x, row_limit = 10, as_string = FALSE, file = NULL, ...) {
26	33x	if (!is.null(row_limit)) {
27	32x	assertthat::assert_that(
28	32x	assertthat::is.number(row_limit),
29	32x	row_limit > 0,
30	32x	msg = "row_limit must be a positive integer"
31		)
32		}
33	29x	assertthat::assert_that(
34	29x	assertthat::is.flag(as_string)
35		)
36	27x	COMPARE <- x
37
38	27x	if (length(COMPARE) == 0) {
39	6x	outtext <- "No issues were found!\n"
40		} else {
41	21x	start_text <- paste0("Differences found between the objects!\n\n")
42	21x	end_text <- lapply(COMPARE, function(x) get_print_message(x, row_limit))
43	21x	end_text <- paste0(unlist(end_text), collapse = "")
44	21x	outtext <- paste0(start_text, end_text)
45		}
46
47	27x	string_content <- strsplit(outtext, "\n")[[1]]
48	27x	if (!is.null(file)) {
49	3x	tryCatch(
50		{
51	3x	sink(file)
52	3x	cat(string_content, sep = "\n")
53	3x	sink()
54		},
55	3x	warning = function(w) {
56	!	sink()
57	!	warning(w)
58		},
59	3x	error = function(e) {
60	!	sink()
61	!	stop(e)
62		}
63		)
64	3x	return(invisible(COMPARE))
65		}
66
67	24x	if (as_string) {
68	9x	return(string_content)
69		} else {
70	15x	cat(outtext)
71	15x	return(invisible(COMPARE))
72		}
73		}

1		#' sort_then_join
2		#'
3		#' Convenience function to sort two strings and paste them together
4		#' @param string1 first string
5		#' @param string2 second string
6		#' @keywords internal
7		sort_then_join <- function(string1, string2) {
8	14x	paste0(sort(c(string1, string2)), collapse = "")
9		}
10
11
12		#' class_merge
13		#'
14		#' Convenience function to put all classes an object has into one string
15		#' @param x an object
16		#' @keywords internal
17		class_merge <- function(x) {
18	304x	paste(class(x), collapse = "_")
19		}
20
21
22		get_message <- function(colname, whichdat, totype) {
23	12x	message(paste0(
24	12x	"NOTE: Variable ", colname, " in ", tolower(whichdat), " was casted to ", totype
25		))
26		}
27
28
29		#' get_casted_vector
30		#'
31		#' casts a vector depending on its type and input
32		#' @param colin column to cast
33		#' @param colname name of vector
34		#' @param whichdat whether base or compare is being casted (used for messages)
35		#' @importFrom methods is
36		#' @keywords internal
37		get_casted_vector <- function(colin, colname, whichdat) {
38	24x	if (is(colin, "factor")) {
39	6x	get_message(colname, whichdat, "character")
40	6x	return(as.character(colin))
41		}
42
43	18x	if (is(colin, "integer")) {
44	6x	get_message(colname, whichdat, "numeric")
45	6x	return(as.numeric(colin))
46		}
47
48	12x	colin
49		}
50
51
52
53		#' get_casted_dataset
54		#'
55		#' Internal utility function to loop across a dataset casting all target
56		#' variables
57		#' @param df dataset to be casted
58		#' @param columns columns to be casted
59		#' @param whichdat whether base or compare is being casted (used for messages)
60		#' @keywords internal
61		get_casted_dataset <- function(df, columns, whichdat) {
62	22x	for (col in columns) {
63	24x	df[[col]] <- get_casted_vector(df[[col]], col, whichdat)
64		}
65	22x	return(df)
66		}
67
68
69
70
71
72
73		#' cast_variables
74		#'
75		#' Function to cast datasets columns if they have differing types
76		#' Restricted to specific cases, currently integer and double, and character and factor
77		#' @param BASE base dataset
78		#' @param COMPARE comparison dataset
79		#' @param ignore_vars Variables not to be considered for casting
80		#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles
81		#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters
82		#' @keywords internal
83		cast_variables <- function(
84		BASE,
85		COMPARE,
86		ignore_vars = NULL,
87		cast_integers = FALSE,
88		cast_factors = FALSE
89		) {
90
91	13x	allowed_class_casts <- c("integernumeric", "characterfactor")[c(cast_integers, cast_factors)]
92
93	13x	BASE_class <- data.frame(
94	13x	class_BASE = sapply(BASE, class_merge),
95	13x	colname = names(BASE),
96	13x	stringsAsFactors = FALSE
97		)
98	13x	BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars, , drop = FALSE]
99
100
101	13x	COMPARE_class <- data.frame(
102	13x	class_COMPARE = sapply(COMPARE, class_merge),
103	13x	colname = names(COMPARE),
104	13x	stringsAsFactors = FALSE
105		)
106	13x	COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars, , drop = FALSE]
107
108	13x	common_class <- merge(
109	13x	x = BASE_class,
110	13x	y = COMPARE_class,
111	13x	by = "colname"
112		)
113
114
115	13x	diff_class <- common_class[common_class[["class_BASE"]] != common_class[["class_COMPARE"]], , drop = FALSE]
116
117
118	13x	diff_class$classmerge <- mapply(
119	13x	sort_then_join,
120	13x	diff_class$class_COMPARE,
121	13x	diff_class$class_BASE
122		)
123
124
125	13x	cast_columns <- diff_class[diff_class[["classmerge"]] %in% allowed_class_casts, , drop = FALSE]
126
127
128	13x	DATASETS <- list(
129	13x	"BASE" = BASE,
130	13x	"COMPARE" = COMPARE
131		)
132
133	13x	if (nrow(cast_columns) == 0) {
134	2x	return(DATASETS)
135		}
136
137
138	11x	for (i in names(DATASETS)) {
139	22x	DATASETS[[i]] <- get_casted_dataset(DATASETS[[i]], cast_columns$colname, i)
140		}
141
142	11x	return(DATASETS)
143		}

1		#' is_variable_different
2		#'
3		#' This subsets the data set on the variable name, picks out differences and returns a `tibble`
4		#' of differences for the given variable
5		#' @importFrom tibble as_tibble
6		#' @param variablename name of variable being compared
7		#' @param keynames name of keys
8		#' @param datain Inputted dataset with base and compare vectors
9		#' @param ... Additional arguments which might be passed through (numerical accuracy)
10		#' @return A boolean vector which is T if target and current are different
11		#' @keywords internal
12		is_variable_different <- function(variablename, keynames, datain, ...) {
13	931x	xvar <- paste0(variablename, ".x")
14	931x	yvar <- paste0(variablename, ".y")
15
16	931x	assertthat::assert_that(
17	931x	xvar %in% names(datain) && yvar %in% names(datain),
18	931x	msg = "Variable does not exist within input dataset"
19		)
20
21	931x	target <- datain[[xvar]]
22	931x	current <- datain[[yvar]]
23	931x	outvect <- find_difference(target, current, ...)
24
25	931x	datain[["VARIABLE"]] <- variablename
26
27	931x	names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")
28
29	931x	x <- as_tibble(
30	931x	subset(
31	931x	datain,
32	931x	outvect,
33	931x	select = c("VARIABLE", keynames, "BASE", "COMPARE")
34		)
35		)
36
37	931x	return(x)
38		}
39
40		#' compare_vectors
41		#'
42		#' Compare two vectors looking for differences
43		#'
44		#' @param target the base vector
45		#' @param current a vector to compare target to
46		#' @param ... Additional arguments which might be passed through (numerical accuracy)
47		#' @keywords internal
48		compare_vectors <- function(target, current, ...) {
49	964x	UseMethod("compare_vectors")
50		}
51
52
53		#' find_difference
54		#'
55		#' This determines if two vectors are different. It expects vectors of the same
56		#' length and type, and is intended to be used after checks have already been done
57		#' Initially picks out any `NA`'s (matching `NA`'s count as a match)
58		#' Then compares remaining vector
59		#'
60		#' @param target the base vector
61		#' @param current a vector to compare target to
62		#' @param ... Additional arguments which might be passed through (numerical accuracy)
63		#' @keywords internal
64		find_difference <- function(target, current, ...) {
65	967x	if (length(target) != length(current)) {
66	2x	warning("Inputs are not of the same length")
67	2x	return(NULL)
68		}
69
70	965x	if (is.null(target) \|\| is.null(current)) {
71	1x	return(is.null(target) != is.null(current))
72		}
73
74		### Initalise output, assume problem unless evidence otherwise
75	964x	return_vector <- rep(TRUE, length(target))
76
77	964x	nas_t <- is.na(target)
78	964x	nas_c <- is.na(current)
79
80		## compare missing values
81	964x	nacompare <- nas_t != nas_c
82	964x	naselect <- nas_t \| nas_c
83	964x	return_vector[naselect] <- nacompare[naselect]
84
85		## compare non-missing values
86	964x	selectvector <- as.logical((!nas_t) * (!nas_c))
87
88	964x	comparevect <- compare_vectors(
89	964x	target[selectvector],
90	964x	current[selectvector],
91		...
92		)
93
94	964x	return_vector[selectvector] <- comparevect
95
96	964x	return(return_vector)
97		}
98
99
100
101
102
103
104
105		#' compare_vectors.default
106		#'
107		#' Default method, if the vector is not numeric or factor. Basic comparison
108		#' @param target the base vector
109		#' @param current a vector to compare target to
110		#' @param ... Additional arguments which might be passed through (numerical accuracy)
111		#' @keywords internal
112		compare_vectors.default <- function(target, current, ...) {
113	411x	target != current
114		}
115
116
117
118
119		#' compare_vectors.factor
120		#'
121		#' Compares factors. Sets them as character and then compares
122		#' @param target the base vector
123		#' @param current a vector to compare target to
124		#' @param ... Additional arguments which might be passed through (numerical accuracy)
125		#' @keywords internal
126		compare_vectors.factor <- function(target, current, ...) {
127	83x	as.character(target) != as.character(current)
128		}
129
130
131
132
133
134		#' compare_vectors.numeric
135		#'
136		#' This is a modified version of the all.equal function
137		#' which returns a vector rather than a message
138		#' @param target the base vector
139		#' @param current a vector to compare target to
140		#' @param tolerance Level of tolerance for differences between two variables
141		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
142		#' @param ... Not used
143		#' @keywords internal
144		compare_vectors.numeric <- function(
145		target,
146		current,
147		tolerance = sqrt(.Machine$double.eps),
148		scale = NULL,
149		...
150		) {
151
152	470x	out <- target == current
153
154	470x	if (all(out)) {
155	439x	return(!out)
156		}
157
158	31x	if (is.integer(target) \|\| is.integer(current)) {
159	7x	target <- as.double(target)
160	7x	current <- as.double(current)
161		}
162
163	31x	xy <- abs(target - current)
164
165	31x	if (!is.null(scale)) {
166	4x	xy <- xy / scale
167		}
168
169	31x	return(xy > tolerance)
170		}
171
172		#' compare_vectors.int64
173		#'
174		#' Handle int64 vectors. Uses numeric comparison
175		#' @param target the base vector
176		#' @param current a vector to compare target to
177		#' @param tolerance Level of tolerance for differences between two variables
178		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
179		#' @param ... Not used
180		#' @keywords internal
181		compare_vectors.integer64 <- function(
182		target,
183		current,
184		tolerance = sqrt(.Machine$double.eps),
185		scale = NULL,
186		...
187		) {
188	4x	compare_vectors.numeric(target, current, tolerance, scale)
189		}

1		#' diffdf
2		#' @description
3		#' Compares 2 dataframes and outputs any differences.
4		#' @param base input dataframe
5		#' @param compare comparison dataframe
6		#' @param keys vector of variables (as strings) that defines a unique row in
7		#' the base and compare dataframes
8		#' @param strict_numeric Flag for strict numeric to numeric comparisons
9		#' (default = TRUE). If False diffdf will cast integer to double where
10		#' required for comparisons. Note that variables specified in the keys
11		#' will never be casted.
12		#' @param strict_factor Flag for strict factor to character comparisons
13		#' (default = TRUE). If False diffdf will cast factors to characters where
14		#' required for comparisons. Note that variables specified in the keys will
15		#' never be casted.
16		#' @param suppress_warnings Do you want to suppress warnings? (logical)
17		#' @param file Location and name of a text file to output the results to.
18		#' Setting to NULL will cause no file to be produced.
19		#' @param tolerance Set tolerance for numeric comparisons. Note that
20		#' comparisons fail if (x-y)/scale > tolerance.
21		#' @param scale Set scale for numeric comparisons. Note that comparisons fail
22		#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient
23		#' version of scale = 1.
24		#' @param check_column_order Should the column ordering be checked? (logical)
25		#' @param check_df_class Do you want to check for differences in the class
26		#' between `base` and `compare`? (logical)
27		#' @examples
28		#' x <- subset(iris, -Species)
29		#' x[1, 2] <- 5
30		#' COMPARE <- diffdf(iris, x)
31		#' print(COMPARE)
32		#'
33		#' #### Sample data frames
34		#'
35		#' DF1 <- data.frame(
36		#' id = c(1, 2, 3, 4, 5, 6),
37		#' v1 = letters[1:6],
38		#' v2 = c(NA, NA, 1, 2, 3, NA)
39		#' )
40		#'
41		#' DF2 <- data.frame(
42		#' id = c(1, 2, 3, 4, 5, 7),
43		#' v1 = letters[1:6],
44		#' v2 = c(NA, NA, 1, 2, NA, NA),
45		#' v3 = c(NA, NA, 1, 2, NA, 4)
46		#' )
47		#'
48		#' diffdf(DF1, DF1, keys = "id")
49		#'
50		#' # We can control matching with scale/location for example:
51		#'
52		#' DF1 <- data.frame(
53		#' id = c(1, 2, 3, 4, 5, 6),
54		#' v1 = letters[1:6],
55		#' v2 = c(1, 2, 3, 4, 5, 6)
56		#' )
57		#' DF2 <- data.frame(
58		#' id = c(1, 2, 3, 4, 5, 6),
59		#' v1 = letters[1:6],
60		#' v2 = c(1.1, 2, 3, 4, 5, 6)
61		#' )
62		#'
63		#' diffdf(DF1, DF2, keys = "id")
64		#' diffdf(DF1, DF2, keys = "id", tolerance = 0.2)
65		#' diffdf(DF1, DF2, keys = "id", scale = 10, tolerance = 0.2)
66		#'
67		#' # We can use strict_factor to compare factors with characters for example:
68		#'
69		#' DF1 <- data.frame(
70		#' id = c(1, 2, 3, 4, 5, 6),
71		#' v1 = letters[1:6],
72		#' v2 = c(NA, NA, 1, 2, 3, NA),
73		#' stringsAsFactors = FALSE
74		#' )
75		#'
76		#' DF2 <- data.frame(
77		#' id = c(1, 2, 3, 4, 5, 6),
78		#' v1 = letters[1:6],
79		#' v2 = c(NA, NA, 1, 2, 3, NA)
80		#' )
81		#'
82		#' diffdf(DF1, DF2, keys = "id", strict_factor = TRUE)
83		#' diffdf(DF1, DF2, keys = "id", strict_factor = FALSE)
84		#'
85		#' @export
86		diffdf <- function(
87		base,
88		compare,
89		keys = NULL,
90		suppress_warnings = FALSE,
91		strict_numeric = TRUE,
92		strict_factor = TRUE,
93		file = NULL,
94		tolerance = sqrt(.Machine$double.eps),
95		scale = NULL,
96		check_column_order = FALSE,
97		check_df_class = FALSE
98		) {
99
100	131x	assertthat::assert_that(
101	131x	assertthat::is.flag(check_df_class),
102	131x	!is.na(check_df_class),
103	131x	msg = "`check_df_class` must be a length 1 logical"
104		)
105
106	131x	BASE <- base
107	131x	COMP <- compare
108	131x	KEYS <- keys
109	131x	SUPWARN <- suppress_warnings
110
111		### Initatiate output object
112	131x	COMPARE <- list()
113	131x	class(COMPARE) <- c("diffdf", "list")
114
115
116	131x	BASE_NAME <- deparse(substitute(base), width.cutoff = 30L)
117	131x	if (length(BASE_NAME) > 1) {
118	2x	BASE_NAME <- "<BASE>"
119		}
120	131x	COMP_NAME <- deparse(substitute(compare), width.cutoff = 30L)
121	131x	if (length(COMP_NAME) > 1) {
122	!	COMP_NAME <- "<COMPARE>"
123		}
124
125	131x	COMPARE[["DataSummary"]] <- construct_issue(
126	131x	value = describe_dataframe(BASE, COMP, BASE_NAME, COMP_NAME),
127	131x	message = "Summary of BASE and COMPARE"
128		)
129
130
131	131x	is_derived <- FALSE
132
133		### If no key is suplied match values based upon row number
134	131x	if (is.null(KEYS)) {
135	108x	is_derived <- TRUE
136	108x	keyname <- generate_keyname(BASE, COMP)
137	108x	BASE[[keyname]] <- seq_len(nrow(BASE))
138	108x	COMP[[keyname]] <- seq_len(nrow(COMP))
139	108x	KEYS <- keyname
140		}
141	131x	attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived)
142
143	131x	assertthat::assert_that(
144	131x	is.numeric(tolerance),
145	131x	is.numeric(scale) \|\| is.null(scale)
146		)
147
148	127x	missing_keys_base <- KEYS[!KEYS %in% names(BASE)]
149	127x	assertthat::assert_that(
150	127x	length(missing_keys_base) == 0,
151	127x	msg = sprintf(
152	127x	"The following KEYS are not available in BASE:\n %s",
153	127x	paste(missing_keys_base, collapse = "\n ")
154		)
155		)
156
157	126x	missing_keys_comp <- KEYS[!KEYS %in% names(COMP)]
158	126x	assertthat::assert_that(
159	126x	length(missing_keys_comp) == 0,
160	126x	msg = sprintf(
161	126x	"The following KEYS are not available in COMPARE:\n %s",
162	126x	paste(missing_keys_comp, collapse = "\n ")
163		)
164		)
165
166	125x	assertthat::assert_that(
167	125x	has_unique_rows(BASE, KEYS),
168	125x	msg = "BY variables in BASE do not result in unique observations"
169		)
170
171	124x	assertthat::assert_that(
172	124x	has_unique_rows(COMP, KEYS),
173	124x	msg = "BY variables in COMPARE do not result in unique observations"
174		)
175
176
177
178		#### Check essential variable properties (class & mode)
179
180	124x	COMPARE[["UnsupportedColsBase"]] <- construct_issue(
181	124x	value = identify_unsupported_cols(BASE),
182	124x	message = "There are columns in BASE with unsupported modes !!"
183		)
184
185
186	124x	COMPARE[["UnsupportedColsComp"]] <- construct_issue(
187	124x	value = identify_unsupported_cols(COMP),
188	124x	message = "There are columns in COMPARE with unsupported modes !!"
189		)
190
191
192		# cast variables if strict is off
193	124x	if (!strict_factor \|\| !strict_numeric) {
194	9x	casted_df <- cast_variables(
195	9x	BASE = BASE,
196	9x	COMPARE = COMP,
197	9x	ignore_vars = KEYS,
198	9x	cast_integers = !strict_numeric,
199	9x	cast_factors = !strict_factor
200		)
201
202	9x	BASE <- casted_df$BASE
203	9x	COMP <- casted_df$COMP
204		}
205
206
207	124x	COMPARE[["VarModeDiffs"]] <- construct_issue(
208	124x	value = identify_mode_differences(BASE, COMP),
209	124x	message = "There are columns in BASE and COMPARE with different modes !!"
210		)
211
212
213	124x	COMPARE[["VarClassDiffs"]] <- construct_issue(
214	124x	value = identify_class_differences(BASE, COMP),
215	124x	message = "There are columns in BASE and COMPARE with different classes !!"
216		)
217
218
219
220
221		##### Check Validity of Keys
222
223	124x	BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
224	124x	COMP_keys <- names(COMP)[names(COMP) %in% KEYS]
225
226	124x	assertthat::assert_that(
227	124x	length(BASE_keys) == length(KEYS),
228	124x	msg = "BASE is missing variables specified in KEYS"
229		)
230
231	124x	assertthat::assert_that(
232	124x	length(COMP_keys) == length(KEYS),
233	124x	msg = "COMP is missing variables specified in KEYS"
234		)
235
236
237	124x	assert_valid_keys(
238	124x	COMPARE, KEYS, "UnsupportedColsBase",
239	124x	"The following KEYS in BASE have an unsupported mode (see `?mode()`)"
240		)
241	123x	assert_valid_keys(
242	123x	COMPARE, KEYS, "UnsupportedColsComp",
243	123x	"The following KEYS in COMPARE have an unsupported mode (see `?mode()`)"
244		)
245	122x	assert_valid_keys(
246	122x	COMPARE, KEYS, "VarModeDiffs",
247	122x	"The following KEYS have different modes between BASE and COMPARE"
248		)
249	120x	assert_valid_keys(
250	120x	COMPARE, KEYS, "VarClassDiffs",
251	120x	"The following KEYS have different classes between BASE and COMPARE"
252		)
253
254
255	119x	exclude_cols <- c(
256	119x	COMPARE[["UnsupportedColsBase"]]$VARIABLE,
257	119x	COMPARE[["UnsupportedColsComp"]]$VARIABLE,
258	119x	COMPARE[["VarClassDiffs"]]$VARIABLE,
259	119x	COMPARE[["VarModeDiffs"]]$VARIABLE
260		)
261
262	119x	if (check_column_order) {
263	5x	if (attr(COMPARE, "keys")$is_derived) {
264	4x	keep_vars_base <- !(names(BASE) %in% attr(COMPARE, "keys")$value)
265	4x	keep_vars_comp <- !(names(COMP) %in% attr(COMPARE, "keys")$value)
266		} else {
267	1x	keep_vars_base <- TRUE
268	1x	keep_vars_comp <- TRUE
269		}
270	5x	COMPARE[["ColumnOrder"]] <- construct_issue(
271	5x	value = identify_column_order_differences(
272	5x	BASE[, keep_vars_base, drop = FALSE],
273	5x	COMP[, keep_vars_comp, drop = FALSE]
274		),
275	5x	message = "There are differences in the column ordering between BASE and COMPARE !!"
276		)
277		}
278
279
280		##### Check Attributes
281	119x	COMPARE[["AttribDiffs"]] <- construct_issue(
282	119x	value = identify_att_differences(BASE, COMP, exclude_cols),
283	119x	message = "There are columns in BASE and COMPARE with differing attributes !!"
284		)
285
286
287		##### Check data
288
289	119x	BASE <- factor_to_character(BASE, KEYS)
290	119x	COMP <- factor_to_character(COMP, KEYS)
291
292
293	119x	COMPARE[["ExtRowsBase"]] <- construct_issue(
294	119x	value = identify_extra_rows(BASE, COMP, KEYS),
295	119x	message = "There are rows in BASE that are not in COMPARE !!"
296		)
297
298
299	119x	COMPARE[["ExtRowsComp"]] <- construct_issue(
300	119x	value = identify_extra_rows(COMP, BASE, KEYS),
301	119x	message = "There are rows in COMPARE that are not in BASE !!"
302		)
303
304
305	119x	COMPARE[["ExtColsBase"]] <- construct_issue(
306	119x	value = identify_extra_cols(BASE, COMP),
307	119x	message = "There are columns in BASE that are not in COMPARE !!"
308		)
309
310
311	119x	COMPARE[["ExtColsComp"]] <- construct_issue(
312	119x	value = identify_extra_cols(COMP, BASE),
313	119x	message = "There are columns in COMPARE that are not in BASE !!"
314		)
315
316
317	119x	VALUE_DIFFERENCES <- identify_differences(
318	119x	BASE, COMP, KEYS, exclude_cols,
319	119x	tolerance = tolerance,
320	119x	scale = scale
321		)
322
323
324
325		## Summarise the number of mismatching rows per variable
326
327	119x	if (length(VALUE_DIFFERENCES)) {
328	113x	NDIFF <- sapply(VALUE_DIFFERENCES, nrow)
329	113x	COMPARE[["NumDiff"]] <- construct_issue(
330	113x	value = convert_to_issue(NDIFF),
331	113x	message = "Not all Values Compared Equal"
332		)
333		}
334
335
336	119x	for (i in names(VALUE_DIFFERENCES)) {
337	931x	COMPARE[[paste0("VarDiff_", i)]] <- construct_issue(
338	931x	value = VALUE_DIFFERENCES[[i]],
339	931x	message = NULL
340		)
341		}
342
343
344		# suppress warning message of data summary if user didn't request to check it
345		# we leave the issue in the main compare object though for printing purposes
346	119x	COMPARE_WARNINGS <- COMPARE
347	119x	attr(COMPARE_WARNINGS[["DataSummary"]], "message") <- c(
348	119x	"There are differences between the class of BASE and COMPARE"
349		)
350	119x	if (!check_df_class \|\| identical(class(base), class(compare))) {
351	117x	COMPARE_WARNINGS["DataSummary"] <- NULL
352		}
353
354		# Get all issue messages, remove blank message, and collapse into single string
355	119x	ISSUE_MSGS <- sapply(COMPARE_WARNINGS, function(x) get_issue_message(x))
356	119x	ISSUE_MSGS <- Filter(function(x) !is.null(x), ISSUE_MSGS)
357	119x	ISSUE_MSGS <- Filter(function(x) x != "", ISSUE_MSGS)
358
359	119x	if (length(ISSUE_MSGS) != 0) {
360	80x	if (!SUPWARN) {
361	45x	ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n")
362	45x	warning(c("\n", ISSUE_MSGS))
363		}
364		}
365
366		# If the classes are the same and it is the only entry in the compare
367		# object then remove it in order to trigger "no issues found"
368	119x	if (identical(class(base), class(compare)) && length(COMPARE) == 1) {
369	38x	COMPARE["DataSummary"] <- NULL
370		}
371
372		# If the summary is the only item and the user didn't want to check classes
373		# then remove the object to trigger the "no issues found"
374	119x	if (!check_df_class && length(COMPARE) == 1) {
375	1x	COMPARE["DataSummary"] <- NULL
376		}
377
378
379	119x	if (!is.null(file)) {
380	1x	print(COMPARE, file = file)
381		}
382
383	119x	return(COMPARE)
384		}
385
386
387
388
389		#' diffdf_has_issues
390		#'
391		#' Utility function which returns TRUE if an diffdf
392		#' object has issues or FALSE if an diffdf object does not have issues
393		#' @param x diffdf object
394		#' @examples
395		#'
396		#' # Example with no issues
397		#' x <- diffdf(iris, iris)
398		#' diffdf_has_issues(x)
399		#'
400		#' # Example with issues
401		#' iris2 <- iris
402		#' iris2[2, 2] <- NA
403		#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
404		#' diffdf_has_issues(x)
405		#' @export
406		diffdf_has_issues <- function(x) {
407	!	if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object")
408	3x	length(x) != 0
409		}
410
411
412		#' Assert that keys are valid
413		#'
414		#' Utility function to check that user provided "keys" aren't listed as a problem
415		#' variable of the current list of issues.
416		#' @param COMPARE (`list`)\cr A named list of which each element is a `data.frame` with the
417		#' column `VARIABLE`
418		#' @param KEYS (`character`)\cr name of key variables to check to make sure they don't contain
419		#' any issues
420		#' @param component (`character`)\cr name of the component within `COMPARE` to check against
421		#' @param msg (`character`)\cr error message to print if any of `KEYS` are found within
422		#' `COMPARE[component]$VARIABLE`
423		#' @keywords internal
424		assert_valid_keys <- function(COMPARE, KEYS, component, msg) {
425	489x	keys_reduced <- KEYS[KEYS %in% COMPARE[[component]]$VARIABLE]
426	489x	assertthat::assert_that(
427	489x	length(keys_reduced) == 0,
428	489x	msg = sprintf(
429	489x	"%s:\n%s",
430	489x	msg,
431	489x	paste0("`", paste0(keys_reduced, collapse = "`, `"), "`")
432		)
433		)
434		}

1		#' Pad String
2		#'
3		#' Utility function used to replicate `str_pad`. Adds white space to either end
4		#' of a string to get it to equal the desired length
5		#' @param x string
6		#' @param width desired length
7		#' @keywords internal
8		string_pad <- function(x, width) {
9	1295x	if (nchar(x) >= width) {
10	!	return(x)
11		}
12	1295x	width <- width - nchar(x)
13	1295x	left <- paste0(rep(" ", floor(width / 2)), collapse = "")
14	1295x	right <- paste0(rep(" ", ceiling(width / 2)), collapse = "")
15	1295x	paste0(left, x, right, collapse = "")
16		}
17
18
19		#' recursive_reduce
20		#'
21		#' Utility function used to replicated `purrr::reduce`. Recursively applies a
22		#' function to a list of elements until only 1 element remains
23		#' @param .l list of values to apply a function to
24		#' @param .f function to apply to each each element of the list in turn. See details.
25		#' @details
26		#' This function is essentially performing the following operation:
27		#' ```
28		#' .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]])
29		#' ```
30		#' @keywords internal
31		recursive_reduce <- function(.l, .f) {
32	587x	if (length(.l) != 1) {
33	386x	.l[[2]] <- .f(.l[[1]], .l[[2]])
34	386x	return(recursive_reduce(.l[-1], .f))
35		}
36	201x	.l[[1]]
37		}
38
39		#' invert
40		#'
41		#' Utility function used to replicated `purrr::transpose`. Turns a list inside
42		#' out.
43		#' @param x list
44		#' @keywords internal
45		invert <- function(x) {
46	63x	x2 <- list()
47	63x	cnames <- names(x)
48	63x	tnames <- names(x[[1]])
49	63x	for (i in tnames) {
50	189x	x2[[i]] <- list()
51	189x	for (j in cnames) {
52	567x	x2[[i]][[j]] <- x[[j]][[i]]
53		}
54		}
55	63x	x2
56		}
57
58
59
60
61		#' as_ascii_table
62		#'
63		#' This function takes a `data.frame` and attempts to convert it into
64		#' a simple ascii format suitable for printing to the screen
65		#' It is assumed all variable values have a `as.character()` method
66		#' in order to cast them to character.
67		#' @param dat Input dataset to convert into a ascii table
68		#' @param line_prefix Symbols to prefix in front of every line of the table
69		#' @keywords internal
70		as_ascii_table <- function(dat, line_prefix = " ") {
71	63x	n_col <- ncol(dat)
72	63x	n_row <- nrow(dat)
73
74		## Convert every value to character and crop to a suitable length
75	63x	dat_char <- lapply(dat, as_fmt_char)
76
77
78	63x	hold <- list()
79	63x	COLS <- colnames(dat)
80
81		### For each column extract core elements (width, values , title) and pad out
82		### each string to be a suitable length
83	63x	for (i in seq_len(n_col)) {
84	189x	COL <- COLS[i]
85	189x	VALUES <- dat_char[[i]]
86
87	189x	JOINT <- c(COL, VALUES)
88	189x	WIDTH <- max(sapply(JOINT, nchar)) + 2
89
90	189x	hold[[COL]] <- list()
91	189x	hold[[COL]]$WIDTH <- WIDTH
92	189x	hold[[COL]]$VALUES <- sapply(VALUES, string_pad, width = WIDTH)
93	189x	hold[[COL]]$HEADER <- sapply(COL, string_pad, width = WIDTH)
94		}
95
96		### Collapse into a single value per component ( title , values, width )
97	63x	thold <- invert(hold)
98	63x	tvals <- recursive_reduce(thold$VALUES, paste0)
99	63x	thead <- recursive_reduce(thold$HEADER, paste0)
100	63x	twidth <- recursive_reduce(thold$WIDTH, sum)
101
102		### Create header and footer lines
103	63x	TLINE <- paste0(rep("=", twidth), collapse = "")
104	63x	LINE <- paste0(rep("-", twidth), collapse = "")
105	63x	FVALS <- paste0(line_prefix, tvals, collapse = "\n")
106
107		### Output table
108	63x	paste0(
109	63x	line_prefix, TLINE, "\n",
110	63x	line_prefix, thead, "\n",
111	63x	line_prefix, LINE, "\n",
112	63x	FVALS, "\n",
113	63x	line_prefix, LINE
114		)
115		}
116
117
118		#' as_character
119		#'
120		#' Stub function to enable mocking in unit tests
121		as_character <- as.character
122
123		#' Format vector to printable string
124		#'
125		#' Coerces a vector of any type into a printable string. The most
126		#' significant transformation is performed on existing character
127		#' vectors which will be truncated, have newlines converted
128		#' to explicit symbols and will be wrapped in quotes if they
129		#' contain white space.
130		#'
131		#' @param x (`vector`) \cr vector to be converted to character
132		#' @param add_quotes (`logical`) \cr if true will wrap strings that contain
133		#' whitespace with quotes
134		#' @param crop_at (`numeric`) \cr specifies the limit at which strings should
135		#' be truncated to
136		#' @param ... additional arguments (not currently used)
137		#'
138		#' @name as_fmt_char
139		#' @keywords internal
140		as_fmt_char <- function(x, ...) {
141	292x	UseMethod("as_fmt_char")
142		}
143
144		#' @rdname as_fmt_char
145		#' @export
146		as_fmt_char.numeric <- function(x, ...) {
147	56x	format(x, digits = 7, justify = "right")
148		}
149
150		#' @rdname as_fmt_char
151		#' @export
152		as_fmt_char.NULL <- function(x, ...) {
153	!	"<NULL>"
154		}
155
156		#' @importFrom utils capture.output
157		#' @rdname as_fmt_char
158		#' @export
159		as_fmt_char.list <- function(x, ...) {
160	29x	vapply(
161	29x	x,
162	29x	function(x) {
163	92x	if (is.numeric(x)) {
164	!	return(as_fmt_char(x))
165		}
166	92x	if (is.character(x) & length(x) == 1) {
167	86x	return(as_fmt_char(x))
168		}
169	6x	as_fmt_char(
170	6x	paste(capture.output(dput(x)), collapse = " "),
171	6x	add_quotes = FALSE
172		)
173		},
174	29x	character(1)
175		)
176		}
177
178		#' @rdname as_fmt_char
179		#' @export
180		as_fmt_char.factor <- function(x, ...) {
181	3x	as_fmt_char(as.character(x))
182		}
183
184		#' @rdname as_fmt_char
185		#' @export
186		as_fmt_char.character <- function(x, add_quotes = TRUE, crop_at = 30, ...) {
187	197x	needs_quotes <- grepl("\\s", x) & add_quotes
188
189	197x	x[is.na(x)] <- "<NA>"
190
191		# Replace \nl \cr with tags to stop print message splitting over
192		# multiple lines
193	197x	x <- gsub("\x0D", "<cr>", x)
194	197x	x <- gsub("\x0A", "<nl>", x)
195
196	197x	charlength <- vapply(x, nchar, numeric(1))
197	197x	x <- substr(x, 1, crop_at)
198	197x	x[charlength > crop_at] <- paste0(x[charlength > crop_at], "...")
199
200		# Add enclosing " " around strings with white space so that it can be
201		# clearly identified in the printed output
202	197x	x[needs_quotes] <- paste0('"', x[needs_quotes], '"')
203
204	197x	x
205		}
206
207
208		#' @rdname as_fmt_char
209		#' @export
210		as_fmt_char.default <- function(x, ...) {
211	8x	x_char <- as_character(x)
212	8x	assertthat::assert_that(
213	8x	is.character(x_char),
214	8x	msg = sprintf(
215	8x	"Unable to convert class `'%s'` to character for printing purposes",
216	8x	paste(class(x), collapse = "', '")
217		)
218		)
219	7x	as_fmt_char.character(x_char, add_quotes = FALSE)
220		}
221
222
223		#' @rdname as_fmt_char
224		#' @export
225		as_fmt_char.POSIXt <- function(x, ...) {
226	6x	x <- format(x, "%Y-%m-%d %H:%M:%S %Z")
227	6x	x[is.na(x)] <- "<NA>"
228	6x	x
229		}
230
231
232		#' get_table
233		#'
234		#' Generate nice looking table from a data frame
235		#' @param dsin dataset
236		#' @inheritParams print.diffdf
237		#' @keywords internal
238		get_table <- function(dsin, row_limit = 10) {
239	60x	if (nrow(dsin) == 0) {
240	!	return("")
241		}
242	60x	if (!is.null(row_limit)) {
243	57x	assertthat::assert_that(
244	57x	assertthat::is.number(row_limit),
245	57x	row_limit > 0,
246	57x	msg = "row_limit must be a positive integer"
247		)
248		}
249	60x	if (is.null(row_limit)) {
250	3x	display_table <- dsin
251		} else {
252	57x	display_table <- subset(dsin, seq_len(nrow(dsin)) < (row_limit + 1))
253		}
254
255	60x	add_message <- if (!is.null(row_limit) && nrow(dsin) > row_limit) {
256	7x	paste0(
257	7x	"First ",
258	7x	row_limit,
259	7x	" of ",
260	7x	nrow(dsin),
261	7x	" rows are shown in table below"
262		)
263		} else {
264	53x	NULL
265		}
266
267	60x	msg <- paste(
268	60x	c(
269	60x	add_message,
270	60x	as_ascii_table(display_table)
271		),
272	60x	collapse = "\n"
273		)
274	60x	msg
275		}

1		#' identify_extra_rows
2		#'
3		#' Identifies rows that are in a baseline dataset but not in a comparator dataset
4		#' @param DS1 Baseline dataset (data frame)
5		#' @param DS2 Comparator dataset (data frame)
6		#' @param KEYS List of variables that define a unique row within the datasets (strings)
7		#' @keywords internal
8		identify_extra_rows <- function(DS1, DS2, KEYS) {
9	238x	if (nrow(DS2) == 0 \|\| nrow(DS1) == 0) {
10	10x	return(DS1[, KEYS, drop = FALSE])
11		}
12	228x	DS2[["..FLAG.."]] <- "Y"
13	228x	dat <- merge(
14	228x	subset(DS1, select = KEYS),
15	228x	subset(DS2, select = c(KEYS, "..FLAG..")),
16	228x	by = KEYS, all.x = TRUE,
17	228x	sort = TRUE
18		)
19	228x	dat <- dat[do.call("order", dat[KEYS]), ]
20
21	228x	dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
22		}
23
24
25
26		#' identify_extra_cols
27		#'
28		#' Identifies columns that are in a baseline dataset but not in a comparator dataset
29		#' @param DS1 Baseline dataset (data frame)
30		#' @param DS2 Comparator dataset (data frame)
31		#' @importFrom tibble tibble
32		#' @keywords internal
33		identify_extra_cols <- function(DS1, DS2) {
34	238x	match.cols <- sapply(names(DS1), "%in%", names(DS2))
35	238x	assertthat::assert_that(
36	238x	all(is.logical(match.cols)),
37	238x	msg = "Assumption of logical return type is not true"
38		)
39	238x	tibble(
40	238x	COLUMNS = names(DS1)[!match.cols]
41		)
42		}
43
44
45
46
47
48
49
50		#' identify_matching_cols
51		#'
52		#' Identifies columns with the same name in two data frames
53		#' @param DS1 Input dataset 1 (data frame)
54		#' @param DS2 Input dataset 2 (data frame)
55		#' @param EXCLUDE Columns to ignore
56		#' @keywords internal
57		identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") {
58	796x	match_cols <- sapply(names(DS1), "%in%", names(DS2))
59	796x	exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE)
60	796x	names(DS1)[match_cols & !exclude_cols]
61		}
62
63
64
65
66
67		#' identify_unsupported_cols
68		#'
69		#' Identifies any columns for which the package is not setup to handle
70		#' @param dsin input dataset
71		#' @keywords internal
72		identify_unsupported_cols <- function(dsin) {
73	248x	dat <- subset(
74	248x	identify_properties(dsin),
75	248x	select = c("VARIABLE", "MODE")
76		)
77
78	248x	dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE]
79		}
80
81
82
83		#' identify_mode_differences
84		#'
85		#' Identifies any mode differences between two data frames
86		#' @param BASE Base dataset for comparison (data.frame)
87		#' @param COMP Comparator dataset to compare base against (data.frame)
88		#' @keywords internal
89		identify_mode_differences <- function(BASE, COMP) {
90	424x	matching_cols <- identify_matching_cols(BASE, COMP)
91
92	424x	dat <- merge(
93	424x	x = identify_properties(BASE),
94	424x	y = identify_properties(COMP),
95	424x	by = "VARIABLE",
96	424x	all = TRUE,
97	424x	suffixes = c(".BASE", ".COMP"),
98	424x	sort = TRUE
99		)
100	424x	dat <- subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP"))
101
102	424x	KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
103	424x	KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]
104
105	424x	dat[KEEP1 & KEEP2, , drop = FALSE]
106		}
107
108
109
110		#' identify_class_differences
111		#'
112		#' Identifies any class differences between two data frames
113		#' @param BASE Base dataset for comparison (data.frame)
114		#' @param COMP Comparator dataset to compare base against (data.frame)
115		#' @keywords internal
116		identify_class_differences <- function(BASE, COMP) {
117	124x	matching_cols <- identify_matching_cols(BASE, COMP)
118
119	124x	dat <- merge(
120	124x	x = identify_properties(BASE),
121	124x	y = identify_properties(COMP),
122	124x	by = "VARIABLE",
123	124x	all = TRUE,
124	124x	sort = TRUE,
125	124x	suffixes = c(".BASE", ".COMP")
126		)
127
128	124x	dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP"))
129
130	124x	KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
131	124x	KEEP2 <- !mapply(
132	124x	identical,
133	124x	dat[["CLASS.BASE"]],
134	124x	dat[["CLASS.COMP"]]
135		)
136
137	124x	dat[KEEP1 & KEEP2, , drop = FALSE]
138		}
139
140
141
142		#' Identify differences in attributes
143		#'
144		#' Identifies any attribute differences between two data frames
145		#' @param BASE Base dataset for comparison (data.frame)
146		#' @param COMP Comparator dataset to compare base against (data.frame)
147		#' @param exclude_cols Columns to exclude from comparison
148		#' @importFrom tibble tibble
149		#' @keywords internal
150		identify_att_differences <- function(BASE, COMP, exclude_cols = "") {
151	129x	matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols)
152
153	129x	PROPS <- merge(
154	129x	x = identify_properties(BASE),
155	129x	y = identify_properties(COMP),
156	129x	by = "VARIABLE",
157	129x	all = TRUE,
158	129x	sort = TRUE,
159	129x	suffixes = c(".BASE", ".COMP")
160		)
161
162	129x	PROPS <- subset(PROPS, select = c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP"))
163
164	129x	PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE]
165
166
167		### Setup dummy return value
168	129x	RETURN <- tibble(
169	129x	VARIABLE = character(),
170	129x	ATTR_NAME = character(),
171	129x	VALUES.BASE = list(),
172	129x	VALUES.COMP = list()
173		)
174
175	129x	for (i in PROPS[["VARIABLE"]]) {
176	1179x	PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE]
177
178		### Get a vector of all available attributes across both variables
179	1179x	ATTRIB_NAMES <- unique(c(
180	1179x	names(PROPS_filt[["ATTRIBS.BASE"]][[1]]),
181	1179x	names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
182		))
183
184		### If variable has no attributes move onto the next variable
185	869x	if (is.null(ATTRIB_NAMES)) next()
186
187		### Loop over each attribute checking if they are identical and outputing
188		### anyones that arn't
189	310x	for (j in ATTRIB_NAMES) {
190	532x	ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
191	532x	ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j]
192
193	532x	if (!identical(ATTRIB_BASE, ATTRIB_COMP)) {
194	65x	ATT_DIFFS <- tibble(
195	65x	VARIABLE = i,
196	65x	ATTR_NAME = j,
197	65x	VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE),
198	65x	VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP)
199		)
200
201	65x	RETURN <- rbind(RETURN, ATT_DIFFS)
202		}
203		}
204		}
205	129x	return(RETURN)
206		}
207
208
209
210
211
212		#' identify_differences
213		#'
214		#' Compares each column within 2 datasets to identify any values which they
215		#' mismatch on.
216		#' @param BASE Base dataset for comparison (data.frame)
217		#' @param COMP Comparator dataset to compare base against (data.frame)
218		#' @param KEYS List of variables that define a unique row within the datasets (strings)
219		#' @param exclude_cols Columns to exclude from comparison
220		#' @param tolerance Level of tolerance for numeric differences between two variables
221		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
222		#' @keywords internal
223		identify_differences <- function(
224		BASE,
225		COMP,
226		KEYS,
227		exclude_cols,
228		tolerance = sqrt(.Machine$double.eps),
229		scale = NULL
230		) {
231
232	119x	matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))
233
234	119x	if (length(matching_cols) == 0) {
235	2x	return(tibble())
236		}
237
238	117x	DAT <- merge(
239	117x	x = BASE,
240	117x	y = COMP,
241	117x	by = KEYS,
242	117x	suffixes = c(".x", ".y"),
243	117x	sort = TRUE
244		)
245	117x	if (nrow(DAT) == 0) {
246	4x	return(tibble())
247		}
248	113x	DAT <- DAT[do.call("order", DAT[KEYS]), ]
249
250	113x	matching_list <- mapply(
251	113x	is_variable_different,
252	113x	matching_cols,
253	113x	MoreArgs = list(
254	113x	keynames = KEYS,
255	113x	datain = DAT,
256	113x	tolerance = tolerance,
257	113x	scale = scale
258		),
259	113x	SIMPLIFY = FALSE
260		)
261
262	113x	matching_list
263		}
264
265
266
267
268
269
270
271
272
273		#' identify_properties
274		#'
275		#' Returns a dataframe of metadata for a given dataset.
276		#' Returned values include variable names , class , mode , type & attributes
277		#' @param dsin input dataframe that you want to get the metadata from
278		#' @importFrom tibble tibble
279		#' @keywords internal
280		identify_properties <- function(dsin) {
281		### If missing or null return empty dataset
282	1602x	if (is.null(dsin)) {
283	!	x <- tibble(
284	!	VARIABLE = character(),
285	!	CLASS = list(),
286	!	MODE = character(),
287	!	TYPE = character(),
288	!	ATTRIBS = list()
289		)
290	!	return(x)
291		}
292
293	1602x	tibble(
294	1602x	VARIABLE = names(dsin),
295	1602x	CLASS = lapply(dsin, class),
296	1602x	MODE = sapply(dsin, mode),
297	1602x	TYPE = sapply(dsin, typeof),
298	1602x	ATTRIBS = lapply(dsin, attributes)
299		)
300		}
301
302
303		#' Find column ordering differences
304		#'
305		#' Compares two datasets and outputs a table listing any differences in the column
306		#' orders between the two datasets. Columns that are not contained within both
307		#' are ignored however column ordering is derived prior to removing these columns.
308		#'
309		#' @param BASE (`data.frame`)\cr Base dataset for comparison
310		#' @param COMP (`data.frame`)\cr Comparator dataset to compare base against
311		#' @keywords internal
312		identify_column_order_differences <- function(BASE, COMP) {
313	8x	base_cols <- tibble(
314	8x	COLUMN = names(BASE),
315	8x	"BASE-INDEX" = seq_along(names(BASE))
316		)
317	8x	comp_cols <- tibble(
318	8x	COLUMN = names(COMP),
319	8x	"COMPARE-INDEX" = seq_along(names(COMP))
320		)
321	8x	col_index <- merge(
322	8x	base_cols,
323	8x	comp_cols,
324	8x	by = c("COLUMN"),
325	8x	all = TRUE,
326	8x	sort = FALSE
327		)
328	8x	keep_rows <- col_index[["BASE-INDEX"]] != col_index[["COMPARE-INDEX"]]
329	8x	keep_rows[is.na(keep_rows)] <- FALSE
330	8x	col_index[keep_rows, , drop = FALSE]
331		}

1		#' factor_to_character
2		#'
3		#' Takes a dataframe and converts any factor variables to character
4		#' @param dsin input dataframe
5		#' @param vars variables to consider for conversion. Default NULL will consider
6		#' every variable within the dataset
7		#' @keywords internal
8		factor_to_character <- function(dsin, vars = NULL) {
9	!	if (is.null(vars)) vars <- names(dsin)
10
11	238x	for (var in vars) {
12	252x	if (is.factor(dsin[[var]])) {
13	!	dsin[[var]] <- as.character(dsin[[var]])
14		}
15		}
16	238x	return(dsin)
17		}
18
19
20
21
22		#' has_unique_rows
23		#'
24		#' Check if a data sets rows are unique
25		#' @param DAT input data set (data frame)
26		#' @param KEYS Set of keys which should be unique
27		#' @keywords internal
28		has_unique_rows <- function(DAT, KEYS) {
29	249x	DUPS <- duplicated(subset(DAT, select = KEYS))
30	249x	NDUPS <- sum(DUPS)
31	249x	return(NDUPS == 0)
32		}
33
34		#' convert_to_issue
35		#'
36		#' converts the count value into the correct issue format
37		#' @param datin data inputted
38		#' @importFrom tibble rownames_to_column
39		#' @keywords internal
40		convert_to_issue <- function(datin) {
41	113x	datin_tibble <- tibble(
42	113x	`Variable` = names(datin),
43	113x	`No of Differences` = datin
44		)
45
46	113x	datin_tibble_reduced <- datin_tibble[datin_tibble[["No of Differences"]] > 0, , drop = FALSE]
47	113x	return(datin_tibble_reduced)
48		}
49
50		#' Describe the datasets being compared
51		#'
52		#' This function is used to produce a basic summary table of the core
53		#' features of the two `data.frame`'s being compared.
54		#' @param base (`data.frame`)\cr base dataset to be described
55		#' @param comp (`data.frame`)\cr comparison dataset to be described
56		#' @param base_name (`character`)\cr name of the base dataset
57		#' @param comp_name (`character`)\cr name of the comparison dataset
58		#' @keywords internal
59		describe_dataframe <- function(base, comp, base_name, comp_name) {
60	131x	tibble(
61	131x	PROPERTY = list(
62	131x	"Name",
63	131x	"Class",
64	131x	"Rows(#)",
65	131x	"Columns(#)"
66		),
67	131x	BASE = c(
68	131x	base_name,
69	131x	paste(class(base), collapse = ", "),
70	131x	as.character(nrow(base)),
71	131x	as.character(ncol(base))
72		),
73	131x	COMP = c(
74	131x	comp_name,
75	131x	paste(class(comp), collapse = ", "),
76	131x	as.character(nrow(comp)),
77	131x	as.character(ncol(comp))
78		)
79		)
80		}

1		#' construct_issue
2		#'
3		#' Make an s3 object with class issue and possible additional class,
4		#' and assign other arguments to attributes
5		#' @param value the value of the object
6		#' @param message the value of the message attribute
7		#' @param add_class additional class to add
8		#' @keywords internal
9		construct_issue <- function(value, message, add_class = NULL) {
10	2271x	x <- value
11
12		### If nothing has been provided return nothing !
13	2271x	if (nrow(x) == 0) {
14	1979x	return(NULL)
15		}
16
17	292x	class(x) <- c(add_class, "issue", class(x))
18	292x	attributes(x)[["message"]] <- message
19	292x	return(x)
20		}
21
22
23		#' get_issue_message
24		#'
25		#' Simple function to grab the issue message
26		#' @param object inputted object of class issue
27		#' @param ... other arguments
28		#' @keywords internal
29		get_issue_message <- function(object, ...) {
30	152x	return(attr(object, "message"))
31		}
32
33
34		#' get_print_message
35		#'
36		#' Get the required text depending on type of issue
37		#' @param object inputted object of class issue
38		#' @param ... other arguments
39		#' @keywords internal
40		get_print_message <- function(object, ...) {
41	60x	UseMethod("get_print_message", object)
42		}
43
44
45		#' get_print_message.default
46		#'
47		#' Errors, as this should only ever be given an issue
48		#' @param object issue
49		#' @param ... Not used
50		#' @keywords internal
51		get_print_message.default <- function(object, ...) {
52	!	stop("Error: An issue has not been provided to this function!")
53		}
54
55
56		#' get_print_message.issue
57		#'
58		#' Get text from a basic issue, based on the class of the value of the issue
59		#'
60		#' @param object an object of class issue_basic
61		#' @inheritParams print.diffdf
62		#' @keywords internal
63		get_print_message.issue <- function(object, row_limit, ...) {
64	60x	paste(
65	60x	c(
66	60x	attr(object, "message"),
67	60x	get_table(object, row_limit = row_limit),
68	60x	"\n\n"
69		),
70	60x	collapse = "\n"
71		)
72		}