diffdf coverage - 94.71%

Files
Source

#' Identify Issue Rows
#'
#' This function takes a `diffdf` object and a dataframe and subsets
#' the `data.frame` for problem rows as identified in the comparison object.
#' If \code{vars} has been specified only issue rows associated with those
#' variable(s) will be returned.
#' @param df dataframe to be subsetted
#' @param diff diffdf object
#' @param vars (optional) character vector containing names of issue variables to subset dataframe
#' on. A value of NULL (default) will be taken to mean available issue variables.
#' @examples
#' iris2 <- iris
#' for (i in 1:3) iris2[i, i] <- 99
#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
#' diffdf_issuerows(iris, x)
#' diffdf_issuerows(iris2, x)
#' diffdf_issuerows(iris2, x, vars = "Sepal.Length")
#' diffdf_issuerows(iris2, x, vars = c("Sepal.Length", "Sepal.Width"))
#' @details
#' Note that `diffdf_issuerows` can be used to subset against any dataframe. The only
#' requirement is that the original variables specified in the keys argument to diffdf
#' are present on the dataframe you are subsetting against. However please note that if
#' no keys were specified in diffdf then the row number is used. This means using
#' `diffdf_issuerows` without a keys against an arbitrary dataset can easily result in
#' nonsense rows being returned. It is always recommended to supply keys to diffdf.
#' @export
diffdf_issuerows <- function(df, diff, vars = NULL) {

    assertthat::assert_that(
        class(diff)[[1]] == "diffdf"
    )


    KEYS_ATT <- attr(diff, "keys")

    assertthat::assert_that(
        !is.null(KEYS_ATT),
        msg = "diff is missing the keys attribute"
    )


    issue_vars <- names(diff)[grep("^VarDiff_", names(diff))]

    if (is.null(vars)) {
        vars <- issue_vars
    } else {
        vars <- paste0("VarDiff_", vars)
    }

    if (length(issue_vars) == 0 || sum(vars %in% issue_vars) == 0) {
        return(df[FALSE, ])
    }

    KEEP <- mapply(
        FUN      = get_issue_dataset,
        issue    = vars,
        diff     = list(diff),
        SIMPLIFY = FALSE
    )

    KEEP <- recursive_reduce(KEEP, rbind)
    KEEP <- KEEP[!duplicated(KEEP), ]

    if (KEYS_ATT$is_derived) {
        df[[KEYS_ATT$value]] <- seq_len(nrow(df))
    }

    keys <- KEYS_ATT$value

    if (any(!keys %in% names(df))) {
        stop("df does not contain all variables specified as keys in diff")
    }

    RET <- merge(
        x = df,
        y = KEEP,
        sort = TRUE
    )

    RET <- RET[do.call("order", RET[keys]), ]

    if (KEYS_ATT$is_derived) {
        keep_vars <- !names(RET) %in% KEYS_ATT$value
        RET <- RET[, keep_vars, drop = FALSE]
    }

    return(RET)
}




#' get_issue_dataset
#'
#' Internal function used by `diffdf_issuerows` to extract the dataframe
#' from each a target issue. In particular it also strips off any
#' non-key variables
#' @param issue name of issue to extract the dataset from diff
#' @param diff diffdf object which contains issues
#' @keywords internal
get_issue_dataset <- function(issue, diff) {
    issue_df <- diff[[issue]]
    keep <- names(issue_df)[!(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
    issue_df[, keep, drop = FALSE]
}

#' diffdf
#' @description
#' Compares 2 dataframes and outputs any differences.
#' @param base input dataframe
#' @param compare comparison dataframe
#' @param keys vector of variables (as strings) that defines a unique row in
#' the base and compare dataframes
#' @param strict_numeric Flag for strict numeric to numeric comparisons
#' (default = TRUE). If False diffdf will cast integer to double where
#' required for comparisons. Note that variables specified in the keys
#' will never be casted.
#' @param strict_factor Flag for strict factor to character comparisons
#' (default = TRUE). If False diffdf will cast factors to characters where
#' required for comparisons. Note that variables specified in the keys will
#' never be casted.
#' @param suppress_warnings Do you want to suppress warnings? (logical)
#' @param file Location and name of a text file to output the results to.
#' Setting to NULL will cause no file to be produced.
#' @param tolerance Set tolerance for numeric comparisons. Note that
#' comparisons fail if (x-y)/scale > tolerance.
#' @param scale Set scale for numeric comparisons. Note that comparisons fail
#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient
#' version of scale = 1.
#' @param check_column_order Should the column ordering be checked? (logical)
#' @param check_df_class Do you want to check for differences in the class
#' between `base` and `compare`? (logical)
#' @examples
#' x <- subset(iris, -Species)
#' x[1, 2] <- 5
#' COMPARE <- diffdf(iris, x)
#' print(COMPARE)
#'
#' #### Sample data frames
#'
#' DF1 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(NA, NA, 1, 2, 3, NA)
#' )
#'
#' DF2 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 7),
#'     v1 = letters[1:6],
#'     v2 = c(NA, NA, 1, 2, NA, NA),
#'     v3 = c(NA, NA, 1, 2, NA, 4)
#' )
#'
#' diffdf(DF1, DF1, keys = "id")
#'
#' # We can control matching with scale/location for example:
#'
#' DF1 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(1, 2, 3, 4, 5, 6)
#' )
#' DF2 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(1.1, 2, 3, 4, 5, 6)
#' )
#'
#' diffdf(DF1, DF2, keys = "id")
#' diffdf(DF1, DF2, keys = "id", tolerance = 0.2)
#' diffdf(DF1, DF2, keys = "id", scale = 10, tolerance = 0.2)
#'
#' # We can use strict_factor to compare factors with characters for example:
#'
#' DF1 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(NA, NA, 1, 2, 3, NA),
#'     stringsAsFactors = FALSE
#' )
#'
#' DF2 <- data.frame(
#'     id = c(1, 2, 3, 4, 5, 6),
#'     v1 = letters[1:6],
#'     v2 = c(NA, NA, 1, 2, 3, NA)
#' )
#'
#' diffdf(DF1, DF2, keys = "id", strict_factor = TRUE)
#' diffdf(DF1, DF2, keys = "id", strict_factor = FALSE)
#'
#' @export
diffdf <- function(
    base,
    compare,
    keys = NULL,
    suppress_warnings = FALSE,
    strict_numeric = TRUE,
    strict_factor = TRUE,
    file = NULL,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL,
    check_column_order = FALSE,
    check_df_class = FALSE
) {

    assertthat::assert_that(
        assertthat::is.flag(check_df_class),
        !is.na(check_df_class),
        msg = "`check_df_class` must be a length 1 logical"
    )

    BASE <- base
    COMP <- compare
    KEYS <- keys
    SUPWARN <- suppress_warnings

    ### Initatiate output object
    COMPARE <- list()
    class(COMPARE) <- c("diffdf", "list")


    BASE_NAME <- deparse(substitute(base))
    COMP_NAME <- deparse(substitute(compare))
    COMPARE[["DataSummary"]] <- construct_issue(
        value = describe_dataframe(BASE, COMP, BASE_NAME, COMP_NAME),
        message = "Summary of BASE and COMPARE"
    )


    is_derived <- FALSE

    ### If no key is suplied match values based upon row number
    if (is.null(KEYS)) {
        is_derived <- TRUE
        keyname <- generate_keyname(BASE, COMP)
        BASE[[keyname]] <- seq_len(nrow(BASE))
        COMP[[keyname]] <- seq_len(nrow(COMP))
        KEYS <- keyname
    }
    attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived)

    assertthat::assert_that(
        is.numeric(tolerance),
        is.numeric(scale) || is.null(scale)
    )

    missing_keys_base <- KEYS[!KEYS %in% names(BASE)]
    assertthat::assert_that(
        length(missing_keys_base) == 0,
        msg = sprintf(
            "The following KEYS are not available in BASE:\n   %s",
            paste(missing_keys_base, collapse = "\n   ")
        )
    )

    missing_keys_comp <- KEYS[!KEYS %in% names(COMP)]
    assertthat::assert_that(
        length(missing_keys_comp) == 0,
        msg = sprintf(
            "The following KEYS are not available in COMPARE:\n   %s",
            paste(missing_keys_comp, collapse = "\n   ")
        )
    )

    assertthat::assert_that(
        has_unique_rows(BASE, KEYS),
        msg = "BY variables in BASE do not result in unique observations"
    )

    assertthat::assert_that(
        has_unique_rows(COMP, KEYS),
        msg = "BY variables in COMPARE do not result in unique observations"
    )



    #### Check essential variable properties (class & mode)

    COMPARE[["UnsupportedColsBase"]] <- construct_issue(
        value = identify_unsupported_cols(BASE),
        message = "There are columns in BASE with unsupported modes !!"
    )


    COMPARE[["UnsupportedColsComp"]] <- construct_issue(
        value = identify_unsupported_cols(COMP),
        message = "There are columns in COMPARE with unsupported modes !!"
    )


    # cast variables if strict is off
    if (!strict_factor || !strict_numeric) {
        casted_df <- cast_variables(
            BASE = BASE,
            COMPARE = COMP,
            ignore_vars = KEYS,
            cast_integers = !strict_numeric,
            cast_factors = !strict_factor
        )

        BASE <- casted_df$BASE
        COMP <- casted_df$COMP
    }


    COMPARE[["VarModeDiffs"]] <- construct_issue(
        value = identify_mode_differences(BASE, COMP),
        message = "There are columns in BASE and COMPARE with different modes !!"
    )


    COMPARE[["VarClassDiffs"]] <- construct_issue(
        value = identify_class_differences(BASE, COMP),
        message = "There are columns in BASE and COMPARE with different classes !!"
    )




    ##### Check Validity of Keys

    BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
    COMP_keys <- names(COMP)[names(COMP) %in% KEYS]

    assertthat::assert_that(
        length(BASE_keys) == length(KEYS),
        msg = "BASE is missing variables specified in KEYS"
    )

    assertthat::assert_that(
        length(COMP_keys) == length(KEYS),
        msg = "COMP is missing variables specified in KEYS"
    )


    assert_valid_keys(
        COMPARE, KEYS, "UnsupportedColsBase",
        "The following KEYS in BASE have an unsupported mode (see `?mode()`)"
    )
    assert_valid_keys(
        COMPARE, KEYS, "UnsupportedColsComp",
        "The following KEYS in COMPARE have an unsupported mode (see `?mode()`)"
    )
    assert_valid_keys(
        COMPARE, KEYS, "VarModeDiffs",
        "The following KEYS have different modes between BASE and COMPARE"
    )
    assert_valid_keys(
        COMPARE, KEYS, "VarClassDiffs",
        "The following KEYS have different classes between BASE and COMPARE"
    )


    exclude_cols <- c(
        COMPARE[["UnsupportedColsBase"]]$VARIABLE,
        COMPARE[["UnsupportedColsComp"]]$VARIABLE,
        COMPARE[["VarClassDiffs"]]$VARIABLE,
        COMPARE[["VarModeDiffs"]]$VARIABLE
    )

    if (check_column_order) {
        if (attr(COMPARE, "keys")$is_derived) {
            keep_vars_base <- !(names(BASE) %in% attr(COMPARE, "keys")$value)
            keep_vars_comp <- !(names(COMP) %in% attr(COMPARE, "keys")$value)
        } else {
            keep_vars_base <- TRUE
            keep_vars_comp <- TRUE
        }
        COMPARE[["ColumnOrder"]] <- construct_issue(
            value = identify_column_order_differences(
                BASE[, keep_vars_base, drop = FALSE],
                COMP[, keep_vars_comp, drop = FALSE]
            ),
            message = "There are differences in the column ordering between BASE and COMPARE !!"
        )
    }


    ##### Check Attributes
    COMPARE[["AttribDiffs"]] <- construct_issue(
        value = identify_att_differences(BASE, COMP, exclude_cols),
        message = "There are columns in BASE and COMPARE with differing attributes !!"
    )


    ##### Check data

    BASE <- factor_to_character(BASE, KEYS)
    COMP <- factor_to_character(COMP, KEYS)


    COMPARE[["ExtRowsBase"]] <- construct_issue(
        value = identify_extra_rows(BASE, COMP, KEYS),
        message = "There are rows in BASE that are not in COMPARE !!"
    )


    COMPARE[["ExtRowsComp"]] <- construct_issue(
        value = identify_extra_rows(COMP, BASE, KEYS),
        message = "There are rows in COMPARE that are not in BASE !!"
    )


    COMPARE[["ExtColsBase"]] <- construct_issue(
        value = identify_extra_cols(BASE, COMP),
        message = "There are columns in BASE that are not in COMPARE !!"
    )


    COMPARE[["ExtColsComp"]] <- construct_issue(
        value = identify_extra_cols(COMP, BASE),
        message = "There are columns in COMPARE that are not in BASE !!"
    )


    VALUE_DIFFERENCES <- identify_differences(
        BASE, COMP, KEYS, exclude_cols,
        tolerance = tolerance,
        scale = scale
    )



    ## Summarise the number of mismatching rows per variable

    if (length(VALUE_DIFFERENCES)) {
        NDIFF <- sapply(VALUE_DIFFERENCES, nrow)
        COMPARE[["NumDiff"]] <- construct_issue(
            value = convert_to_issue(NDIFF),
            message = "Not all Values Compared Equal"
        )
    }


    for (i in names(VALUE_DIFFERENCES)) {
        COMPARE[[paste0("VarDiff_", i)]] <- construct_issue(
            value = VALUE_DIFFERENCES[[i]],
            message = NULL
        )
    }


    # suppress warning message of data summary if user didn't request to check it
    # we leave the issue in the main compare object though for printing purposes
    COMPARE_WARNINGS <- COMPARE
    attr(COMPARE_WARNINGS[["DataSummary"]], "message") <- c(
        "There are differences between the class of BASE and COMPARE"
    )
    if (!check_df_class || identical(class(base), class(compare))) {
        COMPARE_WARNINGS["DataSummary"] <- NULL
    }

    # Get all issue messages, remove blank message, and collapse into single string
    ISSUE_MSGS <- sapply(COMPARE_WARNINGS, function(x) get_issue_message(x))
    ISSUE_MSGS <- Filter(function(x) !is.null(x), ISSUE_MSGS)
    ISSUE_MSGS <- Filter(function(x) x != "", ISSUE_MSGS)

    if (length(ISSUE_MSGS) != 0) {
        if (!SUPWARN) {
            ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n")
            warning(c("\n", ISSUE_MSGS))
        }
    }

    # If the classes are the same and it is the only entry in the compare
    # object then remove it in order to trigger "no issues found"
    if (identical(class(base), class(compare)) && length(COMPARE) == 1) {
        COMPARE["DataSummary"] <- NULL
    }

    # If the summary is the only item and the user didn't want to check classes
    # then remove the object to trigger the "no issues found"
    if (!check_df_class && length(COMPARE) == 1) {
        COMPARE["DataSummary"] <- NULL
    }


    if (!is.null(file)) {
        x <- print(COMPARE, as_string = TRUE)

        tryCatch(
            {
                sink(file)
                cat(x, sep = "\n")
                sink()
            },
            warning = function(w) {
                sink()
                warning(w)
            },
            error = function(e) {
                sink()
                stop(e)
            }
        )
        return(invisible(COMPARE))
    }

    return(COMPARE)
}




#' diffdf_has_issues
#'
#' Utility function which returns TRUE if an diffdf
#' object has issues or FALSE if an diffdf object does not have issues
#' @param x diffdf object
#' @examples
#'
#' # Example with no issues
#' x <- diffdf(iris, iris)
#' diffdf_has_issues(x)
#'
#' # Example with issues
#' iris2 <- iris
#' iris2[2, 2] <- NA
#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
#' diffdf_has_issues(x)
#' @export
diffdf_has_issues <- function(x) {
    if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object")
    return(length(x) != 0)
}


#' Assert that keys are valid
#'
#' Utility function to check that user provided "keys" aren't listed as a problem
#' variable of the current list of issues.
#' @param COMPARE (`list`)\cr A named list of which each element is a `data.frame` with the
#' column `VARIABLE`
#' @param KEYS (`character`)\cr name of key variables to check to make sure they don't contain
#' any issues
#' @param component (`character`)\cr name of the component within `COMPARE` to check against
#' @param msg (`character`)\cr error message to print if any of `KEYS` are found within
#' `COMPARE[component]$VARIABLE`
#' @keywords internal
assert_valid_keys <- function(COMPARE, KEYS, component, msg) {
    keys_reduced <- KEYS[KEYS %in% COMPARE[[component]]$VARIABLE]
    assertthat::assert_that(
        length(keys_reduced) == 0,
        msg = sprintf(
            "%s:\n%s",
            msg,
            paste0("`", paste0(keys_reduced, collapse = "`, `"), "`")
        )
    )
}

#' factor_to_character
#'
#' Takes a dataframe and converts any factor variables to character
#' @param dsin input dataframe
#' @param vars variables to consider for conversion. Default NULL will consider
#' every variable within the dataset
#' @keywords internal
factor_to_character <- function(dsin, vars = NULL) {
    if (is.null(vars)) vars <- names(dsin)

    for (var in vars) {
        if (is.factor(dsin[[var]])) {
            dsin[[var]] <- as.character(dsin[[var]])
        }
    }
    return(dsin)
}




#' has_unique_rows
#'
#' Check if a data sets rows are unique
#' @param DAT input data set (data frame)
#' @param KEYS Set of keys which should be unique
#' @keywords internal
has_unique_rows <- function(DAT, KEYS) {
    DUPS <- duplicated(subset(DAT, select = KEYS))
    NDUPS <- sum(DUPS)
    return(NDUPS == 0)
}

#' convert_to_issue
#'
#' converts the count value into the correct issue format
#' @param datin data inputted
#' @importFrom tibble rownames_to_column
#' @keywords internal
convert_to_issue <- function(datin) {
    datin_tibble <- tibble(
        `Variable` = names(datin),
        `No of Differences` = datin
    )

    datin_tibble_reduced <- datin_tibble[datin_tibble[["No of Differences"]] > 0, , drop = FALSE]
    return(datin_tibble_reduced)
}

#' Describe the datasets being compared
#'
#' This function is used to produce a basic summary table of the core
#' features of the two `data.frame`'s being compared.
#' @param base (`data.frame`)\cr base dataset to be described
#' @param comp (`data.frame`)\cr comparison dataset to be described
#' @param base_name (`character`)\cr name of the base dataset
#' @param comp_name (`character`)\cr name of the comparison dataset
#' @keywords internal
describe_dataframe <- function(base, comp, base_name, comp_name) {
    tibble(
        PROPERTY = list(
            "Name",
            "Class",
            "Rows(#)",
            "Columns(#)"
        ),
        BASE = c(
            base_name,
            paste(class(base), collapse = ", "),
            as.character(nrow(base)),
            as.character(ncol(base))
        ),
        COMP = c(
            comp_name,
            paste(class(comp), collapse = ", "),
            as.character(nrow(comp)),
            as.character(ncol(comp))
        )
    )
}

#' identify_extra_rows
#'
#' Identifies rows that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
#' @keywords internal
identify_extra_rows <- function(DS1, DS2, KEYS) {
    if (nrow(DS2) == 0 || nrow(DS1) == 0) {
        return(DS1[, KEYS, drop = FALSE])
    }
    DS2[["..FLAG.."]] <- "Y"
    dat <- merge(
        subset(DS1, select = KEYS),
        subset(DS2, select = c(KEYS, "..FLAG..")),
        by = KEYS, all.x = TRUE,
        sort = TRUE
    )
    dat <- dat[do.call("order", dat[KEYS]), ]

    dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
}



#' identify_extra_cols
#'
#' Identifies columns that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @importFrom tibble tibble
#' @keywords internal
identify_extra_cols <- function(DS1, DS2) {
    match.cols <- sapply(names(DS1), "%in%", names(DS2))
    assertthat::assert_that(
        all(is.logical(match.cols)),
        msg = "Assumption of logical return type is not true"
    )
    tibble(
        COLUMNS = names(DS1)[!match.cols]
    )
}







#' identify_matching_cols
#'
#' Identifies columns with the same name in two data frames
#' @param DS1 Input dataset 1 (data frame)
#' @param DS2 Input dataset 2 (data frame)
#' @param EXCLUDE Columns to ignore
#' @keywords internal
identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") {
    match_cols <- sapply(names(DS1), "%in%", names(DS2))
    exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE)
    names(DS1)[match_cols & !exclude_cols]
}





#' identify_unsupported_cols
#'
#' Identifies any columns for which the package is not setup to handle
#' @param dsin input dataset
#' @keywords internal
identify_unsupported_cols <- function(dsin) {
    dat <- subset(
        identify_properties(dsin),
        select = c("VARIABLE", "MODE")
    )

    dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE]
}



#' identify_mode_differences
#'
#' Identifies any mode differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @keywords internal
identify_mode_differences <- function(BASE, COMP) {
    matching_cols <- identify_matching_cols(BASE, COMP)

    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        suffixes = c(".BASE", ".COMP"),
        sort = TRUE
    )
    dat <- subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP"))

    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]

    dat[KEEP1 & KEEP2, , drop = FALSE]
}



#' identify_class_differences
#'
#' Identifies any class differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @keywords internal
identify_class_differences <- function(BASE, COMP) {
    matching_cols <- identify_matching_cols(BASE, COMP)

    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        sort = TRUE,
        suffixes = c(".BASE", ".COMP")
    )

    dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP"))

    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- !mapply(
        identical,
        dat[["CLASS.BASE"]],
        dat[["CLASS.COMP"]]
    )

    dat[KEEP1 & KEEP2, , drop = FALSE]
}



#' Identify differences in attributes
#'
#' Identifies any attribute differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param exclude_cols Columns to exclude from comparison
#' @importFrom tibble tibble
#' @keywords internal
identify_att_differences <- function(BASE, COMP, exclude_cols = "") {
    matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols)

    PROPS <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        sort = TRUE,
        suffixes = c(".BASE", ".COMP")
    )

    PROPS <- subset(PROPS, select = c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP"))

    PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE]


    ### Setup dummy return value
    RETURN <- tibble(
        VARIABLE = character(),
        ATTR_NAME = character(),
        VALUES.BASE = list(),
        VALUES.COMP = list()
    )

    for (i in PROPS[["VARIABLE"]]) {
        PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE]

        ### Get a vector of all available attributes across both variables
        ATTRIB_NAMES <- unique(c(
            names(PROPS_filt[["ATTRIBS.BASE"]][[1]]),
            names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
        ))

        ### If variable has no attributes move onto the next variable
        if (is.null(ATTRIB_NAMES)) next()

        ### Loop over each attribute checking if they are identical and outputing
        ### anyones that arn't
        for (j in ATTRIB_NAMES) {
            ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
            ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j]

            if (!identical(ATTRIB_BASE, ATTRIB_COMP)) {
                ATT_DIFFS <- tibble(
                    VARIABLE = i,
                    ATTR_NAME = j,
                    VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE),
                    VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP)
                )

                RETURN <- rbind(RETURN, ATT_DIFFS)
            }
        }
    }
    return(RETURN)
}





#' identify_differences
#'
#' Compares each column within 2 datasets to identify any values which they
#' mismatch on.
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
#' @param exclude_cols Columns to exclude from comparison
#' @param tolerance Level of tolerance for numeric differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
#' @keywords internal
identify_differences <- function(
    BASE,
    COMP,
    KEYS,
    exclude_cols,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
) {

    matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))

    if (length(matching_cols) == 0) {
        return(tibble())
    }

    DAT <- merge(
        x = BASE,
        y = COMP,
        by = KEYS,
        suffix = c(".x", ".y"),
        sort = TRUE
    )
    if (nrow(DAT) == 0) {
        return(tibble())
    }
    DAT <- DAT[do.call("order", DAT[KEYS]), ]

    matching_list <- mapply(
        is_variable_different,
        matching_cols,
        MoreArgs = list(
            keynames = KEYS,
            datain = DAT,
            tolerance = tolerance,
            scale = scale
        ),
        SIMPLIFY = FALSE
    )

    matching_list
}









#' identify_properties
#'
#' Returns a dataframe of metadata for a given dataset.
#' Returned values include variable names , class , mode , type & attributes
#' @param dsin input dataframe that you want to get the metadata from
#' @importFrom tibble tibble
#' @keywords internal
identify_properties <- function(dsin) {
    ### If missing or null return empty dataset
    if (is.null(dsin)) {
        x <- tibble(
            VARIABLE = character(),
            CLASS = list(),
            MODE = character(),
            TYPE = character(),
            ATTRIBS = list()
        )
        return(x)
    }

    tibble(
        VARIABLE = names(dsin),
        CLASS = lapply(dsin, class),
        MODE = sapply(dsin, mode),
        TYPE = sapply(dsin, typeof),
        ATTRIBS = lapply(dsin, attributes)
    )
}


#' Find column ordering differences
#'
#' Compares two datasets and outputs a table listing any differences in the column
#' orders between the two datasets. Columns that are not contained within both
#' are ignored however column ordering is derived prior to removing these columns.
#'
#' @param BASE (`data.frame`)\cr Base dataset for comparison
#' @param COMP (`data.frame`)\cr Comparator dataset to compare base against
#' @keywords internal
identify_column_order_differences <- function(BASE, COMP) {
    base_cols <- tibble(
        COLUMN = names(BASE),
        "BASE-INDEX" = seq_along(names(BASE))
    )
    comp_cols <- tibble(
        COLUMN = names(COMP),
        "COMPARE-INDEX" = seq_along(names(COMP))
    )
    col_index <- merge(
        base_cols,
        comp_cols,
        by = c("COLUMN"),
        all = TRUE,
        sort = FALSE
    )
    keep_rows <- col_index[["BASE-INDEX"]] != col_index[["COMPARE-INDEX"]]
    keep_rows[is.na(keep_rows)] <- FALSE
    col_index[keep_rows, , drop = FALSE]
}

#' sort_then_join
#'
#' Convenience function to sort two strings and paste them together
#' @param string1 first string
#' @param string2 second string
#' @keywords internal
sort_then_join <- function(string1, string2) {
    paste0(sort(c(string1, string2)), collapse = "")
}


#' class_merge
#'
#' Convenience function to put all classes an object has into one string
#' @param x an object
#' @keywords internal
class_merge <- function(x) {
    paste(class(x), collapse = "_")
}


get_message <- function(colname, whichdat, totype) {
    message(paste0(
        "NOTE: Variable ", colname, " in ", tolower(whichdat), " was casted to ", totype
    ))
}


#' get_casted_vector
#'
#' casts a vector depending on its type and input
#' @param colin column to cast
#' @param colname name of vector
#' @param whichdat whether base or compare is being casted (used for messages)
#' @importFrom methods is
#' @keywords internal
get_casted_vector <- function(colin, colname, whichdat) {
    if (is(colin, "factor")) {
        get_message(colname, whichdat, "character")
        return(as.character(colin))
    }

    if (is(colin, "integer")) {
        get_message(colname, whichdat, "numeric")
        return(as.numeric(colin))
    }

    colin
}



#' get_casted_dataset
#'
#' Internal utility function to loop across a dataset casting all target
#' variables
#' @param df dataset to be casted
#' @param columns columns to be casted
#' @param whichdat whether base or compare is being casted (used for messages)
#' @keywords internal
get_casted_dataset <- function(df, columns, whichdat) {
    for (col in columns) {
        df[[col]] <- get_casted_vector(df[[col]], col, whichdat)
    }
    return(df)
}






#' cast_variables
#'
#' Function to cast datasets columns if they have differing types
#' Restricted to specific cases, currently integer and double, and character and factor
#' @param BASE base dataset
#' @param COMPARE comparison dataset
#' @param ignore_vars Variables not to be considered for casting
#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles
#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters
#' @keywords internal
cast_variables <- function(
    BASE,
    COMPARE,
    ignore_vars = NULL,
    cast_integers = FALSE,
    cast_factors = FALSE
) {

    allowed_class_casts <- c("integernumeric", "characterfactor")[c(cast_integers, cast_factors)]

    BASE_class <- data.frame(
        class_BASE = sapply(BASE, class_merge),
        colname = names(BASE),
        stringsAsFactors = FALSE
    )
    BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars, , drop = FALSE]


    COMPARE_class <- data.frame(
        class_COMPARE = sapply(COMPARE, class_merge),
        colname = names(COMPARE),
        stringsAsFactors = FALSE
    )
    COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars, , drop = FALSE]

    common_class <- merge(
        x = BASE_class,
        y = COMPARE_class,
        by = "colname"
    )


    diff_class <- common_class[common_class[["class_BASE"]] != common_class[["class_COMPARE"]], , drop = FALSE]


    diff_class$classmerge <- mapply(
        sort_then_join,
        diff_class$class_COMPARE,
        diff_class$class_BASE
    )


    cast_columns <- diff_class[diff_class[["classmerge"]] %in% allowed_class_casts, , drop = FALSE]


    DATASETS <- list(
        "BASE" = BASE,
        "COMPARE" = COMPARE
    )

    if (nrow(cast_columns) == 0) {
        return(DATASETS)
    }


    for (i in names(DATASETS)) {
        DATASETS[[i]] <- get_casted_dataset(DATASETS[[i]], cast_columns$colname, i)
    }

    return(DATASETS)
}

#' Pad String
#'
#' Utility function used to replicate `str_pad`. Adds white space to either end
#' of a string to get it to equal the desired length
#' @param x string
#' @param width desired length
#' @keywords internal
string_pad <- function(x, width) {
    if (nchar(x) >= width) {
        return(x)
    }
    width <- width - nchar(x)
    left <- paste0(rep(" ", floor(width / 2)), collapse = "")
    right <- paste0(rep(" ", ceiling(width / 2)), collapse = "")
    paste0(left, x, right, collapse = "")
}


#' recursive_reduce
#'
#' Utility function used to replicated `purrr::reduce`. Recursively applies a
#' function to a list of elements until only 1 element remains
#' @param .l list of values to apply a function to
#' @param .f function to apply to each each element of the list in turn. See details.
#' @details
#' This function is essentially performing the following operation:
#' ```
#' .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]])
#' ```
#' @keywords internal
recursive_reduce <- function(.l, .f) {
    if (length(.l) != 1) {
        .l[[2]] <- .f(.l[[1]], .l[[2]])
        return(recursive_reduce(.l[-1], .f))
    } else {
        return(.l[[1]])
    }
}

#' invert
#'
#' Utility function used to replicated `purrr::transpose`. Turns a list inside
#' out.
#' @param x list
#' @keywords internal
invert <- function(x) {
    x2 <- list()
    cnames <- names(x)
    tnames <- names(x[[1]])
    for (i in tnames) {
        x2[[i]] <- list()
        for (j in cnames) {
            x2[[i]][[j]] <- x[[j]][[i]]
        }
    }
    return(x2)
}




#' as_ascii_table
#'
#' This function takes a `data.frame` and attempts to convert it into
#' a simple ascii format suitable for printing to the screen
#' It is assumed all variable values have a `as.character()` method
#' in order to cast them to character.
#' @param dat Input dataset to convert into a ascii table
#' @param line_prefix Symbols to prefix in front of every line of the table
#' @keywords internal
as_ascii_table <- function(dat, line_prefix = "  ") {
    n_col <- ncol(dat)
    n_row <- nrow(dat)

    ## Convert every value to character and crop to a suitable length
    dat_char <- lapply(dat, as_fmt_char)


    hold <- list()
    COLS <- colnames(dat)

    ### For each column extract core elements (width, values , title) and pad out
    ### each string to be a suitable length
    for (i in seq_len(n_col)) {
        COL <- COLS[i]
        VALUES <- dat_char[[i]]

        JOINT <- c(COL, VALUES)
        WIDTH <- max(sapply(JOINT, nchar)) + 2

        hold[[COL]] <- list()
        hold[[COL]]$WIDTH <- WIDTH
        hold[[COL]]$VALUES <- sapply(VALUES, string_pad, width = WIDTH)
        hold[[COL]]$HEADER <- sapply(COL, string_pad, width = WIDTH)
    }

    ### Collapse into a single value per component ( title , values, width )
    thold <- invert(hold)
    tvals <- recursive_reduce(thold$VALUES, paste0)
    thead <- recursive_reduce(thold$HEADER, paste0)
    twidth <- recursive_reduce(thold$WIDTH, sum)

    ### Create header and footer lines
    TLINE <- paste0(rep("=", twidth), collapse = "")
    LINE <- paste0(rep("-", twidth), collapse = "")
    FVALS <- paste0(line_prefix, tvals, collapse = "\n")

    ### Output table
    paste0(
        line_prefix, TLINE, "\n",
        line_prefix, thead, "\n",
        line_prefix, LINE, "\n",
        FVALS, "\n",
        line_prefix, LINE
    )
}


#' as_character
#'
#' Stub function to enable mocking in unit tests
as_character <- as.character

#' Format vector to printable string
#'
#' Coerces a vector of any type into a printable string. The most
#' significant transformation is performed on existing character
#' vectors which will be truncated, have newlines converted
#' to explicit symbols and will be wrapped in quotes if they
#' contain white space.
#'
#' @param x (`vector`) \cr vector to be converted to character
#' @param add_quotes (`logical`) \cr if true will wrap strings that contain
#' whitespace with quotes
#' @param crop_at (`numeric`) \cr specifies the limit at which strings should
#' be truncated to
#' @param ... additional arguments (not currently used)
#'
#' @name as_fmt_char
#' @keywords internal
as_fmt_char <- function(x, ...) {
    UseMethod("as_fmt_char")
}

#' @rdname as_fmt_char
#' @export
as_fmt_char.numeric <- function(x, ...) {
    format(x, digits = 7, justify = "right")
}

#' @rdname as_fmt_char
#' @export
as_fmt_char.NULL <- function(x, ...) {
    "<NULL>"
}

#' @importFrom utils capture.output
#' @rdname as_fmt_char
#' @export
as_fmt_char.list <- function(x, ...) {
    vapply(
        x,
        function(x) {
            if (is.numeric(x)) {
                return(as_fmt_char(x))
            }
            if (is.character(x) & length(x) == 1) {
                return(as_fmt_char(x))
            }
            as_fmt_char(
                paste(capture.output(dput(x)), collapse = " "),
                add_quotes = FALSE
            )
        },
        character(1)
    )
}

#' @rdname as_fmt_char
#' @export
as_fmt_char.factor <- function(x, ...) {
    as_fmt_char(as.character(x))
}

#' @rdname as_fmt_char
#' @export
as_fmt_char.character <- function(x, add_quotes = TRUE, crop_at = 30, ...) {
    needs_quotes <- grepl("\\s", x) & add_quotes

    x[is.na(x)] <- "<NA>"

    # Replace \nl \cr with tags to stop print message splitting over
    # multiple lines
    x <- gsub("\x0D", "<cr>", x)
    x <- gsub("\x0A", "<nl>", x)

    charlength <- vapply(x, nchar, numeric(1))
    x <- substr(x, 1, crop_at)
    x[charlength > crop_at] <- paste0(x[charlength > crop_at], "...")

    # Add enclosing " " around strings with white space so that it can be
    # clearly identified in the printed output
    x[needs_quotes] <- paste0('"', x[needs_quotes], '"')

    return(x)
}


#' @rdname as_fmt_char
#' @export
as_fmt_char.default <- function(x, ...) {
    x_char <- as_character(x)
    assertthat::assert_that(
        is.character(x_char),
        msg = sprintf(
            "Unable to convert class `'%s'` to character for printing purposes",
            paste(class(x), collapse = "', '")
        )
    )
    as_fmt_char.character(x_char, add_quotes = FALSE)
}


#' @rdname as_fmt_char
#' @export
as_fmt_char.POSIXt <- function(x, ...) {
    format(x, "%Y-%m-%d %H:%M:%S %Z")
}


#' get_table
#'
#' Generate nice looking table from a data frame
#' @param dsin dataset
#' @inheritParams print.diffdf
#' @keywords internal
get_table <- function(dsin, row_limit = 10) {
    if (nrow(dsin) == 0) {
        return("")
    }
    if (!is.null(row_limit)) {
        assertthat::assert_that(
            assertthat::is.number(row_limit),
            row_limit > 0,
            msg = "row_limit must be a positive integer"
        )
    }
    if (is.null(row_limit)) {
        display_table <- dsin
    } else {
        display_table <- subset(dsin, seq_len(nrow(dsin)) < (row_limit + 1))
    }

    add_message <- if (!is.null(row_limit) && nrow(dsin) > row_limit) {
        paste0(
            "First ",
            row_limit,
            " of ",
            nrow(dsin),
            " rows are shown in table below"
        )
    } else {
        NULL
    }

    msg <- paste(
        c(
            add_message,
            as_ascii_table(display_table)
        ),
        collapse = "\n"
    )
    return(msg)
}

#' is_variable_different
#'
#' This subsets the data set on the variable name, picks out differences and returns a `tibble`
#' of differences for the given variable
#' @importFrom tibble as_tibble
#' @param variablename name of variable being compared
#' @param keynames name of keys
#' @param datain Inputted dataset with base and compare vectors
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @return A boolean vector which is T if target and current are different
#' @keywords internal
is_variable_different <- function(variablename, keynames, datain, ...) {
    xvar <- paste0(variablename, ".x")
    yvar <- paste0(variablename, ".y")

    assertthat::assert_that(
        xvar %in% names(datain) && yvar %in% names(datain),
        msg = "Variable does not exist within input dataset"
    )

    target <- datain[[xvar]]
    current <- datain[[yvar]]
    outvect <- find_difference(target, current, ...)

    datain[["VARIABLE"]] <- variablename

    names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")

    x <- as_tibble(
        subset(
            datain,
            outvect,
            select = c("VARIABLE", keynames, "BASE", "COMPARE")
        )
    )

    return(x)
}

#' compare_vectors
#'
#' Compare two vectors looking for differences
#'
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @keywords internal
compare_vectors <- function(target, current, ...) {
    UseMethod("compare_vectors")
}


#' find_difference
#'
#' This determines if two vectors are different. It expects vectors of the same
#' length and type, and is intended to be used after checks have already been done
#' Initially picks out any `NA`'s (matching `NA`'s count as a match)
#' Then compares remaining vector
#'
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @keywords internal
find_difference <- function(target, current, ...) {
    if (length(target) != length(current)) {
        warning("Inputs are not of the same length")
        return(NULL)
    }

    if (is.null(target) || is.null(current)) {
        return(is.null(target) != is.null(current))
    }

    ### Initalise output, assume problem unless evidence otherwise
    return_vector <- rep(TRUE, length(target))

    nas_t <- is.na(target)
    nas_c <- is.na(current)

    ## compare missing values
    nacompare <- nas_t != nas_c
    naselect <- nas_t | nas_c
    return_vector[naselect] <- nacompare[naselect]

    ## compare non-missing values
    selectvector <- as.logical((!nas_t) * (!nas_c))

    comparevect <- compare_vectors(
        target[selectvector],
        current[selectvector],
        ...
    )

    return_vector[selectvector] <- comparevect

    return(return_vector)
}







#' compare_vectors.default
#'
#' Default method, if the vector is not numeric or factor. Basic comparison
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @keywords internal
compare_vectors.default <- function(target, current, ...) {
    target != current
}




#' compare_vectors.factor
#'
#' Compares factors. Sets them as character and then compares
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @keywords internal
compare_vectors.factor <- function(target, current, ...) {
    as.character(target) != as.character(current)
}





#' compare_vectors.numeric
#'
#' This is a modified version of the all.equal function
#' which returns a vector rather than a message
#' @param target the base vector
#' @param current a vector to compare target to
#' @param tolerance Level of tolerance for differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
#' @param ... Not used
#' @keywords internal
compare_vectors.numeric <- function(
    target,
    current,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL,
    ...
) {

    out <- target == current

    if (all(out)) {
        return(!out)
    }

    if (is.integer(target) || is.integer(current)) {
        target <- as.double(target)
        current <- as.double(current)
    }

    xy <- abs(target - current)

    if (!is.null(scale)) {
        xy <- xy / scale
    }

    return(xy > tolerance)
}

#' compare_vectors.int64
#'
#' Handle int64 vectors. Uses numeric comparison
#' @param target the base vector
#' @param current a vector to compare target to
#' @param tolerance Level of tolerance for differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
#' @param ... Not used
#' @keywords internal
compare_vectors.integer64 <- function(
    target,
    current,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL,
    ...
) {
    compare_vectors.numeric(target, current, tolerance, scale)
}

#' construct_issue
#'
#' Make an s3 object with class issue and possible additional class,
#' and assign other arguments to attributes
#' @param value the value of the object
#' @param message the value of the message attribute
#' @param add_class additional class to add
#' @keywords internal
construct_issue <- function(value, message, add_class = NULL) {
    x <- value

    ### If nothing has been provided return nothing !
    if (nrow(x) == 0) {
        return(NULL)
    }

    class(x) <- c(add_class, "issue", class(x))
    attributes(x)[["message"]] <- message
    return(x)
}


#' get_issue_message
#'
#' Simple function to grab the issue message
#' @param object inputted object of class issue
#' @param ... other arguments
#' @keywords internal
get_issue_message <- function(object, ...) {
    return(attr(object, "message"))
}


#' get_print_message
#'
#' Get the required text depending on type of issue
#' @param object inputted object of class issue
#' @param ... other arguments
#' @keywords internal
get_print_message <- function(object, ...) {
    UseMethod("get_print_message", object)
}


#' get_print_message.default
#'
#' Errors, as this should only ever be given an issue
#' @param object issue
#' @param ... Not used
#' @keywords internal
get_print_message.default <- function(object, ...) {
    stop("Error: An issue has not been provided to this function!")
}


#' get_print_message.issue
#'
#' Get text from a basic issue, based on the class of the value of the issue
#'
#' @param object an object of class issue_basic
#' @inheritParams print.diffdf
#' @keywords internal
get_print_message.issue <- function(object, row_limit, ...) {
    paste(
        c(
            attr(object, "message"),
            get_table(object, row_limit = row_limit),
            "\n\n"
        ),
        collapse = "\n"
    )
}

#' Print diffdf objects
#'
#' Print nicely formatted version of an diffdf object
#' @param x comparison object created by diffdf().
#' @param ... Additional arguments (not used)
#' @param row_limit Max row limit for difference tables (NULL to show all rows)
#' @param as_string Return printed message as an R character vector?
#' @examples
#' x <- subset(iris, -Species)
#' x[1, 2] <- 5
#' COMPARE <- diffdf(iris, x)
#' print(COMPARE)
#' print(COMPARE, row_limit = 5)
#' @export
print.diffdf <- function(x, row_limit = 10, as_string = FALSE, ...) {
    if (!is.null(row_limit)) {
        assertthat::assert_that(
            assertthat::is.number(row_limit),
            row_limit > 0,
            msg = "row_limit must be a positive integer"
        )
    }
    assertthat::assert_that(
        assertthat::is.flag(as_string)
    )
    COMPARE <- x

    if (length(COMPARE) == 0) {
        outtext <- "No issues were found!\n"
    } else {
        start_text <- paste0("Differences found between the objects!\n\n")
        end_text <- lapply(COMPARE, function(x) get_print_message(x, row_limit))
        end_text <- paste0(unlist(end_text), collapse = "")
        outtext <- paste0(start_text, end_text)
    }
    if (as_string) {
        return(strsplit(outtext, "\n")[[1]])
    } else {
        cat(outtext)
        return(invisible(COMPARE))
    }
}

#' Generate unique key name
#'
#' Function to generate a name for the keys if not provided
#'
#' @param BASE base dataset
#' @param COMP comparison dataset
#' @param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity
#'
#' @keywords internal
generate_keyname <- function(
    BASE,
    COMP,
    replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
) {
    assertthat::assert_that(
        is(replace_names, "character"),
        msg = "replace_names is not a character vector"
    )

    assertthat::assert_that(
        length(replace_names) != 0,
        msg = "All default row names are in use in BASE/COMPARE. Please provide a KEY argument"
    )


    key_name <- replace_names[1]

    if (!is.null(BASE[[key_name]]) || !is.null(COMP[[key_name]])) {
        key_name <- generate_keyname(BASE, COMP, replace_names[-1])
    }
    return(key_name)
}

1		#' Identify Issue Rows
2		#'
3		#' This function takes a `diffdf` object and a dataframe and subsets
4		#' the `data.frame` for problem rows as identified in the comparison object.
5		#' If \code{vars} has been specified only issue rows associated with those
6		#' variable(s) will be returned.
7		#' @param df dataframe to be subsetted
8		#' @param diff diffdf object
9		#' @param vars (optional) character vector containing names of issue variables to subset dataframe
10		#' on. A value of NULL (default) will be taken to mean available issue variables.
11		#' @examples
12		#' iris2 <- iris
13		#' for (i in 1:3) iris2[i, i] <- 99
14		#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
15		#' diffdf_issuerows(iris, x)
16		#' diffdf_issuerows(iris2, x)
17		#' diffdf_issuerows(iris2, x, vars = "Sepal.Length")
18		#' diffdf_issuerows(iris2, x, vars = c("Sepal.Length", "Sepal.Width"))
19		#' @details
20		#' Note that `diffdf_issuerows` can be used to subset against any dataframe. The only
21		#' requirement is that the original variables specified in the keys argument to diffdf
22		#' are present on the dataframe you are subsetting against. However please note that if
23		#' no keys were specified in diffdf then the row number is used. This means using
24		#' `diffdf_issuerows` without a keys against an arbitrary dataset can easily result in
25		#' nonsense rows being returned. It is always recommended to supply keys to diffdf.
26		#' @export
27		diffdf_issuerows <- function(df, diff, vars = NULL) {
28
29	19x	assertthat::assert_that(
30	19x	class(diff)[[1]] == "diffdf"
31		)
32
33
34	19x	KEYS_ATT <- attr(diff, "keys")
35
36	19x	assertthat::assert_that(
37	19x	!is.null(KEYS_ATT),
38	19x	msg = "diff is missing the keys attribute"
39		)
40
41
42	19x	issue_vars <- names(diff)[grep("^VarDiff_", names(diff))]
43
44	19x	if (is.null(vars)) {
45	15x	vars <- issue_vars
46		} else {
47	4x	vars <- paste0("VarDiff_", vars)
48		}
49
50	19x	if (length(issue_vars) == 0 \|\| sum(vars %in% issue_vars) == 0) {
51	7x	return(df[FALSE, ])
52		}
53
54	12x	KEEP <- mapply(
55	12x	FUN = get_issue_dataset,
56	12x	issue = vars,
57	12x	diff = list(diff),
58	12x	SIMPLIFY = FALSE
59		)
60
61	12x	KEEP <- recursive_reduce(KEEP, rbind)
62	12x	KEEP <- KEEP[!duplicated(KEEP), ]
63
64	12x	if (KEYS_ATT$is_derived) {
65	2x	df[[KEYS_ATT$value]] <- seq_len(nrow(df))
66		}
67
68	12x	keys <- KEYS_ATT$value
69
70	12x	if (any(!keys %in% names(df))) {
71	1x	stop("df does not contain all variables specified as keys in diff")
72		}
73
74	11x	RET <- merge(
75	11x	x = df,
76	11x	y = KEEP,
77	11x	sort = TRUE
78		)
79
80	11x	RET <- RET[do.call("order", RET[keys]), ]
81
82	11x	if (KEYS_ATT$is_derived) {
83	2x	keep_vars <- !names(RET) %in% KEYS_ATT$value
84	2x	RET <- RET[, keep_vars, drop = FALSE]
85		}
86
87	11x	return(RET)
88		}
89
90
91
92
93		#' get_issue_dataset
94		#'
95		#' Internal function used by `diffdf_issuerows` to extract the dataframe
96		#' from each a target issue. In particular it also strips off any
97		#' non-key variables
98		#' @param issue name of issue to extract the dataset from diff
99		#' @param diff diffdf object which contains issues
100		#' @keywords internal
101		get_issue_dataset <- function(issue, diff) {
102	20x	issue_df <- diff[[issue]]
103	20x	keep <- names(issue_df)[!(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
104	20x	issue_df[, keep, drop = FALSE]
105		}

1		#' factor_to_character
2		#'
3		#' Takes a dataframe and converts any factor variables to character
4		#' @param dsin input dataframe
5		#' @param vars variables to consider for conversion. Default NULL will consider
6		#' every variable within the dataset
7		#' @keywords internal
8		factor_to_character <- function(dsin, vars = NULL) {
9	!	if (is.null(vars)) vars <- names(dsin)
10
11	224x	for (var in vars) {
12	236x	if (is.factor(dsin[[var]])) {
13	!	dsin[[var]] <- as.character(dsin[[var]])
14		}
15		}
16	224x	return(dsin)
17		}
18
19
20
21
22		#' has_unique_rows
23		#'
24		#' Check if a data sets rows are unique
25		#' @param DAT input data set (data frame)
26		#' @param KEYS Set of keys which should be unique
27		#' @keywords internal
28		has_unique_rows <- function(DAT, KEYS) {
29	235x	DUPS <- duplicated(subset(DAT, select = KEYS))
30	235x	NDUPS <- sum(DUPS)
31	235x	return(NDUPS == 0)
32		}
33
34		#' convert_to_issue
35		#'
36		#' converts the count value into the correct issue format
37		#' @param datin data inputted
38		#' @importFrom tibble rownames_to_column
39		#' @keywords internal
40		convert_to_issue <- function(datin) {
41	106x	datin_tibble <- tibble(
42	106x	`Variable` = names(datin),
43	106x	`No of Differences` = datin
44		)
45
46	106x	datin_tibble_reduced <- datin_tibble[datin_tibble[["No of Differences"]] > 0, , drop = FALSE]
47	106x	return(datin_tibble_reduced)
48		}
49
50		#' Describe the datasets being compared
51		#'
52		#' This function is used to produce a basic summary table of the core
53		#' features of the two `data.frame`'s being compared.
54		#' @param base (`data.frame`)\cr base dataset to be described
55		#' @param comp (`data.frame`)\cr comparison dataset to be described
56		#' @param base_name (`character`)\cr name of the base dataset
57		#' @param comp_name (`character`)\cr name of the comparison dataset
58		#' @keywords internal
59		describe_dataframe <- function(base, comp, base_name, comp_name) {
60	124x	tibble(
61	124x	PROPERTY = list(
62	124x	"Name",
63	124x	"Class",
64	124x	"Rows(#)",
65	124x	"Columns(#)"
66		),
67	124x	BASE = c(
68	124x	base_name,
69	124x	paste(class(base), collapse = ", "),
70	124x	as.character(nrow(base)),
71	124x	as.character(ncol(base))
72		),
73	124x	COMP = c(
74	124x	comp_name,
75	124x	paste(class(comp), collapse = ", "),
76	124x	as.character(nrow(comp)),
77	124x	as.character(ncol(comp))
78		)
79		)
80		}

1		#' sort_then_join
2		#'
3		#' Convenience function to sort two strings and paste them together
4		#' @param string1 first string
5		#' @param string2 second string
6		#' @keywords internal
7		sort_then_join <- function(string1, string2) {
8	14x	paste0(sort(c(string1, string2)), collapse = "")
9		}
10
11
12		#' class_merge
13		#'
14		#' Convenience function to put all classes an object has into one string
15		#' @param x an object
16		#' @keywords internal
17		class_merge <- function(x) {
18	304x	paste(class(x), collapse = "_")
19		}
20
21
22		get_message <- function(colname, whichdat, totype) {
23	12x	message(paste0(
24	12x	"NOTE: Variable ", colname, " in ", tolower(whichdat), " was casted to ", totype
25		))
26		}
27
28
29		#' get_casted_vector
30		#'
31		#' casts a vector depending on its type and input
32		#' @param colin column to cast
33		#' @param colname name of vector
34		#' @param whichdat whether base or compare is being casted (used for messages)
35		#' @importFrom methods is
36		#' @keywords internal
37		get_casted_vector <- function(colin, colname, whichdat) {
38	24x	if (is(colin, "factor")) {
39	6x	get_message(colname, whichdat, "character")
40	6x	return(as.character(colin))
41		}
42
43	18x	if (is(colin, "integer")) {
44	6x	get_message(colname, whichdat, "numeric")
45	6x	return(as.numeric(colin))
46		}
47
48	12x	colin
49		}
50
51
52
53		#' get_casted_dataset
54		#'
55		#' Internal utility function to loop across a dataset casting all target
56		#' variables
57		#' @param df dataset to be casted
58		#' @param columns columns to be casted
59		#' @param whichdat whether base or compare is being casted (used for messages)
60		#' @keywords internal
61		get_casted_dataset <- function(df, columns, whichdat) {
62	22x	for (col in columns) {
63	24x	df[[col]] <- get_casted_vector(df[[col]], col, whichdat)
64		}
65	22x	return(df)
66		}
67
68
69
70
71
72
73		#' cast_variables
74		#'
75		#' Function to cast datasets columns if they have differing types
76		#' Restricted to specific cases, currently integer and double, and character and factor
77		#' @param BASE base dataset
78		#' @param COMPARE comparison dataset
79		#' @param ignore_vars Variables not to be considered for casting
80		#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles
81		#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters
82		#' @keywords internal
83		cast_variables <- function(
84		BASE,
85		COMPARE,
86		ignore_vars = NULL,
87		cast_integers = FALSE,
88		cast_factors = FALSE
89		) {
90
91	13x	allowed_class_casts <- c("integernumeric", "characterfactor")[c(cast_integers, cast_factors)]
92
93	13x	BASE_class <- data.frame(
94	13x	class_BASE = sapply(BASE, class_merge),
95	13x	colname = names(BASE),
96	13x	stringsAsFactors = FALSE
97		)
98	13x	BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars, , drop = FALSE]
99
100
101	13x	COMPARE_class <- data.frame(
102	13x	class_COMPARE = sapply(COMPARE, class_merge),
103	13x	colname = names(COMPARE),
104	13x	stringsAsFactors = FALSE
105		)
106	13x	COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars, , drop = FALSE]
107
108	13x	common_class <- merge(
109	13x	x = BASE_class,
110	13x	y = COMPARE_class,
111	13x	by = "colname"
112		)
113
114
115	13x	diff_class <- common_class[common_class[["class_BASE"]] != common_class[["class_COMPARE"]], , drop = FALSE]
116
117
118	13x	diff_class$classmerge <- mapply(
119	13x	sort_then_join,
120	13x	diff_class$class_COMPARE,
121	13x	diff_class$class_BASE
122		)
123
124
125	13x	cast_columns <- diff_class[diff_class[["classmerge"]] %in% allowed_class_casts, , drop = FALSE]
126
127
128	13x	DATASETS <- list(
129	13x	"BASE" = BASE,
130	13x	"COMPARE" = COMPARE
131		)
132
133	13x	if (nrow(cast_columns) == 0) {
134	2x	return(DATASETS)
135		}
136
137
138	11x	for (i in names(DATASETS)) {
139	22x	DATASETS[[i]] <- get_casted_dataset(DATASETS[[i]], cast_columns$colname, i)
140		}
141
142	11x	return(DATASETS)
143		}

1		#' is_variable_different
2		#'
3		#' This subsets the data set on the variable name, picks out differences and returns a `tibble`
4		#' of differences for the given variable
5		#' @importFrom tibble as_tibble
6		#' @param variablename name of variable being compared
7		#' @param keynames name of keys
8		#' @param datain Inputted dataset with base and compare vectors
9		#' @param ... Additional arguments which might be passed through (numerical accuracy)
10		#' @return A boolean vector which is T if target and current are different
11		#' @keywords internal
12		is_variable_different <- function(variablename, keynames, datain, ...) {
13	917x	xvar <- paste0(variablename, ".x")
14	917x	yvar <- paste0(variablename, ".y")
15
16	917x	assertthat::assert_that(
17	917x	xvar %in% names(datain) && yvar %in% names(datain),
18	917x	msg = "Variable does not exist within input dataset"
19		)
20
21	917x	target <- datain[[xvar]]
22	917x	current <- datain[[yvar]]
23	917x	outvect <- find_difference(target, current, ...)
24
25	917x	datain[["VARIABLE"]] <- variablename
26
27	917x	names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")
28
29	917x	x <- as_tibble(
30	917x	subset(
31	917x	datain,
32	917x	outvect,
33	917x	select = c("VARIABLE", keynames, "BASE", "COMPARE")
34		)
35		)
36
37	917x	return(x)
38		}
39
40		#' compare_vectors
41		#'
42		#' Compare two vectors looking for differences
43		#'
44		#' @param target the base vector
45		#' @param current a vector to compare target to
46		#' @param ... Additional arguments which might be passed through (numerical accuracy)
47		#' @keywords internal
48		compare_vectors <- function(target, current, ...) {
49	950x	UseMethod("compare_vectors")
50		}
51
52
53		#' find_difference
54		#'
55		#' This determines if two vectors are different. It expects vectors of the same
56		#' length and type, and is intended to be used after checks have already been done
57		#' Initially picks out any `NA`'s (matching `NA`'s count as a match)
58		#' Then compares remaining vector
59		#'
60		#' @param target the base vector
61		#' @param current a vector to compare target to
62		#' @param ... Additional arguments which might be passed through (numerical accuracy)
63		#' @keywords internal
64		find_difference <- function(target, current, ...) {
65	953x	if (length(target) != length(current)) {
66	2x	warning("Inputs are not of the same length")
67	2x	return(NULL)
68		}
69
70	951x	if (is.null(target) \|\| is.null(current)) {
71	1x	return(is.null(target) != is.null(current))
72		}
73
74		### Initalise output, assume problem unless evidence otherwise
75	950x	return_vector <- rep(TRUE, length(target))
76
77	950x	nas_t <- is.na(target)
78	950x	nas_c <- is.na(current)
79
80		## compare missing values
81	950x	nacompare <- nas_t != nas_c
82	950x	naselect <- nas_t \| nas_c
83	950x	return_vector[naselect] <- nacompare[naselect]
84
85		## compare non-missing values
86	950x	selectvector <- as.logical((!nas_t) * (!nas_c))
87
88	950x	comparevect <- compare_vectors(
89	950x	target[selectvector],
90	950x	current[selectvector],
91		...
92		)
93
94	950x	return_vector[selectvector] <- comparevect
95
96	950x	return(return_vector)
97		}
98
99
100
101
102
103
104
105		#' compare_vectors.default
106		#'
107		#' Default method, if the vector is not numeric or factor. Basic comparison
108		#' @param target the base vector
109		#' @param current a vector to compare target to
110		#' @param ... Additional arguments which might be passed through (numerical accuracy)
111		#' @keywords internal
112		compare_vectors.default <- function(target, current, ...) {
113	408x	target != current
114		}
115
116
117
118
119		#' compare_vectors.factor
120		#'
121		#' Compares factors. Sets them as character and then compares
122		#' @param target the base vector
123		#' @param current a vector to compare target to
124		#' @param ... Additional arguments which might be passed through (numerical accuracy)
125		#' @keywords internal
126		compare_vectors.factor <- function(target, current, ...) {
127	83x	as.character(target) != as.character(current)
128		}
129
130
131
132
133
134		#' compare_vectors.numeric
135		#'
136		#' This is a modified version of the all.equal function
137		#' which returns a vector rather than a message
138		#' @param target the base vector
139		#' @param current a vector to compare target to
140		#' @param tolerance Level of tolerance for differences between two variables
141		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
142		#' @param ... Not used
143		#' @keywords internal
144		compare_vectors.numeric <- function(
145		target,
146		current,
147		tolerance = sqrt(.Machine$double.eps),
148		scale = NULL,
149		...
150		) {
151
152	459x	out <- target == current
153
154	459x	if (all(out)) {
155	433x	return(!out)
156		}
157
158	26x	if (is.integer(target) \|\| is.integer(current)) {
159	7x	target <- as.double(target)
160	7x	current <- as.double(current)
161		}
162
163	26x	xy <- abs(target - current)
164
165	26x	if (!is.null(scale)) {
166	4x	xy <- xy / scale
167		}
168
169	26x	return(xy > tolerance)
170		}
171
172		#' compare_vectors.int64
173		#'
174		#' Handle int64 vectors. Uses numeric comparison
175		#' @param target the base vector
176		#' @param current a vector to compare target to
177		#' @param tolerance Level of tolerance for differences between two variables
178		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
179		#' @param ... Not used
180		#' @keywords internal
181		compare_vectors.integer64 <- function(
182		target,
183		current,
184		tolerance = sqrt(.Machine$double.eps),
185		scale = NULL,
186		...
187		) {
188	4x	compare_vectors.numeric(target, current, tolerance, scale)
189		}

1		#' construct_issue
2		#'
3		#' Make an s3 object with class issue and possible additional class,
4		#' and assign other arguments to attributes
5		#' @param value the value of the object
6		#' @param message the value of the message attribute
7		#' @param add_class additional class to add
8		#' @keywords internal
9		construct_issue <- function(value, message, add_class = NULL) {
10	2180x	x <- value
11
12		### If nothing has been provided return nothing !
13	2180x	if (nrow(x) == 0) {
14	1913x	return(NULL)
15		}
16
17	267x	class(x) <- c(add_class, "issue", class(x))
18	267x	attributes(x)[["message"]] <- message
19	267x	return(x)
20		}
21
22
23		#' get_issue_message
24		#'
25		#' Simple function to grab the issue message
26		#' @param object inputted object of class issue
27		#' @param ... other arguments
28		#' @keywords internal
29		get_issue_message <- function(object, ...) {
30	134x	return(attr(object, "message"))
31		}
32
33
34		#' get_print_message
35		#'
36		#' Get the required text depending on type of issue
37		#' @param object inputted object of class issue
38		#' @param ... other arguments
39		#' @keywords internal
40		get_print_message <- function(object, ...) {
41	37x	UseMethod("get_print_message", object)
42		}
43
44
45		#' get_print_message.default
46		#'
47		#' Errors, as this should only ever be given an issue
48		#' @param object issue
49		#' @param ... Not used
50		#' @keywords internal
51		get_print_message.default <- function(object, ...) {
52	!	stop("Error: An issue has not been provided to this function!")
53		}
54
55
56		#' get_print_message.issue
57		#'
58		#' Get text from a basic issue, based on the class of the value of the issue
59		#'
60		#' @param object an object of class issue_basic
61		#' @inheritParams print.diffdf
62		#' @keywords internal
63		get_print_message.issue <- function(object, row_limit, ...) {
64	37x	paste(
65	37x	c(
66	37x	attr(object, "message"),
67	37x	get_table(object, row_limit = row_limit),
68	37x	"\n\n"
69		),
70	37x	collapse = "\n"
71		)
72		}

1		#' diffdf
2		#' @description
3		#' Compares 2 dataframes and outputs any differences.
4		#' @param base input dataframe
5		#' @param compare comparison dataframe
6		#' @param keys vector of variables (as strings) that defines a unique row in
7		#' the base and compare dataframes
8		#' @param strict_numeric Flag for strict numeric to numeric comparisons
9		#' (default = TRUE). If False diffdf will cast integer to double where
10		#' required for comparisons. Note that variables specified in the keys
11		#' will never be casted.
12		#' @param strict_factor Flag for strict factor to character comparisons
13		#' (default = TRUE). If False diffdf will cast factors to characters where
14		#' required for comparisons. Note that variables specified in the keys will
15		#' never be casted.
16		#' @param suppress_warnings Do you want to suppress warnings? (logical)
17		#' @param file Location and name of a text file to output the results to.
18		#' Setting to NULL will cause no file to be produced.
19		#' @param tolerance Set tolerance for numeric comparisons. Note that
20		#' comparisons fail if (x-y)/scale > tolerance.
21		#' @param scale Set scale for numeric comparisons. Note that comparisons fail
22		#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient
23		#' version of scale = 1.
24		#' @param check_column_order Should the column ordering be checked? (logical)
25		#' @param check_df_class Do you want to check for differences in the class
26		#' between `base` and `compare`? (logical)
27		#' @examples
28		#' x <- subset(iris, -Species)
29		#' x[1, 2] <- 5
30		#' COMPARE <- diffdf(iris, x)
31		#' print(COMPARE)
32		#'
33		#' #### Sample data frames
34		#'
35		#' DF1 <- data.frame(
36		#' id = c(1, 2, 3, 4, 5, 6),
37		#' v1 = letters[1:6],
38		#' v2 = c(NA, NA, 1, 2, 3, NA)
39		#' )
40		#'
41		#' DF2 <- data.frame(
42		#' id = c(1, 2, 3, 4, 5, 7),
43		#' v1 = letters[1:6],
44		#' v2 = c(NA, NA, 1, 2, NA, NA),
45		#' v3 = c(NA, NA, 1, 2, NA, 4)
46		#' )
47		#'
48		#' diffdf(DF1, DF1, keys = "id")
49		#'
50		#' # We can control matching with scale/location for example:
51		#'
52		#' DF1 <- data.frame(
53		#' id = c(1, 2, 3, 4, 5, 6),
54		#' v1 = letters[1:6],
55		#' v2 = c(1, 2, 3, 4, 5, 6)
56		#' )
57		#' DF2 <- data.frame(
58		#' id = c(1, 2, 3, 4, 5, 6),
59		#' v1 = letters[1:6],
60		#' v2 = c(1.1, 2, 3, 4, 5, 6)
61		#' )
62		#'
63		#' diffdf(DF1, DF2, keys = "id")
64		#' diffdf(DF1, DF2, keys = "id", tolerance = 0.2)
65		#' diffdf(DF1, DF2, keys = "id", scale = 10, tolerance = 0.2)
66		#'
67		#' # We can use strict_factor to compare factors with characters for example:
68		#'
69		#' DF1 <- data.frame(
70		#' id = c(1, 2, 3, 4, 5, 6),
71		#' v1 = letters[1:6],
72		#' v2 = c(NA, NA, 1, 2, 3, NA),
73		#' stringsAsFactors = FALSE
74		#' )
75		#'
76		#' DF2 <- data.frame(
77		#' id = c(1, 2, 3, 4, 5, 6),
78		#' v1 = letters[1:6],
79		#' v2 = c(NA, NA, 1, 2, 3, NA)
80		#' )
81		#'
82		#' diffdf(DF1, DF2, keys = "id", strict_factor = TRUE)
83		#' diffdf(DF1, DF2, keys = "id", strict_factor = FALSE)
84		#'
85		#' @export
86		diffdf <- function(
87		base,
88		compare,
89		keys = NULL,
90		suppress_warnings = FALSE,
91		strict_numeric = TRUE,
92		strict_factor = TRUE,
93		file = NULL,
94		tolerance = sqrt(.Machine$double.eps),
95		scale = NULL,
96		check_column_order = FALSE,
97		check_df_class = FALSE
98		) {
99
100	124x	assertthat::assert_that(
101	124x	assertthat::is.flag(check_df_class),
102	124x	!is.na(check_df_class),
103	124x	msg = "`check_df_class` must be a length 1 logical"
104		)
105
106	124x	BASE <- base
107	124x	COMP <- compare
108	124x	KEYS <- keys
109	124x	SUPWARN <- suppress_warnings
110
111		### Initatiate output object
112	124x	COMPARE <- list()
113	124x	class(COMPARE) <- c("diffdf", "list")
114
115
116	124x	BASE_NAME <- deparse(substitute(base))
117	124x	COMP_NAME <- deparse(substitute(compare))
118	124x	COMPARE[["DataSummary"]] <- construct_issue(
119	124x	value = describe_dataframe(BASE, COMP, BASE_NAME, COMP_NAME),
120	124x	message = "Summary of BASE and COMPARE"
121		)
122
123
124	124x	is_derived <- FALSE
125
126		### If no key is suplied match values based upon row number
127	124x	if (is.null(KEYS)) {
128	102x	is_derived <- TRUE
129	102x	keyname <- generate_keyname(BASE, COMP)
130	102x	BASE[[keyname]] <- seq_len(nrow(BASE))
131	102x	COMP[[keyname]] <- seq_len(nrow(COMP))
132	102x	KEYS <- keyname
133		}
134	124x	attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived)
135
136	124x	assertthat::assert_that(
137	124x	is.numeric(tolerance),
138	124x	is.numeric(scale) \|\| is.null(scale)
139		)
140
141	120x	missing_keys_base <- KEYS[!KEYS %in% names(BASE)]
142	120x	assertthat::assert_that(
143	120x	length(missing_keys_base) == 0,
144	120x	msg = sprintf(
145	120x	"The following KEYS are not available in BASE:\n %s",
146	120x	paste(missing_keys_base, collapse = "\n ")
147		)
148		)
149
150	119x	missing_keys_comp <- KEYS[!KEYS %in% names(COMP)]
151	119x	assertthat::assert_that(
152	119x	length(missing_keys_comp) == 0,
153	119x	msg = sprintf(
154	119x	"The following KEYS are not available in COMPARE:\n %s",
155	119x	paste(missing_keys_comp, collapse = "\n ")
156		)
157		)
158
159	118x	assertthat::assert_that(
160	118x	has_unique_rows(BASE, KEYS),
161	118x	msg = "BY variables in BASE do not result in unique observations"
162		)
163
164	117x	assertthat::assert_that(
165	117x	has_unique_rows(COMP, KEYS),
166	117x	msg = "BY variables in COMPARE do not result in unique observations"
167		)
168
169
170
171		#### Check essential variable properties (class & mode)
172
173	117x	COMPARE[["UnsupportedColsBase"]] <- construct_issue(
174	117x	value = identify_unsupported_cols(BASE),
175	117x	message = "There are columns in BASE with unsupported modes !!"
176		)
177
178
179	117x	COMPARE[["UnsupportedColsComp"]] <- construct_issue(
180	117x	value = identify_unsupported_cols(COMP),
181	117x	message = "There are columns in COMPARE with unsupported modes !!"
182		)
183
184
185		# cast variables if strict is off
186	117x	if (!strict_factor \|\| !strict_numeric) {
187	9x	casted_df <- cast_variables(
188	9x	BASE = BASE,
189	9x	COMPARE = COMP,
190	9x	ignore_vars = KEYS,
191	9x	cast_integers = !strict_numeric,
192	9x	cast_factors = !strict_factor
193		)
194
195	9x	BASE <- casted_df$BASE
196	9x	COMP <- casted_df$COMP
197		}
198
199
200	117x	COMPARE[["VarModeDiffs"]] <- construct_issue(
201	117x	value = identify_mode_differences(BASE, COMP),
202	117x	message = "There are columns in BASE and COMPARE with different modes !!"
203		)
204
205
206	117x	COMPARE[["VarClassDiffs"]] <- construct_issue(
207	117x	value = identify_class_differences(BASE, COMP),
208	117x	message = "There are columns in BASE and COMPARE with different classes !!"
209		)
210
211
212
213
214		##### Check Validity of Keys
215
216	117x	BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
217	117x	COMP_keys <- names(COMP)[names(COMP) %in% KEYS]
218
219	117x	assertthat::assert_that(
220	117x	length(BASE_keys) == length(KEYS),
221	117x	msg = "BASE is missing variables specified in KEYS"
222		)
223
224	117x	assertthat::assert_that(
225	117x	length(COMP_keys) == length(KEYS),
226	117x	msg = "COMP is missing variables specified in KEYS"
227		)
228
229
230	117x	assert_valid_keys(
231	117x	COMPARE, KEYS, "UnsupportedColsBase",
232	117x	"The following KEYS in BASE have an unsupported mode (see `?mode()`)"
233		)
234	116x	assert_valid_keys(
235	116x	COMPARE, KEYS, "UnsupportedColsComp",
236	116x	"The following KEYS in COMPARE have an unsupported mode (see `?mode()`)"
237		)
238	115x	assert_valid_keys(
239	115x	COMPARE, KEYS, "VarModeDiffs",
240	115x	"The following KEYS have different modes between BASE and COMPARE"
241		)
242	113x	assert_valid_keys(
243	113x	COMPARE, KEYS, "VarClassDiffs",
244	113x	"The following KEYS have different classes between BASE and COMPARE"
245		)
246
247
248	112x	exclude_cols <- c(
249	112x	COMPARE[["UnsupportedColsBase"]]$VARIABLE,
250	112x	COMPARE[["UnsupportedColsComp"]]$VARIABLE,
251	112x	COMPARE[["VarClassDiffs"]]$VARIABLE,
252	112x	COMPARE[["VarModeDiffs"]]$VARIABLE
253		)
254
255	112x	if (check_column_order) {
256	5x	if (attr(COMPARE, "keys")$is_derived) {
257	4x	keep_vars_base <- !(names(BASE) %in% attr(COMPARE, "keys")$value)
258	4x	keep_vars_comp <- !(names(COMP) %in% attr(COMPARE, "keys")$value)
259		} else {
260	1x	keep_vars_base <- TRUE
261	1x	keep_vars_comp <- TRUE
262		}
263	5x	COMPARE[["ColumnOrder"]] <- construct_issue(
264	5x	value = identify_column_order_differences(
265	5x	BASE[, keep_vars_base, drop = FALSE],
266	5x	COMP[, keep_vars_comp, drop = FALSE]
267		),
268	5x	message = "There are differences in the column ordering between BASE and COMPARE !!"
269		)
270		}
271
272
273		##### Check Attributes
274	112x	COMPARE[["AttribDiffs"]] <- construct_issue(
275	112x	value = identify_att_differences(BASE, COMP, exclude_cols),
276	112x	message = "There are columns in BASE and COMPARE with differing attributes !!"
277		)
278
279
280		##### Check data
281
282	112x	BASE <- factor_to_character(BASE, KEYS)
283	112x	COMP <- factor_to_character(COMP, KEYS)
284
285
286	112x	COMPARE[["ExtRowsBase"]] <- construct_issue(
287	112x	value = identify_extra_rows(BASE, COMP, KEYS),
288	112x	message = "There are rows in BASE that are not in COMPARE !!"
289		)
290
291
292	112x	COMPARE[["ExtRowsComp"]] <- construct_issue(
293	112x	value = identify_extra_rows(COMP, BASE, KEYS),
294	112x	message = "There are rows in COMPARE that are not in BASE !!"
295		)
296
297
298	112x	COMPARE[["ExtColsBase"]] <- construct_issue(
299	112x	value = identify_extra_cols(BASE, COMP),
300	112x	message = "There are columns in BASE that are not in COMPARE !!"
301		)
302
303
304	112x	COMPARE[["ExtColsComp"]] <- construct_issue(
305	112x	value = identify_extra_cols(COMP, BASE),
306	112x	message = "There are columns in COMPARE that are not in BASE !!"
307		)
308
309
310	112x	VALUE_DIFFERENCES <- identify_differences(
311	112x	BASE, COMP, KEYS, exclude_cols,
312	112x	tolerance = tolerance,
313	112x	scale = scale
314		)
315
316
317
318		## Summarise the number of mismatching rows per variable
319
320	112x	if (length(VALUE_DIFFERENCES)) {
321	106x	NDIFF <- sapply(VALUE_DIFFERENCES, nrow)
322	106x	COMPARE[["NumDiff"]] <- construct_issue(
323	106x	value = convert_to_issue(NDIFF),
324	106x	message = "Not all Values Compared Equal"
325		)
326		}
327
328
329	112x	for (i in names(VALUE_DIFFERENCES)) {
330	917x	COMPARE[[paste0("VarDiff_", i)]] <- construct_issue(
331	917x	value = VALUE_DIFFERENCES[[i]],
332	917x	message = NULL
333		)
334		}
335
336
337		# suppress warning message of data summary if user didn't request to check it
338		# we leave the issue in the main compare object though for printing purposes
339	112x	COMPARE_WARNINGS <- COMPARE
340	112x	attr(COMPARE_WARNINGS[["DataSummary"]], "message") <- c(
341	112x	"There are differences between the class of BASE and COMPARE"
342		)
343	112x	if (!check_df_class \|\| identical(class(base), class(compare))) {
344	110x	COMPARE_WARNINGS["DataSummary"] <- NULL
345		}
346
347		# Get all issue messages, remove blank message, and collapse into single string
348	112x	ISSUE_MSGS <- sapply(COMPARE_WARNINGS, function(x) get_issue_message(x))
349	112x	ISSUE_MSGS <- Filter(function(x) !is.null(x), ISSUE_MSGS)
350	112x	ISSUE_MSGS <- Filter(function(x) x != "", ISSUE_MSGS)
351
352	112x	if (length(ISSUE_MSGS) != 0) {
353	73x	if (!SUPWARN) {
354	45x	ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n")
355	45x	warning(c("\n", ISSUE_MSGS))
356		}
357		}
358
359		# If the classes are the same and it is the only entry in the compare
360		# object then remove it in order to trigger "no issues found"
361	112x	if (identical(class(base), class(compare)) && length(COMPARE) == 1) {
362	38x	COMPARE["DataSummary"] <- NULL
363		}
364
365		# If the summary is the only item and the user didn't want to check classes
366		# then remove the object to trigger the "no issues found"
367	112x	if (!check_df_class && length(COMPARE) == 1) {
368	1x	COMPARE["DataSummary"] <- NULL
369		}
370
371
372	112x	if (!is.null(file)) {
373	!	x <- print(COMPARE, as_string = TRUE)
374
375	!	tryCatch(
376		{
377	!	sink(file)
378	!	cat(x, sep = "\n")
379	!	sink()
380		},
381	!	warning = function(w) {
382	!	sink()
383	!	warning(w)
384		},
385	!	error = function(e) {
386	!	sink()
387	!	stop(e)
388		}
389		)
390	!	return(invisible(COMPARE))
391		}
392
393	112x	return(COMPARE)
394		}
395
396
397
398
399		#' diffdf_has_issues
400		#'
401		#' Utility function which returns TRUE if an diffdf
402		#' object has issues or FALSE if an diffdf object does not have issues
403		#' @param x diffdf object
404		#' @examples
405		#'
406		#' # Example with no issues
407		#' x <- diffdf(iris, iris)
408		#' diffdf_has_issues(x)
409		#'
410		#' # Example with issues
411		#' iris2 <- iris
412		#' iris2[2, 2] <- NA
413		#' x <- diffdf(iris, iris2, suppress_warnings = TRUE)
414		#' diffdf_has_issues(x)
415		#' @export
416		diffdf_has_issues <- function(x) {
417	!	if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object")
418	3x	return(length(x) != 0)
419		}
420
421
422		#' Assert that keys are valid
423		#'
424		#' Utility function to check that user provided "keys" aren't listed as a problem
425		#' variable of the current list of issues.
426		#' @param COMPARE (`list`)\cr A named list of which each element is a `data.frame` with the
427		#' column `VARIABLE`
428		#' @param KEYS (`character`)\cr name of key variables to check to make sure they don't contain
429		#' any issues
430		#' @param component (`character`)\cr name of the component within `COMPARE` to check against
431		#' @param msg (`character`)\cr error message to print if any of `KEYS` are found within
432		#' `COMPARE[component]$VARIABLE`
433		#' @keywords internal
434		assert_valid_keys <- function(COMPARE, KEYS, component, msg) {
435	461x	keys_reduced <- KEYS[KEYS %in% COMPARE[[component]]$VARIABLE]
436	461x	assertthat::assert_that(
437	461x	length(keys_reduced) == 0,
438	461x	msg = sprintf(
439	461x	"%s:\n%s",
440	461x	msg,
441	461x	paste0("`", paste0(keys_reduced, collapse = "`, `"), "`")
442		)
443		)
444		}

1		#' identify_extra_rows
2		#'
3		#' Identifies rows that are in a baseline dataset but not in a comparator dataset
4		#' @param DS1 Baseline dataset (data frame)
5		#' @param DS2 Comparator dataset (data frame)
6		#' @param KEYS List of variables that define a unique row within the datasets (strings)
7		#' @keywords internal
8		identify_extra_rows <- function(DS1, DS2, KEYS) {
9	224x	if (nrow(DS2) == 0 \|\| nrow(DS1) == 0) {
10	10x	return(DS1[, KEYS, drop = FALSE])
11		}
12	214x	DS2[["..FLAG.."]] <- "Y"
13	214x	dat <- merge(
14	214x	subset(DS1, select = KEYS),
15	214x	subset(DS2, select = c(KEYS, "..FLAG..")),
16	214x	by = KEYS, all.x = TRUE,
17	214x	sort = TRUE
18		)
19	214x	dat <- dat[do.call("order", dat[KEYS]), ]
20
21	214x	dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
22		}
23
24
25
26		#' identify_extra_cols
27		#'
28		#' Identifies columns that are in a baseline dataset but not in a comparator dataset
29		#' @param DS1 Baseline dataset (data frame)
30		#' @param DS2 Comparator dataset (data frame)
31		#' @importFrom tibble tibble
32		#' @keywords internal
33		identify_extra_cols <- function(DS1, DS2) {
34	224x	match.cols <- sapply(names(DS1), "%in%", names(DS2))
35	224x	assertthat::assert_that(
36	224x	all(is.logical(match.cols)),
37	224x	msg = "Assumption of logical return type is not true"
38		)
39	224x	tibble(
40	224x	COLUMNS = names(DS1)[!match.cols]
41		)
42		}
43
44
45
46
47
48
49
50		#' identify_matching_cols
51		#'
52		#' Identifies columns with the same name in two data frames
53		#' @param DS1 Input dataset 1 (data frame)
54		#' @param DS2 Input dataset 2 (data frame)
55		#' @param EXCLUDE Columns to ignore
56		#' @keywords internal
57		identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") {
58	768x	match_cols <- sapply(names(DS1), "%in%", names(DS2))
59	768x	exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE)
60	768x	names(DS1)[match_cols & !exclude_cols]
61		}
62
63
64
65
66
67		#' identify_unsupported_cols
68		#'
69		#' Identifies any columns for which the package is not setup to handle
70		#' @param dsin input dataset
71		#' @keywords internal
72		identify_unsupported_cols <- function(dsin) {
73	234x	dat <- subset(
74	234x	identify_properties(dsin),
75	234x	select = c("VARIABLE", "MODE")
76		)
77
78	234x	dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE]
79		}
80
81
82
83		#' identify_mode_differences
84		#'
85		#' Identifies any mode differences between two data frames
86		#' @param BASE Base dataset for comparison (data.frame)
87		#' @param COMP Comparator dataset to compare base against (data.frame)
88		#' @keywords internal
89		identify_mode_differences <- function(BASE, COMP) {
90	417x	matching_cols <- identify_matching_cols(BASE, COMP)
91
92	417x	dat <- merge(
93	417x	x = identify_properties(BASE),
94	417x	y = identify_properties(COMP),
95	417x	by = "VARIABLE",
96	417x	all = TRUE,
97	417x	suffixes = c(".BASE", ".COMP"),
98	417x	sort = TRUE
99		)
100	417x	dat <- subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP"))
101
102	417x	KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
103	417x	KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]
104
105	417x	dat[KEEP1 & KEEP2, , drop = FALSE]
106		}
107
108
109
110		#' identify_class_differences
111		#'
112		#' Identifies any class differences between two data frames
113		#' @param BASE Base dataset for comparison (data.frame)
114		#' @param COMP Comparator dataset to compare base against (data.frame)
115		#' @keywords internal
116		identify_class_differences <- function(BASE, COMP) {
117	117x	matching_cols <- identify_matching_cols(BASE, COMP)
118
119	117x	dat <- merge(
120	117x	x = identify_properties(BASE),
121	117x	y = identify_properties(COMP),
122	117x	by = "VARIABLE",
123	117x	all = TRUE,
124	117x	sort = TRUE,
125	117x	suffixes = c(".BASE", ".COMP")
126		)
127
128	117x	dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP"))
129
130	117x	KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
131	117x	KEEP2 <- !mapply(
132	117x	identical,
133	117x	dat[["CLASS.BASE"]],
134	117x	dat[["CLASS.COMP"]]
135		)
136
137	117x	dat[KEEP1 & KEEP2, , drop = FALSE]
138		}
139
140
141
142		#' Identify differences in attributes
143		#'
144		#' Identifies any attribute differences between two data frames
145		#' @param BASE Base dataset for comparison (data.frame)
146		#' @param COMP Comparator dataset to compare base against (data.frame)
147		#' @param exclude_cols Columns to exclude from comparison
148		#' @importFrom tibble tibble
149		#' @keywords internal
150		identify_att_differences <- function(BASE, COMP, exclude_cols = "") {
151	122x	matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols)
152
153	122x	PROPS <- merge(
154	122x	x = identify_properties(BASE),
155	122x	y = identify_properties(COMP),
156	122x	by = "VARIABLE",
157	122x	all = TRUE,
158	122x	sort = TRUE,
159	122x	suffixes = c(".BASE", ".COMP")
160		)
161
162	122x	PROPS <- subset(PROPS, select = c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP"))
163
164	122x	PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE]
165
166
167		### Setup dummy return value
168	122x	RETURN <- tibble(
169	122x	VARIABLE = character(),
170	122x	ATTR_NAME = character(),
171	122x	VALUES.BASE = list(),
172	122x	VALUES.COMP = list()
173		)
174
175	122x	for (i in PROPS[["VARIABLE"]]) {
176	1157x	PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE]
177
178		### Get a vector of all available attributes across both variables
179	1157x	ATTRIB_NAMES <- unique(c(
180	1157x	names(PROPS_filt[["ATTRIBS.BASE"]][[1]]),
181	1157x	names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
182		))
183
184		### If variable has no attributes move onto the next variable
185	850x	if (is.null(ATTRIB_NAMES)) next()
186
187		### Loop over each attribute checking if they are identical and outputing
188		### anyones that arn't
189	307x	for (j in ATTRIB_NAMES) {
190	528x	ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
191	528x	ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j]
192
193	528x	if (!identical(ATTRIB_BASE, ATTRIB_COMP)) {
194	64x	ATT_DIFFS <- tibble(
195	64x	VARIABLE = i,
196	64x	ATTR_NAME = j,
197	64x	VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE),
198	64x	VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP)
199		)
200
201	64x	RETURN <- rbind(RETURN, ATT_DIFFS)
202		}
203		}
204		}
205	122x	return(RETURN)
206		}
207
208
209
210
211
212		#' identify_differences
213		#'
214		#' Compares each column within 2 datasets to identify any values which they
215		#' mismatch on.
216		#' @param BASE Base dataset for comparison (data.frame)
217		#' @param COMP Comparator dataset to compare base against (data.frame)
218		#' @param KEYS List of variables that define a unique row within the datasets (strings)
219		#' @param exclude_cols Columns to exclude from comparison
220		#' @param tolerance Level of tolerance for numeric differences between two variables
221		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
222		#' @keywords internal
223		identify_differences <- function(
224		BASE,
225		COMP,
226		KEYS,
227		exclude_cols,
228		tolerance = sqrt(.Machine$double.eps),
229		scale = NULL
230		) {
231
232	112x	matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))
233
234	112x	if (length(matching_cols) == 0) {
235	2x	return(tibble())
236		}
237
238	110x	DAT <- merge(
239	110x	x = BASE,
240	110x	y = COMP,
241	110x	by = KEYS,
242	110x	suffix = c(".x", ".y"),
243	110x	sort = TRUE
244		)
245	110x	if (nrow(DAT) == 0) {
246	4x	return(tibble())
247		}
248	106x	DAT <- DAT[do.call("order", DAT[KEYS]), ]
249
250	106x	matching_list <- mapply(
251	106x	is_variable_different,
252	106x	matching_cols,
253	106x	MoreArgs = list(
254	106x	keynames = KEYS,
255	106x	datain = DAT,
256	106x	tolerance = tolerance,
257	106x	scale = scale
258		),
259	106x	SIMPLIFY = FALSE
260		)
261
262	106x	matching_list
263		}
264
265
266
267
268
269
270
271
272
273		#' identify_properties
274		#'
275		#' Returns a dataframe of metadata for a given dataset.
276		#' Returned values include variable names , class , mode , type & attributes
277		#' @param dsin input dataframe that you want to get the metadata from
278		#' @importFrom tibble tibble
279		#' @keywords internal
280		identify_properties <- function(dsin) {
281		### If missing or null return empty dataset
282	1546x	if (is.null(dsin)) {
283	!	x <- tibble(
284	!	VARIABLE = character(),
285	!	CLASS = list(),
286	!	MODE = character(),
287	!	TYPE = character(),
288	!	ATTRIBS = list()
289		)
290	!	return(x)
291		}
292
293	1546x	tibble(
294	1546x	VARIABLE = names(dsin),
295	1546x	CLASS = lapply(dsin, class),
296	1546x	MODE = sapply(dsin, mode),
297	1546x	TYPE = sapply(dsin, typeof),
298	1546x	ATTRIBS = lapply(dsin, attributes)
299		)
300		}
301
302
303		#' Find column ordering differences
304		#'
305		#' Compares two datasets and outputs a table listing any differences in the column
306		#' orders between the two datasets. Columns that are not contained within both
307		#' are ignored however column ordering is derived prior to removing these columns.
308		#'
309		#' @param BASE (`data.frame`)\cr Base dataset for comparison
310		#' @param COMP (`data.frame`)\cr Comparator dataset to compare base against
311		#' @keywords internal
312		identify_column_order_differences <- function(BASE, COMP) {
313	8x	base_cols <- tibble(
314	8x	COLUMN = names(BASE),
315	8x	"BASE-INDEX" = seq_along(names(BASE))
316		)
317	8x	comp_cols <- tibble(
318	8x	COLUMN = names(COMP),
319	8x	"COMPARE-INDEX" = seq_along(names(COMP))
320		)
321	8x	col_index <- merge(
322	8x	base_cols,
323	8x	comp_cols,
324	8x	by = c("COLUMN"),
325	8x	all = TRUE,
326	8x	sort = FALSE
327		)
328	8x	keep_rows <- col_index[["BASE-INDEX"]] != col_index[["COMPARE-INDEX"]]
329	8x	keep_rows[is.na(keep_rows)] <- FALSE
330	8x	col_index[keep_rows, , drop = FALSE]
331		}

1		#' Pad String
2		#'
3		#' Utility function used to replicate `str_pad`. Adds white space to either end
4		#' of a string to get it to equal the desired length
5		#' @param x string
6		#' @param width desired length
7		#' @keywords internal
8		string_pad <- function(x, width) {
9	1027x	if (nchar(x) >= width) {
10	!	return(x)
11		}
12	1027x	width <- width - nchar(x)
13	1027x	left <- paste0(rep(" ", floor(width / 2)), collapse = "")
14	1027x	right <- paste0(rep(" ", ceiling(width / 2)), collapse = "")
15	1027x	paste0(left, x, right, collapse = "")
16		}
17
18
19		#' recursive_reduce
20		#'
21		#' Utility function used to replicated `purrr::reduce`. Recursively applies a
22		#' function to a list of elements until only 1 element remains
23		#' @param .l list of values to apply a function to
24		#' @param .f function to apply to each each element of the list in turn. See details.
25		#' @details
26		#' This function is essentially performing the following operation:
27		#' ```
28		#' .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]])
29		#' ```
30		#' @keywords internal
31		recursive_reduce <- function(.l, .f) {
32	365x	if (length(.l) != 1) {
33	233x	.l[[2]] <- .f(.l[[1]], .l[[2]])
34	233x	return(recursive_reduce(.l[-1], .f))
35		} else {
36	132x	return(.l[[1]])
37		}
38		}
39
40		#' invert
41		#'
42		#' Utility function used to replicated `purrr::transpose`. Turns a list inside
43		#' out.
44		#' @param x list
45		#' @keywords internal
46		invert <- function(x) {
47	40x	x2 <- list()
48	40x	cnames <- names(x)
49	40x	tnames <- names(x[[1]])
50	40x	for (i in tnames) {
51	120x	x2[[i]] <- list()
52	120x	for (j in cnames) {
53	345x	x2[[i]][[j]] <- x[[j]][[i]]
54		}
55		}
56	40x	return(x2)
57		}
58
59
60
61
62		#' as_ascii_table
63		#'
64		#' This function takes a `data.frame` and attempts to convert it into
65		#' a simple ascii format suitable for printing to the screen
66		#' It is assumed all variable values have a `as.character()` method
67		#' in order to cast them to character.
68		#' @param dat Input dataset to convert into a ascii table
69		#' @param line_prefix Symbols to prefix in front of every line of the table
70		#' @keywords internal
71		as_ascii_table <- function(dat, line_prefix = " ") {
72	40x	n_col <- ncol(dat)
73	40x	n_row <- nrow(dat)
74
75		## Convert every value to character and crop to a suitable length
76	40x	dat_char <- lapply(dat, as_fmt_char)
77
78
79	40x	hold <- list()
80	40x	COLS <- colnames(dat)
81
82		### For each column extract core elements (width, values , title) and pad out
83		### each string to be a suitable length
84	40x	for (i in seq_len(n_col)) {
85	115x	COL <- COLS[i]
86	115x	VALUES <- dat_char[[i]]
87
88	115x	JOINT <- c(COL, VALUES)
89	115x	WIDTH <- max(sapply(JOINT, nchar)) + 2
90
91	115x	hold[[COL]] <- list()
92	115x	hold[[COL]]$WIDTH <- WIDTH
93	115x	hold[[COL]]$VALUES <- sapply(VALUES, string_pad, width = WIDTH)
94	115x	hold[[COL]]$HEADER <- sapply(COL, string_pad, width = WIDTH)
95		}
96
97		### Collapse into a single value per component ( title , values, width )
98	40x	thold <- invert(hold)
99	40x	tvals <- recursive_reduce(thold$VALUES, paste0)
100	40x	thead <- recursive_reduce(thold$HEADER, paste0)
101	40x	twidth <- recursive_reduce(thold$WIDTH, sum)
102
103		### Create header and footer lines
104	40x	TLINE <- paste0(rep("=", twidth), collapse = "")
105	40x	LINE <- paste0(rep("-", twidth), collapse = "")
106	40x	FVALS <- paste0(line_prefix, tvals, collapse = "\n")
107
108		### Output table
109	40x	paste0(
110	40x	line_prefix, TLINE, "\n",
111	40x	line_prefix, thead, "\n",
112	40x	line_prefix, LINE, "\n",
113	40x	FVALS, "\n",
114	40x	line_prefix, LINE
115		)
116		}
117
118
119		#' as_character
120		#'
121		#' Stub function to enable mocking in unit tests
122		as_character <- as.character
123
124		#' Format vector to printable string
125		#'
126		#' Coerces a vector of any type into a printable string. The most
127		#' significant transformation is performed on existing character
128		#' vectors which will be truncated, have newlines converted
129		#' to explicit symbols and will be wrapped in quotes if they
130		#' contain white space.
131		#'
132		#' @param x (`vector`) \cr vector to be converted to character
133		#' @param add_quotes (`logical`) \cr if true will wrap strings that contain
134		#' whitespace with quotes
135		#' @param crop_at (`numeric`) \cr specifies the limit at which strings should
136		#' be truncated to
137		#' @param ... additional arguments (not currently used)
138		#'
139		#' @name as_fmt_char
140		#' @keywords internal
141		as_fmt_char <- function(x, ...) {
142	180x	UseMethod("as_fmt_char")
143		}
144
145		#' @rdname as_fmt_char
146		#' @export
147		as_fmt_char.numeric <- function(x, ...) {
148	34x	format(x, digits = 7, justify = "right")
149		}
150
151		#' @rdname as_fmt_char
152		#' @export
153		as_fmt_char.NULL <- function(x, ...) {
154	!	"<NULL>"
155		}
156
157		#' @importFrom utils capture.output
158		#' @rdname as_fmt_char
159		#' @export
160		as_fmt_char.list <- function(x, ...) {
161	15x	vapply(
162	15x	x,
163	15x	function(x) {
164	54x	if (is.numeric(x)) {
165	!	return(as_fmt_char(x))
166		}
167	54x	if (is.character(x) & length(x) == 1) {
168	54x	return(as_fmt_char(x))
169		}
170	!	as_fmt_char(
171	!	paste(capture.output(dput(x)), collapse = " "),
172	!	add_quotes = FALSE
173		)
174		},
175	15x	character(1)
176		)
177		}
178
179		#' @rdname as_fmt_char
180		#' @export
181		as_fmt_char.factor <- function(x, ...) {
182	3x	as_fmt_char(as.character(x))
183		}
184
185		#' @rdname as_fmt_char
186		#' @export
187		as_fmt_char.character <- function(x, add_quotes = TRUE, crop_at = 30, ...) {
188	123x	needs_quotes <- grepl("\\s", x) & add_quotes
189
190	123x	x[is.na(x)] <- "<NA>"
191
192		# Replace \nl \cr with tags to stop print message splitting over
193		# multiple lines
194	123x	x <- gsub("\x0D", "<cr>", x)
195	123x	x <- gsub("\x0A", "<nl>", x)
196
197	123x	charlength <- vapply(x, nchar, numeric(1))
198	123x	x <- substr(x, 1, crop_at)
199	123x	x[charlength > crop_at] <- paste0(x[charlength > crop_at], "...")
200
201		# Add enclosing " " around strings with white space so that it can be
202		# clearly identified in the printed output
203	123x	x[needs_quotes] <- paste0('"', x[needs_quotes], '"')
204
205	123x	return(x)
206		}
207
208
209		#' @rdname as_fmt_char
210		#' @export
211		as_fmt_char.default <- function(x, ...) {
212	6x	x_char <- as_character(x)
213	6x	assertthat::assert_that(
214	6x	is.character(x_char),
215	6x	msg = sprintf(
216	6x	"Unable to convert class `'%s'` to character for printing purposes",
217	6x	paste(class(x), collapse = "', '")
218		)
219		)
220	5x	as_fmt_char.character(x_char, add_quotes = FALSE)
221		}
222
223
224		#' @rdname as_fmt_char
225		#' @export
226		as_fmt_char.POSIXt <- function(x, ...) {
227	4x	format(x, "%Y-%m-%d %H:%M:%S %Z")
228		}
229
230
231		#' get_table
232		#'
233		#' Generate nice looking table from a data frame
234		#' @param dsin dataset
235		#' @inheritParams print.diffdf
236		#' @keywords internal
237		get_table <- function(dsin, row_limit = 10) {
238	37x	if (nrow(dsin) == 0) {
239	!	return("")
240		}
241	37x	if (!is.null(row_limit)) {
242	34x	assertthat::assert_that(
243	34x	assertthat::is.number(row_limit),
244	34x	row_limit > 0,
245	34x	msg = "row_limit must be a positive integer"
246		)
247		}
248	37x	if (is.null(row_limit)) {
249	3x	display_table <- dsin
250		} else {
251	34x	display_table <- subset(dsin, seq_len(nrow(dsin)) < (row_limit + 1))
252		}
253
254	37x	add_message <- if (!is.null(row_limit) && nrow(dsin) > row_limit) {
255	5x	paste0(
256	5x	"First ",
257	5x	row_limit,
258	5x	" of ",
259	5x	nrow(dsin),
260	5x	" rows are shown in table below"
261		)
262		} else {
263	32x	NULL
264		}
265
266	37x	msg <- paste(
267	37x	c(
268	37x	add_message,
269	37x	as_ascii_table(display_table)
270		),
271	37x	collapse = "\n"
272		)
273	37x	return(msg)
274		}

1		#' Print diffdf objects
2		#'
3		#' Print nicely formatted version of an diffdf object
4		#' @param x comparison object created by diffdf().
5		#' @param ... Additional arguments (not used)
6		#' @param row_limit Max row limit for difference tables (NULL to show all rows)
7		#' @param as_string Return printed message as an R character vector?
8		#' @examples
9		#' x <- subset(iris, -Species)
10		#' x[1, 2] <- 5
11		#' COMPARE <- diffdf(iris, x)
12		#' print(COMPARE)
13		#' print(COMPARE, row_limit = 5)
14		#' @export
15		print.diffdf <- function(x, row_limit = 10, as_string = FALSE, ...) {
16	25x	if (!is.null(row_limit)) {
17	24x	assertthat::assert_that(
18	24x	assertthat::is.number(row_limit),
19	24x	row_limit > 0,
20	24x	msg = "row_limit must be a positive integer"
21		)
22		}
23	21x	assertthat::assert_that(
24	21x	assertthat::is.flag(as_string)
25		)
26	19x	COMPARE <- x
27
28	19x	if (length(COMPARE) == 0) {
29	6x	outtext <- "No issues were found!\n"
30		} else {
31	13x	start_text <- paste0("Differences found between the objects!\n\n")
32	13x	end_text <- lapply(COMPARE, function(x) get_print_message(x, row_limit))
33	13x	end_text <- paste0(unlist(end_text), collapse = "")
34	13x	outtext <- paste0(start_text, end_text)
35		}
36	19x	if (as_string) {
37	7x	return(strsplit(outtext, "\n")[[1]])
38		} else {
39	12x	cat(outtext)
40	12x	return(invisible(COMPARE))
41		}
42		}

1		#' Generate unique key name
2		#'
3		#' Function to generate a name for the keys if not provided
4		#'
5		#' @param BASE base dataset
6		#' @param COMP comparison dataset
7		#' @param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity
8		#'
9		#' @keywords internal
10		generate_keyname <- function(
11		BASE,
12		COMP,
13		replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
14		) {
15	193x	assertthat::assert_that(
16	193x	is(replace_names, "character"),
17	193x	msg = "replace_names is not a character vector"
18		)
19
20	193x	assertthat::assert_that(
21	193x	length(replace_names) != 0,
22	193x	msg = "All default row names are in use in BASE/COMPARE. Please provide a KEY argument"
23		)
24
25
26	185x	key_name <- replace_names[1]
27
28	185x	if (!is.null(BASE[[key_name]]) \|\| !is.null(COMP[[key_name]])) {
29	66x	key_name <- generate_keyname(BASE, COMP, replace_names[-1])
30		}
31	153x	return(key_name)
32		}