diffdf coverage - 91.35%

Files
Source

#' diffdf
#' @description
#' Compares 2 dataframes and outputs any differences.
#' @param base input dataframe
#' @param compare comparison dataframe
#' @param keys vector of variables (as strings) that defines a unique row in
#' the base and compare dataframes
#' @param strict_numeric Flag for strict numeric to numeric comparisons
#' (default = TRUE). If False diffdf will cast integer to double where
#' required for comparisons. Note that variables specified in the keys
#' will never be casted.
#' @param strict_factor Flag for strict factor to character comparisons
#' (default = TRUE). If False diffdf will cast factors to characters where
#' required for comparisons. Note that variables specified in the keys will
#' never be casted.
#' @param suppress_warnings Do you want to suppress warnings? (logical)
#' @param file Location and name of a text file to output the results to.
#' Setting to NULL will cause no file to be produced.
#' @param tolerance Set tolerance for numeric comparisons. Note that
#' comparisons fail if (x-y)/scale > tolerance.
#' @param scale Set scale for numeric comparisons. Note that comparisons fail
#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient
#' version of scale = 1.
#' @examples
#' x <- subset( iris,  -Species)
#' x[1,2] <- 5
#' COMPARE <- diffdf( iris, x)
#' print( COMPARE )
#'
#' #### Sample data frames
#'
#' DF1 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(NA , NA , 1 , 2 , 3 , NA)
#' )
#'
#' DF2 <- data.frame(
#'     id = c(1,2,3,4,5,7),
#'     v1 = letters[1:6],
#'     v2 = c(NA , NA , 1 , 2 , NA , NA),
#'     v3 = c(NA , NA , 1 , 2 , NA , 4)
#' )
#'
#' diffdf(DF1 , DF1 , keys = "id")
#'
#' # We can control matching with scale/location for example:
#'
#' DF1 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(1,2,3,4,5,6)
#' )
#' DF2 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(1.1,2,3,4,5,6)
#' )
#'
#' diffdf(DF1 , DF2 , keys = "id")
#' diffdf(DF1 , DF2 , keys = "id", tolerance = 0.2)
#' diffdf(DF1 , DF2 , keys = "id", scale = 10, tolerance = 0.2)
#'
#' # We can use strict_factor to compare factors with characters for example:
#'
#' DF1 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(NA , NA , 1 , 2 , 3 , NA),
#'     stringsAsFactors = FALSE
#' )
#'
#' DF2 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(NA , NA , 1 , 2 , 3 , NA)
#' )
#'
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = TRUE)
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = FALSE)
#'
#' @export
diffdf <- function(
    base,
    compare,
    keys = NULL,
    suppress_warnings = FALSE,
    strict_numeric = TRUE,
    strict_factor = TRUE,
    file = NULL,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
) {

    BASE <- base
    COMP <- compare
    KEYS <- keys
    SUPWARN <- suppress_warnings


    ### Initatiate output object
    COMPARE <- list()
    class(COMPARE) <- c("diffdf", "list")

    is_derived <- FALSE

    ### If no key is suplied match values based upon row number
    if (is.null(KEYS)) {
        is_derived <- TRUE
        keyname <- generate_keyname(BASE, COMP)
        BASE[[keyname]] <- seq_len(nrow(BASE))
        COMP[[keyname]] <- seq_len(nrow(COMP))
        KEYS  <- keyname
    }
    attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived)



    if (!is.numeric(tolerance)) {
        stop("'tolerance' should be numeric")
    }

    if (!is.numeric(scale) && !is.null(scale)) {
        stop("'scale' should be numeric or NULL")
    }



    if (!has_unique_rows(BASE, KEYS)) {
        stop("BY variables in BASE do not result in unique observations")
    }

    if (!has_unique_rows(COMP, KEYS)) {
        stop("BY variables in COMPARE do not result in unique observations")
    }



    #### Check essential variable properties (class & mode)

    COMPARE[["UnsupportedColsBase"]] <- construct_issue(
        value = identify_unsupported_cols(BASE),
        message  = "There are columns in BASE with unsupported modes !!"
    )


    COMPARE[["UnsupportedColsComp"]] <- construct_issue(
        value = identify_unsupported_cols(COMP),
        message  = "There are columns in COMPARE with unsupported modes !!"
    )


    # cast variables if strict is off
    if (!strict_factor || !strict_numeric) {

        casted_df <- cast_variables(
            BASE = BASE,
            COMPARE = COMP,
            ignore_vars = KEYS,
            cast_integers = !strict_numeric,
            cast_factors = !strict_factor
        )

        BASE <- casted_df$BASE
        COMP <- casted_df$COMP

    }


    COMPARE[["VarModeDiffs"]] <- construct_issue(
        value = identify_mode_differences(BASE, COMP),
        message = "There are columns in BASE and COMPARE with different modes !!"
    )


    COMPARE[["VarClassDiffs"]] <- construct_issue(
        value = identify_class_differences(BASE, COMP),
        message = "There are columns in BASE and COMPARE with different classes !!"
    )


    exclude_cols <- c(
        COMPARE[["UnsupportedColsBase"]]$VARIABLE,
        COMPARE[["UnsupportedColsComp"]]$VARIABLE,
        COMPARE[["VarClassDiffs"]]$VARIABLE,
        COMPARE[["VarModeDiffs"]]$VARIABLE
    )


    ##### Check Validity of Keys

    BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
    COMP_keys <- names(COMP)[names(COMP) %in% KEYS]


    if (length(BASE_keys) != length(KEYS)) {
        stop("BASE is missing variables specified in KEYS")
    }

    if (length(COMP_keys) != length(KEYS)) {
        stop("COMP is missing variables specified in KEYS")
    }

    if (any(KEYS %in% exclude_cols)) {
        stop("KEYS are either an invalid or contain different modes between BASE and COMP")
    }


    ##### Check Attributes


    COMPARE[["AttribDiffs"]] <- construct_issue(
        value = identify_att_differences(BASE, COMP,  exclude_cols),
        message = "There are columns in BASE and COMPARE with differing attributes !!"
    )


    ##### Check data

    BASE <- factor_to_character(BASE, KEYS)
    COMP <- factor_to_character(COMP, KEYS)


    COMPARE[["ExtRowsBase"]] <- construct_issue(
        value = identify_extra_rows(BASE, COMP,   KEYS),
        message = "There are rows in BASE that are not in COMPARE !!"
    )


    COMPARE[["ExtRowsComp"]] <- construct_issue(
        value = identify_extra_rows(COMP, BASE, KEYS),
        message = "There are rows in COMPARE that are not in BASE !!"
    )



    COMPARE[["ExtColsBase"]] <- construct_issue(
        value = identify_extra_cols(BASE,  COMP),
        message = "There are columns in BASE that are not in COMPARE !!"
    )


    COMPARE[["ExtColsComp"]] <- construct_issue(
        value =  identify_extra_cols(COMP,  BASE),
        message = "There are columns in COMPARE that are not in BASE !!"
    )


    VALUE_DIFFERENCES <- identify_differences(
        BASE, COMP, KEYS, exclude_cols,
        tolerance = tolerance,
        scale = scale
    )



    ## Summarise the number of mismatching rows per variable

    if (length(VALUE_DIFFERENCES)) {
        NDIFF  <- sapply(VALUE_DIFFERENCES, nrow)
        COMPARE[["NumDiff"]] <- construct_issue(
            value = convert_to_issue(NDIFF),
            message = "Not all Values Compared Equal"
        )
    }


    for (i in names(VALUE_DIFFERENCES)){
        COMPARE[[paste0("VarDiff_", i)]] <- construct_issue(
            value = VALUE_DIFFERENCES[[i]],
            message = ""
        )
    }

    ## Get all issue messages, remove blank message, and collapse into single string
    ISSUE_MSGS <- sapply(COMPARE, function(x) get_issue_message(x))
    ISSUE_MSGS <- ISSUE_MSGS[ISSUE_MSGS != ""]

    if (length(ISSUE_MSGS) != 0) {
        if (!SUPWARN) {
            ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n")
            warning(c("\n", ISSUE_MSGS))
        }
    }


    if (!is.null(file)) {
        x <- print(COMPARE, as_string = TRUE)

        tryCatch(
            {
                sink(file)
                cat(x, sep = "\n")
                sink()
            },
            warning = function(w) {
                sink()
                warning(w)
            },
            error = function(e) {
                sink()
                stop(e)
            }
        )
        return(invisible(COMPARE))
    }

    return(COMPARE)
}




#' diffdf_has_issues
#'
#' Utility function which returns TRUE if an diffdf
#' object has issues or FALSE if an diffdf object does not have issues
#' @param x diffdf object
#' @examples
#'
#' # Example with no issues
#' x <- diffdf( iris, iris )
#' diffdf_has_issues(x)
#'
#' # Example with issues
#' iris2 <- iris
#' iris2[2,2] <- NA
#' x <- diffdf( iris , iris2 , suppress_warnings = TRUE)
#' diffdf_has_issues(x)
#' @export
diffdf_has_issues <- function(x) {
    if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object")
    return(length(x) != 0)
}



#' Pad String
#'
#' Utility function used to replicate `str_pad`. Adds white space to either end
#' of a string to get it to equal the desired length
#' @param x string
#' @param width desired length
string_pad <- function(x, width) {
    if (nchar(x) >= width) return(x)
    width <- width - nchar(x)
    left <- paste0(rep(" ", floor(width / 2)), collapse = "")
    right <- paste0(rep(" ", ceiling(width / 2)), collapse = "")
    paste0(left, x, right, collapse = "")
}


#' recursive_reduce
#'
#' Utility function used to replicated `purrr::reduce`. Recursively applies a
#' function to a list of elements until only 1 element remains
#' @param .l list of values to apply a function to
#' @param .f function to apply to each each element of the list in turn. See details.
#' @details
#' This function is essentially performing the following operation:
#' ```
#' .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]])
#' ```
recursive_reduce <- function(.l, .f) {
    if (length(.l) != 1) {
        .l[[2]] <- .f(.l[[1]], .l[[2]])
        return(recursive_reduce(.l[-1], .f))
    } else {
        return(.l[[1]])
    }
}

#' invert
#'
#' Utility function used to replicated `purrr::transpose`. Turns a list inside
#' out.
#' @param x list
invert <- function(x) {
    x2 <- list()
    cnames <- names(x)
    tnames <- names(x[[1]])
    for (i in tnames){
        x2[[i]] <- list()
        for (j in cnames){
            x2[[i]][[j]] <- x[[j]][[i]]
        }
    }
    return(x2)
}




#' as_ascii_table
#'
#' This function takes a `data.frame` and attempts to convert it into
#' a simple ascii format suitable for printing to the screen
#' It is assumed all variable values have a `as.character()` method
#' in order to cast them to character.
#' @param dat Input dataset to convert into a ascii table
#' @param line_prefix Symbols to prefix in front of every line of the table
as_ascii_table <- function(dat, line_prefix = "  ") {


    ## Convert every value to character and crop to a suitable length
    dat  <- as_tibble(apply(dat, c(1, 2), as_cropped_char))

    hold <- list()
    COLS <- colnames(dat)

    ### For each column extract core elements (width, values , title) and pad out
    ### each string to be a suitable length
    for (i in seq_len(ncol(dat))) {
        COL <- COLS[i]
        VALUES <- dat[[i]]

        JOINT <- c(COL, VALUES)
        WIDTH <- max(sapply(JOINT, nchar)) + 2

        hold[[COL]] <- list()
        hold[[COL]]$WIDTH <- WIDTH
        hold[[COL]]$VALUES <- sapply(VALUES, string_pad, width = WIDTH)
        hold[[COL]]$HEADER <- sapply(COL, string_pad, width = WIDTH)
    }

    ### Collapse into a single value per component ( title , values, width )
    thold  <- invert(hold)
    tvals  <- recursive_reduce(thold$VALUES, paste0)
    thead  <- recursive_reduce(thold$HEADER, paste0)
    twidth <- recursive_reduce(thold$WIDTH, sum)

    ### Create header and footer lines
    TLINE <- paste0(rep("=", twidth), collapse = "")
    LINE  <- paste0(rep("-", twidth), collapse = "")
    FVALS <- paste0(line_prefix, tvals, collapse = "\n")

    ### Output table
    paste0(
        "\n",
        line_prefix, TLINE, "\n",
        line_prefix, thead, "\n",
        line_prefix, LINE,  "\n",
        FVALS, "\n",
        line_prefix, LINE
    )
}








#' as_cropped_char
#'
#' Makes any character string above x chars
#' Reduce down to a x char string with ...
#' @param inval a single element value
#' @param crop_at character limit
as_cropped_char <- function(inval, crop_at = 30) {

    if (is.null(inval)) {
        inval <- "<NULL>"
    } else if (is.na(inval)) {
        inval <- "<NA>"
    } else {
        inval <- as.character(inval)
    }

    charlength <- sapply(inval, nchar)

    if (charlength > crop_at) {
        outval <- substr(inval, 1, crop_at)
        outval <- paste0(outval, "...")
    } else {
        outval <- inval
    }

    outval
}



#' get_table
#'
#' Generate nice looking table from a data frame
#' @param dsin dataset
#' @inheritParams print.diffdf
get_table <- function(dsin, row_limit = 10) {

    if (nrow(dsin) == 0) {
        return("")
    }
    if (!is.null(row_limit)) {
        if (length(row_limit) != 1) {
            stop("row_limit should have a length of 1")
        }
        if (!is.numeric(row_limit)) {
            stop("row_limit should be a numeric value or NULL")
        }
        if (row_limit <= 0) {
            stop("row_limit should be a positive integer")
        }
    }
    if (is.null(row_limit)) {
        display_table <- dsin
    } else {
        display_table <- subset(dsin, seq_len(nrow(dsin)) < (row_limit + 1))
    }

    if (!is.null(row_limit) && nrow(dsin) > row_limit) {
        add_message <- paste0(
            "First ",
            row_limit,
            " of ",
            nrow(dsin),
            " rows are shown in table below"
        )

    } else {
        add_message <- "All rows are shown in table below"
    }

    msg <- paste(
        c(
            add_message,
            as_ascii_table(display_table),
            "\n"
        ),
        collapse = "\n"
    )
    return(msg)
}


#' sort_then_join
#'
#' Convenience function to sort two strings and paste them together
#' @param string1 first string
#' @param string2 second string
sort_then_join <- function(string1, string2) {
    paste0(sort(c(string1, string2)), collapse = "")
}


#' class_merge
#'
#' Convenience function to put all classes an object has into one string
#' @param x an object
class_merge <- function(x) {
    paste(class(x), collapse = "_")
}


get_message <- function(colname, whichdat, totype) {
    message(paste0(
        "NOTE: Variable ", colname, " in ", tolower(whichdat), " was casted to ", totype
    ))
}


#' get_casted_vector
#'
#' casts a vector depending on its type and input
#' @param colin column to cast
#' @param colname name of vector
#' @param whichdat whether base or compare is being casted (used for messages)
get_casted_vector <- function(colin, colname,  whichdat) {

    if (class(colin) == "factor") {
        get_message(colname, whichdat, "character")
        return(as.character(colin))
    }

    if (class(colin) == "integer") {
        get_message(colname, whichdat, "numeric")
        return(as.numeric(colin))
    }

    colin
}



#' get_casted_dataset
#'
#' Internal utility function to loop across a dataset casting all target
#' variables
#' @param df dataset to be casted
#' @param columns columns to be casted
#' @param whichdat whether base or compare is being casted (used for messages)
get_casted_dataset <- function(df, columns, whichdat) {
    for (col in  columns) {
        df[[col]] <- get_casted_vector(df[[col]], col, whichdat)
    }
    return(df)
}






#' cast_variables
#'
#' Function to cast datasets columns if they have differing types
#' Restricted to specific cases, currently integer and double, and character and factor
#' @param BASE base dataset
#' @param COMPARE comparison dataset
#' @param ignore_vars Variables not to be considered for casting
#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles
#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters
cast_variables <- function(
    BASE,
    COMPARE,
    ignore_vars = NULL,
    cast_integers = FALSE,
    cast_factors = FALSE
) {

    allowed_class_casts <- c("integernumeric", "characterfactor")[c(cast_integers, cast_factors)]

    BASE_class <- data.frame(
        class_BASE = sapply(BASE, class_merge),
        colname = names(BASE),
        stringsAsFactors = FALSE
    )
    BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars, , drop = FALSE]


    COMPARE_class <- data.frame(
        class_COMPARE = sapply(COMPARE, class_merge),
        colname = names(COMPARE),
        stringsAsFactors = FALSE
    )
    COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars, , drop = FALSE]

    common_class <- merge(
        x = BASE_class,
        y = COMPARE_class,
        by = "colname"
    )


    diff_class <- common_class[common_class[["class_BASE"]] != common_class[["class_COMPARE"]], , drop = FALSE]


    diff_class$classmerge <- mapply(
        sort_then_join,
        diff_class$class_COMPARE,
        diff_class$class_BASE
    )


    cast_columns <- diff_class[diff_class[["classmerge"]] %in% allowed_class_casts, , drop = FALSE]


    DATASETS <- list(
        "BASE" = BASE,
        "COMPARE" = COMPARE
    )


    if (nrow(cast_columns) == 0) {
        return(DATASETS)
    }


    for (i in names(DATASETS)){
        DATASETS[[i]] <- get_casted_dataset(DATASETS[[i]], cast_columns$colname, i)
    }

    return(DATASETS)
}



#' identify_extra_rows
#'
#' Identifies rows that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
identify_extra_rows <- function(DS1, DS2, KEYS) {
    DS2[["..FLAG.."]] <- "Y"
    dat <- merge(
        subset(DS1, select = KEYS),
        subset(DS2, select = c(KEYS, "..FLAG..")),
        by = KEYS, all.x = TRUE,
        sort = TRUE
    )
    dat <- dat[do.call("order", dat[KEYS]), ]

    dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
}



#' identify_extra_cols
#'
#' Identifies columns that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @importFrom tibble tibble
identify_extra_cols <- function(DS1, DS2) {
    match.cols <- sapply(names(DS1), "%in%", names(DS2))
    if (!all(is.logical(match.cols))) {
        stop("Assumption of logical return type is not true")
    }
    tibble(
        COLUMNS = names(DS1)[!match.cols]
    )
}







#' identify_matching_cols
#'
#' Identifies columns with the same name in two data frames
#' @param DS1 Input dataset 1 (data frame)
#' @param DS2 Input dataset 2 (data frame)
#' @param EXCLUDE Columns to ignore
identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") {
    match_cols   <- sapply(names(DS1), "%in%", names(DS2))
    exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE)
    names(DS1)[match_cols & !exclude_cols]
}





#' identify_unsupported_cols
#'
#' Identifies any columns for which the package is not setup to handle
#' @param dsin input dataset
identify_unsupported_cols <- function(dsin) {
    dat <- subset(
        identify_properties(dsin),
        select = c("VARIABLE", "MODE")
    )

    dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE]

}



#' identify_mode_differences
#'
#' Identifies any mode differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
identify_mode_differences <- function(BASE, COMP) {

    matching_cols <- identify_matching_cols(BASE, COMP)

    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        suffixes = c(".BASE", ".COMP"),
        sort = TRUE
    )
    dat <-  subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP"))

    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]

    dat[KEEP1 & KEEP2, , drop = FALSE]

}



#' identify_class_differences
#'
#' Identifies any class differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
identify_class_differences <- function(BASE, COMP) {

    matching_cols <- identify_matching_cols(BASE, COMP)

    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        sort = TRUE,
        suffixes =  c(".BASE", ".COMP")
    )

    dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP"))

    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- !mapply(
        identical,
        dat[["CLASS.BASE"]],
        dat[["CLASS.COMP"]]
    )

    dat[KEEP1 & KEEP2, , drop = FALSE]

}



#' Identify differences in attributes
#'
#' Identifies any attribute differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param exclude_cols Columns to exclude from comparison
#' @importFrom tibble tibble
identify_att_differences <- function(BASE, COMP, exclude_cols = "") {

    matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols)

    PROPS <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        sort = TRUE,
        suffixes = c(".BASE", ".COMP")
    )

    PROPS <- subset(PROPS, select =  c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP"))

    PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE]


    ### Setup dummy return value
    RETURN <- tibble(
        VARIABLE = character(),
        ATTR_NAME = character(),
        VALUES.BASE = list(),
        VALUES.COMP = list()
    )

    for (i in  PROPS[["VARIABLE"]]) {

        PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE]

        ### Get a vector of all available attributes across both variables
        ATTRIB_NAMES <- unique(c(
            names(PROPS_filt[["ATTRIBS.BASE"]][[1]]),
            names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
        ))

        ### If variable has no attributes move onto the next variable
        if (is.null(ATTRIB_NAMES)) next()

        ### Loop over each attribute checking if they are identical and outputing
        ### anyones that arn't
        for (j in ATTRIB_NAMES){

            ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
            ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j]

            if (!identical(ATTRIB_BASE, ATTRIB_COMP)) {

                ATT_DIFFS <- tibble(
                    VARIABLE = i,
                    ATTR_NAME = j,
                    VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE),
                    VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP)
                )

                RETURN <- rbind(RETURN, ATT_DIFFS)
            }
        }
    }
    return(RETURN)
}





#' identify_differences
#'
#' Compares each column within 2 datasets to identify any values which they
#' mismatch on.
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
#' @param exclude_cols Columns to exclude from comparison
#' @param tolerance Level of tolerance for numeric differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
identify_differences <- function(
    BASE,
    COMP,
    KEYS,
    exclude_cols,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
) {

    matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))

    if (length(matching_cols) == 0) return(tibble())

    DAT <- merge(
        x = BASE,
        y = COMP,
        by = KEYS,
        suffix = c(".x", ".y"),
        sort = TRUE
    )
    DAT <- DAT[do.call("order", DAT[KEYS]), ]

    matching_list <- mapply(
        is_variable_different,
        matching_cols,
        MoreArgs = list(
            keynames = KEYS,
            datain = DAT,
            tolerance = tolerance,
            scale = scale
        ),
        SIMPLIFY = FALSE
    )

    matching_list
}









#' identify_properties
#'
#' Returns a dataframe of metadata for a given dataset.
#' Returned values include variable names , class , mode , type & attributes
#' @param dsin input dataframe that you want to get the metadata from
#' @importFrom tibble tibble
identify_properties <- function(dsin) {

    ### If missing or null return empty dataset
    if (is.null(dsin)) {
        x <- tibble(
            VARIABLE = character(),
            CLASS     = list(),
            MODE      = character(),
            TYPE      = character(),
            ATTRIBS   = list()
        )
        return(x)
    }

    tibble(
        VARIABLE = names(dsin),
        CLASS     = lapply(dsin, class),
        MODE      = sapply(dsin, mode),
        TYPE      = sapply(dsin, typeof),
        ATTRIBS   = lapply(dsin, attributes)
    )
}

#' is_variable_different
#'
#' This subsets the data set on the variable name, picks out differences and returns a `tibble`
#' of differences for the given variable
#' @importFrom tibble as_tibble
#' @param variablename name of variable being compared
#' @param keynames name of keys
#' @param datain Inputted dataset with base and compare vectors
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @return A boolean vector which is T if target and current are different
is_variable_different <- function(variablename, keynames, datain, ...) {

    xvar <- paste0(variablename, ".x")
    yvar <- paste0(variablename, ".y")

    if (!xvar %in% names(datain) || ! yvar %in% names(datain)) {
        stop("Variable does not exist within input dataset")
    }

    target  <- datain[[xvar]]
    current <- datain[[yvar]]
    outvect <- find_difference(target, current, ...)

    datain[["VARIABLE"]] <- variablename

    names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")

    x <- as_tibble(
        subset(
            datain,
            outvect,
            select = c("VARIABLE", keynames, "BASE", "COMPARE")
        )
    )

    return(x)
}

#' compare_vectors
#'
#' Compare two vectors looking for differences
#'
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
compare_vectors <- function(target, current, ...) {
    UseMethod("compare_vectors")
}


#' find_difference
#'
#' This determines if two vectors are different. It expects vectors of the same
#' length and type, and is intended to be used after checks have already been done
#' Initially picks out any `NA`'s (matching `NA`'s count as a match)
#' Then compares remaining vector
#'
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
find_difference <- function(target, current, ...) {

    if (length(target) != length(current)) {
        warning("Inputs are not of the same length")
        return(NULL)
    }

    if (is.null(target) || is.null(current)) {
        return(is.null(target) != is.null(current))
    }

    ### Initalise output, assume problem unless evidence otherwise
    return_vector <- rep(TRUE, length(target))

    nas_t <- is.na(target)
    nas_c <- is.na(current)

    ## compare missing values
    nacompare <- nas_t != nas_c
    naselect <- nas_t | nas_c
    return_vector[naselect]  <- nacompare[naselect]

    ## compare non-missing values
    selectvector <- as.logical((!nas_t) * (!nas_c))

    comparevect <- compare_vectors(
        target[selectvector],
        current[selectvector],
        ...
    )

    return_vector[selectvector] <- comparevect

    return(return_vector)
}







#' compare_vectors.default
#'
#' Default method, if the vector is not numeric or factor. Basic comparison
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
compare_vectors.default <- function(target, current, ...) {
    target != current
}




#' compare_vectors.factor
#'
#' Compares factors. Sets them as character and then compares
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
compare_vectors.factor <- function(target, current, ...) {
    as.character(target) != as.character(current)
}





#' compare_vectors.numeric
#'
#' This is a modified version of the all.equal function
#' which returns a vector rather than a message
#' @param target the base vector
#' @param current a vector to compare target to
#' @param tolerance Level of tolerance for differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
compare_vectors.numeric <- function(
    target,
    current,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
) {

    out <- target == current

    if (all(out)) {
        return(!out)
    }

    if (is.integer(target) || is.integer(current)) {
        target <- as.double(target)
        current <- as.double(current)
    }

    xy <- abs(target - current)

    if (!is.null(scale)) {
        xy <- xy / scale
    }

    return(xy > tolerance)
}

#' compare_vectors.int64
#'
#' Handle int64 vectors. Uses numeric comparison
#' @param target the base vector
#' @param current a vector to compare target to
#' @param tolerance Level of tolerance for differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
compare_vectors.integer64 <- function(
    target,
    current,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
) {
    compare_vectors.numeric(target, current, tolerance, scale)
}






#' Identify Issue Rows
#'
#' This function takes a `diffdf` object and a dataframe and subsets
#' the `data.frame` for problem rows as identified in the comparison object.
#' If \code{vars} has been specified only issue rows associated with those
#' variable(s) will be returned.
#' @param df dataframe to be subsetted
#' @param diff diffdf object
#' @param vars (optional) character vector containing names of issue variables to subset dataframe
#' on. A value of NULL (default) will be taken to mean available issue variables.
#' @examples
#' iris2 <- iris
#' for ( i in 1:3) iris2[i,i] <- 99
#' x <- diffdf( iris , iris2, suppress_warnings = TRUE)
#' diffdf_issuerows( iris , x)
#' diffdf_issuerows( iris2 , x)
#' diffdf_issuerows( iris2 , x , vars = "Sepal.Length")
#' diffdf_issuerows( iris2 , x , vars = c("Sepal.Length" , "Sepal.Width"))
#' @details
#' Note that `diffdf_issuerows` can be used to subset against any dataframe. The only
#' requirement is that the original variables specified in the keys argument to diffdf
#' are present on the dataframe you are subsetting against. However please note that if
#' no keys were specified in diffdf then the row number is used. This means using
#' `diffdf_issuerows` without a keys against an arbitrary dataset can easily result in
#' nonsense rows being returned. It is always recommended to supply keys to diffdf.
#' @export
diffdf_issuerows <- function(df, diff, vars = NULL) {

    if (class(diff)[[1]] != "diffdf") {
        stop("diff should be an diffdf object")
    }

    KEYS_ATT <- attr(diff, "keys")

    if (is.null(KEYS_ATT)) {
        stop("diff is missing the keys attribute")
    }

    issue_vars <- names(diff)[grep("^VarDiff_", names(diff))]

    if (is.null(vars)) {
        vars <- issue_vars
    }  else {
        vars <- paste0("VarDiff_", vars)
    }

    if (length(issue_vars) == 0 || sum(vars %in% issue_vars) == 0) {
        return(df[FALSE, ])
    }

    KEEP <- mapply(
        FUN      = get_issue_dataset,
        issue    = vars,
        diff     = list(diff),
        SIMPLIFY = FALSE
    )

    KEEP <- recursive_reduce(KEEP, rbind)
    KEEP <- KEEP[!duplicated(KEEP), ]

    if (KEYS_ATT$is_derived) {
        df[[KEYS_ATT$value]] <- seq_len(nrow(df))
    }

    keys <- KEYS_ATT$value

    if (any(! keys %in% names(df))) {
        stop("df does not contain all variables specified as keys in diff")
    }

    RET <- merge(
        x = df,
        y = KEEP,
        sort = TRUE
    )

    RET <- RET[do.call("order", RET[keys]), ]

    if (KEYS_ATT$is_derived) {
        keep_vars <- !names(RET) %in% KEYS_ATT$value
        RET <- RET[, keep_vars, drop = FALSE]
    }

    return(RET)
}




#' get_issue_dataset
#'
#' Internal function used by `diffdf_issuerows` to extract the dataframe
#' from each a target issue. In particular it also strips off any
#' non-key variables
#' @param issue name of issue to extract the dataset from diff
#' @param diff diffdf object which contains issues
get_issue_dataset <- function(issue, diff) {
    issue_df <- diff[[issue]]
    keep <- names(issue_df)[!(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
    issue_df[, keep, drop = FALSE]
}



#' Generate unique key name
#'
#' Function to generate a name for the keys if not provided
#'
#' @param BASE base dataset
#' @param COMP comparison dataset
#' @param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity
#'
generate_keyname <- function(
    BASE,
    COMP,
    replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
) {
    if (class(replace_names) != "character") {
        stop("replace_names is not a character vector")
    }

    if (length(replace_names) == 0) {
        stop("All default row names are in use in BASE/COMPARE. Please provide a KEY argument")
    }

    key_name <- replace_names[1]

    if (!is.null(BASE[[key_name]]) || !is.null(COMP[[key_name]])) {
        key_name <- generate_keyname(BASE, COMP, replace_names[-1])
    }
    return(key_name)
}

#' construct_issue
#'
#' Make an s3 object with class issue and possible additional class,
#' and assign other arguments to attributes
#' @param value the value of the object
#' @param message the value of the message attribute
#' @param add_class additional class to add
construct_issue <- function(value, message, add_class = NULL) {
    x <- value

    ### If nothing has been provided return nothing !
    if (nrow(x) == 0) return(NULL)

    class(x) <- c(add_class, "issue", class(x))
    attributes(x)[["message"]] <- message
    return(x)
}


#' get_issue_message
#'
#' Simple function to grab the issue message
#' @param object inputted object of class issue
#' @param ... other arguments
get_issue_message <- function(object, ...) {
    return(attr(object, "message"))
}


#' get_print_message
#'
#' Get the required text depending on type of issue
#' @param object inputted object of class issue
#' @param ... other arguments
get_print_message <- function(object, ...) {
    UseMethod("get_print_message", object)
}


#' get_print_message.default
#'
#' Errors, as this should only ever be given an issue
#' @param object issue
get_print_message.default <- function(object) {
    stop("Error: An issue has not been provided to this function!")
}


#' get_print_message.issue
#'
#' Get text from a basic issue, based on the class of the value of the issue
#'
#' @param object an object of class issue_basic
#' @inheritParams print.diffdf
get_print_message.issue <- function(object, row_limit, ...) {
    paste(
        c(attr(object, "message"), get_table(object, row_limit = row_limit)),
        collapse = "\n"
    )
}


#' Print diffdf objects
#'
#' Print nicely formatted version of an diffdf object
#' @param x comparison object created by diffdf().
#' @param ... Additional arguments (not used)
#' @param row_limit Max row limit for difference tables (NULL to show all rows)
#' @param as_string Return printed message as an R character vector?
#' @examples
#' x <- subset( iris , -Species )
#' x[1,2] <- 5
#' COMPARE <- diffdf( iris, x)
#' print( COMPARE )
#' print( COMPARE, row_limit = 5 )
#' @export
print.diffdf <- function(x, row_limit = 10, as_string = FALSE, ...) {
    if (!is.null(row_limit)) {
        if (length(row_limit) != 1) {
            stop("row_limit should have a length of 1")
        }
        if (!is.numeric(row_limit)) {
            stop("row_limit should be a numeric value or NULL")
        }
        if (row_limit <= 0) {
            stop("row_limit should be a positive integer")
        }
    }
    if (!is.logical(as_string) || length(as_string) != 1) {
        stop("as_string should be a logical of length one")
    }
    COMPARE <- x

    if (length(COMPARE) == 0) {
        outtext <- "No issues were found!\n"
    } else {
        start_text <- paste0(
            "Differences found between the objects!\n\n",
            "A summary is given below.\n\n"
        )
        end_text <- lapply(COMPARE, function(x) get_print_message(x, row_limit))
        end_text <- paste0(unlist(end_text), collapse = "")
        outtext <- paste0(start_text, end_text)
    }
    if (as_string) {
        return(strsplit(outtext, "\n")[[1]])
    } else {
        cat(outtext)
        return(invisible(COMPARE))
    }
}




#' factor_to_character
#'
#' Takes a dataframe and converts any factor variables to character
#' @param dsin input dataframe
#' @param vars variables to consider for conversion. Default NULL will consider
#' every variable within the dataset
factor_to_character <- function(dsin, vars = NULL) {

    if (is.null(vars)) vars <- names(dsin)

    for (var in vars){
        if (is.factor(dsin[[var]])) {
            dsin[[var]] <- as.character(dsin[[var]])
        }
    }
    return(dsin)
}




#' has_unique_rows
#'
#' Check if a data sets rows are unique
#' @param DAT input data set (data frame)
#' @param KEYS Set of keys which should be unique
has_unique_rows <- function(DAT, KEYS) {
    DUPS <- duplicated(subset(DAT, select = KEYS))
    NDUPS <- sum(DUPS)
    return(NDUPS == 0)
}

#'convert_to_issue
#'
#'converts the count value into the correct issue format
#'@param datin data inputted
#'@importFrom tibble rownames_to_column
convert_to_issue <- function(datin) {
    datin_tibble <- tibble(
        `Variable` = names(datin),
        `No of Differences` = datin
    )

    datin_tibble_reduced <- datin_tibble[datin_tibble[["No of Differences"]] > 0, , drop = FALSE]
    return(datin_tibble_reduced)
}

1
2
3		#' Pad String
4		#'
5		#' Utility function used to replicate `str_pad`. Adds white space to either end
6		#' of a string to get it to equal the desired length
7		#' @param x string
8		#' @param width desired length
9		string_pad <- function(x, width) {
10	!	if (nchar(x) >= width) return(x)
11	1114x	width <- width - nchar(x)
12	1114x	left <- paste0(rep(" ", floor(width / 2)), collapse = "")
13	1114x	right <- paste0(rep(" ", ceiling(width / 2)), collapse = "")
14	1114x	paste0(left, x, right, collapse = "")
15		}
16
17
18		#' recursive_reduce
19		#'
20		#' Utility function used to replicated `purrr::reduce`. Recursively applies a
21		#' function to a list of elements until only 1 element remains
22		#' @param .l list of values to apply a function to
23		#' @param .f function to apply to each each element of the list in turn. See details.
24		#' @details
25		#' This function is essentially performing the following operation:
26		#' ```
27		#' .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]])
28		#' ```
29		recursive_reduce <- function(.l, .f) {
30	629x	if (length(.l) != 1) {
31	425x	.l[[2]] <- .f(.l[[1]], .l[[2]])
32	425x	return(recursive_reduce(.l[-1], .f))
33		} else {
34	204x	return(.l[[1]])
35		}
36		}
37
38		#' invert
39		#'
40		#' Utility function used to replicated `purrr::transpose`. Turns a list inside
41		#' out.
42		#' @param x list
43		invert <- function(x) {
44	64x	x2 <- list()
45	64x	cnames <- names(x)
46	64x	tnames <- names(x[[1]])
47	64x	for (i in tnames){
48	192x	x2[[i]] <- list()
49	192x	for (j in cnames){
50	609x	x2[[i]][[j]] <- x[[j]][[i]]
51		}
52		}
53	64x	return(x2)
54		}
55
56
57
58
59		#' as_ascii_table
60		#'
61		#' This function takes a `data.frame` and attempts to convert it into
62		#' a simple ascii format suitable for printing to the screen
63		#' It is assumed all variable values have a `as.character()` method
64		#' in order to cast them to character.
65		#' @param dat Input dataset to convert into a ascii table
66		#' @param line_prefix Symbols to prefix in front of every line of the table
67		as_ascii_table <- function(dat, line_prefix = " ") {
68
69
70		## Convert every value to character and crop to a suitable length
71	64x	dat <- as_tibble(apply(dat, c(1, 2), as_cropped_char))
72
73	64x	hold <- list()
74	64x	COLS <- colnames(dat)
75
76		### For each column extract core elements (width, values , title) and pad out
77		### each string to be a suitable length
78	64x	for (i in seq_len(ncol(dat))) {
79	203x	COL <- COLS[i]
80	203x	VALUES <- dat[[i]]
81
82	203x	JOINT <- c(COL, VALUES)
83	203x	WIDTH <- max(sapply(JOINT, nchar)) + 2
84
85	203x	hold[[COL]] <- list()
86	203x	hold[[COL]]$WIDTH <- WIDTH
87	203x	hold[[COL]]$VALUES <- sapply(VALUES, string_pad, width = WIDTH)
88	203x	hold[[COL]]$HEADER <- sapply(COL, string_pad, width = WIDTH)
89		}
90
91		### Collapse into a single value per component ( title , values, width )
92	64x	thold <- invert(hold)
93	64x	tvals <- recursive_reduce(thold$VALUES, paste0)
94	64x	thead <- recursive_reduce(thold$HEADER, paste0)
95	64x	twidth <- recursive_reduce(thold$WIDTH, sum)
96
97		### Create header and footer lines
98	64x	TLINE <- paste0(rep("=", twidth), collapse = "")
99	64x	LINE <- paste0(rep("-", twidth), collapse = "")
100	64x	FVALS <- paste0(line_prefix, tvals, collapse = "\n")
101
102		### Output table
103	64x	paste0(
104	64x	"\n",
105	64x	line_prefix, TLINE, "\n",
106	64x	line_prefix, thead, "\n",
107	64x	line_prefix, LINE, "\n",
108	64x	FVALS, "\n",
109	64x	line_prefix, LINE
110		)
111		}
112
113
114
115
116
117
118
119
120		#' as_cropped_char
121		#'
122		#' Makes any character string above x chars
123		#' Reduce down to a x char string with ...
124		#' @param inval a single element value
125		#' @param crop_at character limit
126		as_cropped_char <- function(inval, crop_at = 30) {
127
128	911x	if (is.null(inval)) {
129	!	inval <- "<NULL>"
130	911x	} else if (is.na(inval)) {
131	1x	inval <- "<NA>"
132		} else {
133	910x	inval <- as.character(inval)
134		}
135
136	911x	charlength <- sapply(inval, nchar)
137
138	911x	if (charlength > crop_at) {
139	10x	outval <- substr(inval, 1, crop_at)
140	10x	outval <- paste0(outval, "...")
141		} else {
142	901x	outval <- inval
143		}
144
145	911x	outval
146		}
147
148
149
150		#' get_table
151		#'
152		#' Generate nice looking table from a data frame
153		#' @param dsin dataset
154		#' @inheritParams print.diffdf
155		get_table <- function(dsin, row_limit = 10) {
156
157	64x	if (nrow(dsin) == 0) {
158	!	return("")
159		}
160	64x	if (!is.null(row_limit)) {
161	62x	if (length(row_limit) != 1) {
162	!	stop("row_limit should have a length of 1")
163		}
164	62x	if (!is.numeric(row_limit)) {
165	!	stop("row_limit should be a numeric value or NULL")
166		}
167	62x	if (row_limit <= 0) {
168	!	stop("row_limit should be a positive integer")
169		}
170		}
171	64x	if (is.null(row_limit)) {
172	2x	display_table <- dsin
173		} else {
174	62x	display_table <- subset(dsin, seq_len(nrow(dsin)) < (row_limit + 1))
175		}
176
177	64x	if (!is.null(row_limit) && nrow(dsin) > row_limit) {
178	6x	add_message <- paste0(
179	6x	"First ",
180	6x	row_limit,
181	6x	" of ",
182	6x	nrow(dsin),
183	6x	" rows are shown in table below"
184		)
185
186		} else {
187	58x	add_message <- "All rows are shown in table below"
188		}
189
190	64x	msg <- paste(
191	64x	c(
192	64x	add_message,
193	64x	as_ascii_table(display_table),
194	64x	"\n"
195		),
196	64x	collapse = "\n"
197		)
198	64x	return(msg)
199		}

1
2		#' sort_then_join
3		#'
4		#' Convenience function to sort two strings and paste them together
5		#' @param string1 first string
6		#' @param string2 second string
7		sort_then_join <- function(string1, string2) {
8	14x	paste0(sort(c(string1, string2)), collapse = "")
9		}
10
11
12		#' class_merge
13		#'
14		#' Convenience function to put all classes an object has into one string
15		#' @param x an object
16		class_merge <- function(x) {
17	304x	paste(class(x), collapse = "_")
18		}
19
20
21		get_message <- function(colname, whichdat, totype) {
22	12x	message(paste0(
23	12x	"NOTE: Variable ", colname, " in ", tolower(whichdat), " was casted to ", totype
24		))
25		}
26
27
28		#' get_casted_vector
29		#'
30		#' casts a vector depending on its type and input
31		#' @param colin column to cast
32		#' @param colname name of vector
33		#' @param whichdat whether base or compare is being casted (used for messages)
34		get_casted_vector <- function(colin, colname, whichdat) {
35
36	24x	if (class(colin) == "factor") {
37	6x	get_message(colname, whichdat, "character")
38	6x	return(as.character(colin))
39		}
40
41	18x	if (class(colin) == "integer") {
42	6x	get_message(colname, whichdat, "numeric")
43	6x	return(as.numeric(colin))
44		}
45
46	12x	colin
47		}
48
49
50
51		#' get_casted_dataset
52		#'
53		#' Internal utility function to loop across a dataset casting all target
54		#' variables
55		#' @param df dataset to be casted
56		#' @param columns columns to be casted
57		#' @param whichdat whether base or compare is being casted (used for messages)
58		get_casted_dataset <- function(df, columns, whichdat) {
59	22x	for (col in columns) {
60	24x	df[[col]] <- get_casted_vector(df[[col]], col, whichdat)
61		}
62	22x	return(df)
63		}
64
65
66
67
68
69
70		#' cast_variables
71		#'
72		#' Function to cast datasets columns if they have differing types
73		#' Restricted to specific cases, currently integer and double, and character and factor
74		#' @param BASE base dataset
75		#' @param COMPARE comparison dataset
76		#' @param ignore_vars Variables not to be considered for casting
77		#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles
78		#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters
79		cast_variables <- function(
80		BASE,
81		COMPARE,
82		ignore_vars = NULL,
83		cast_integers = FALSE,
84		cast_factors = FALSE
85		) {
86
87	13x	allowed_class_casts <- c("integernumeric", "characterfactor")[c(cast_integers, cast_factors)]
88
89	13x	BASE_class <- data.frame(
90	13x	class_BASE = sapply(BASE, class_merge),
91	13x	colname = names(BASE),
92	13x	stringsAsFactors = FALSE
93		)
94	13x	BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars, , drop = FALSE]
95
96
97	13x	COMPARE_class <- data.frame(
98	13x	class_COMPARE = sapply(COMPARE, class_merge),
99	13x	colname = names(COMPARE),
100	13x	stringsAsFactors = FALSE
101		)
102	13x	COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars, , drop = FALSE]
103
104	13x	common_class <- merge(
105	13x	x = BASE_class,
106	13x	y = COMPARE_class,
107	13x	by = "colname"
108		)
109
110
111	13x	diff_class <- common_class[common_class[["class_BASE"]] != common_class[["class_COMPARE"]], , drop = FALSE]
112
113
114	13x	diff_class$classmerge <- mapply(
115	13x	sort_then_join,
116	13x	diff_class$class_COMPARE,
117	13x	diff_class$class_BASE
118		)
119
120
121	13x	cast_columns <- diff_class[diff_class[["classmerge"]] %in% allowed_class_casts, , drop = FALSE]
122
123
124	13x	DATASETS <- list(
125	13x	"BASE" = BASE,
126	13x	"COMPARE" = COMPARE
127		)
128
129
130	13x	if (nrow(cast_columns) == 0) {
131	2x	return(DATASETS)
132		}
133
134
135	11x	for (i in names(DATASETS)){
136	22x	DATASETS[[i]] <- get_casted_dataset(DATASETS[[i]], cast_columns$colname, i)
137		}
138
139	11x	return(DATASETS)
140		}

1		#' is_variable_different
2		#'
3		#' This subsets the data set on the variable name, picks out differences and returns a `tibble`
4		#' of differences for the given variable
5		#' @importFrom tibble as_tibble
6		#' @param variablename name of variable being compared
7		#' @param keynames name of keys
8		#' @param datain Inputted dataset with base and compare vectors
9		#' @param ... Additional arguments which might be passed through (numerical accuracy)
10		#' @return A boolean vector which is T if target and current are different
11		is_variable_different <- function(variablename, keynames, datain, ...) {
12
13	993x	xvar <- paste0(variablename, ".x")
14	993x	yvar <- paste0(variablename, ".y")
15
16	993x	if (!xvar %in% names(datain) \|\| ! yvar %in% names(datain)) {
17	!	stop("Variable does not exist within input dataset")
18		}
19
20	993x	target <- datain[[xvar]]
21	993x	current <- datain[[yvar]]
22	993x	outvect <- find_difference(target, current, ...)
23
24	993x	datain[["VARIABLE"]] <- variablename
25
26	993x	names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")
27
28	993x	x <- as_tibble(
29	993x	subset(
30	993x	datain,
31	993x	outvect,
32	993x	select = c("VARIABLE", keynames, "BASE", "COMPARE")
33		)
34		)
35
36	993x	return(x)
37		}
38
39		#' compare_vectors
40		#'
41		#' Compare two vectors looking for differences
42		#'
43		#' @param target the base vector
44		#' @param current a vector to compare target to
45		#' @param ... Additional arguments which might be passed through (numerical accuracy)
46		compare_vectors <- function(target, current, ...) {
47	1026x	UseMethod("compare_vectors")
48		}
49
50
51		#' find_difference
52		#'
53		#' This determines if two vectors are different. It expects vectors of the same
54		#' length and type, and is intended to be used after checks have already been done
55		#' Initially picks out any `NA`'s (matching `NA`'s count as a match)
56		#' Then compares remaining vector
57		#'
58		#' @param target the base vector
59		#' @param current a vector to compare target to
60		#' @param ... Additional arguments which might be passed through (numerical accuracy)
61		find_difference <- function(target, current, ...) {
62
63	1029x	if (length(target) != length(current)) {
64	2x	warning("Inputs are not of the same length")
65	2x	return(NULL)
66		}
67
68	1027x	if (is.null(target) \|\| is.null(current)) {
69	1x	return(is.null(target) != is.null(current))
70		}
71
72		### Initalise output, assume problem unless evidence otherwise
73	1026x	return_vector <- rep(TRUE, length(target))
74
75	1026x	nas_t <- is.na(target)
76	1026x	nas_c <- is.na(current)
77
78		## compare missing values
79	1026x	nacompare <- nas_t != nas_c
80	1026x	naselect <- nas_t \| nas_c
81	1026x	return_vector[naselect] <- nacompare[naselect]
82
83		## compare non-missing values
84	1026x	selectvector <- as.logical((!nas_t) * (!nas_c))
85
86	1026x	comparevect <- compare_vectors(
87	1026x	target[selectvector],
88	1026x	current[selectvector],
89		...
90		)
91
92	1026x	return_vector[selectvector] <- comparevect
93
94	1026x	return(return_vector)
95		}
96
97
98
99
100
101
102
103		#' compare_vectors.default
104		#'
105		#' Default method, if the vector is not numeric or factor. Basic comparison
106		#' @param target the base vector
107		#' @param current a vector to compare target to
108		#' @param ... Additional arguments which might be passed through (numerical accuracy)
109		compare_vectors.default <- function(target, current, ...) {
110	445x	target != current
111		}
112
113
114
115
116		#' compare_vectors.factor
117		#'
118		#' Compares factors. Sets them as character and then compares
119		#' @param target the base vector
120		#' @param current a vector to compare target to
121		#' @param ... Additional arguments which might be passed through (numerical accuracy)
122		compare_vectors.factor <- function(target, current, ...) {
123	90x	as.character(target) != as.character(current)
124		}
125
126
127
128
129
130		#' compare_vectors.numeric
131		#'
132		#' This is a modified version of the all.equal function
133		#' which returns a vector rather than a message
134		#' @param target the base vector
135		#' @param current a vector to compare target to
136		#' @param tolerance Level of tolerance for differences between two variables
137		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
138		compare_vectors.numeric <- function(
139		target,
140		current,
141		tolerance = sqrt(.Machine$double.eps),
142		scale = NULL
143		) {
144
145	491x	out <- target == current
146
147	491x	if (all(out)) {
148	451x	return(!out)
149		}
150
151	40x	if (is.integer(target) \|\| is.integer(current)) {
152	13x	target <- as.double(target)
153	13x	current <- as.double(current)
154		}
155
156	40x	xy <- abs(target - current)
157
158	40x	if (!is.null(scale)) {
159	4x	xy <- xy / scale
160		}
161
162	40x	return(xy > tolerance)
163		}
164
165		#' compare_vectors.int64
166		#'
167		#' Handle int64 vectors. Uses numeric comparison
168		#' @param target the base vector
169		#' @param current a vector to compare target to
170		#' @param tolerance Level of tolerance for differences between two variables
171		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
172		compare_vectors.integer64 <- function(
173		target,
174		current,
175		tolerance = sqrt(.Machine$double.eps),
176		scale = NULL
177		) {
178	4x	compare_vectors.numeric(target, current, tolerance, scale)
179		}

1
2
3
4
5
6		#' Identify Issue Rows
7		#'
8		#' This function takes a `diffdf` object and a dataframe and subsets
9		#' the `data.frame` for problem rows as identified in the comparison object.
10		#' If \code{vars} has been specified only issue rows associated with those
11		#' variable(s) will be returned.
12		#' @param df dataframe to be subsetted
13		#' @param diff diffdf object
14		#' @param vars (optional) character vector containing names of issue variables to subset dataframe
15		#' on. A value of NULL (default) will be taken to mean available issue variables.
16		#' @examples
17		#' iris2 <- iris
18		#' for ( i in 1:3) iris2[i,i] <- 99
19		#' x <- diffdf( iris , iris2, suppress_warnings = TRUE)
20		#' diffdf_issuerows( iris , x)
21		#' diffdf_issuerows( iris2 , x)
22		#' diffdf_issuerows( iris2 , x , vars = "Sepal.Length")
23		#' diffdf_issuerows( iris2 , x , vars = c("Sepal.Length" , "Sepal.Width"))
24		#' @details
25		#' Note that `diffdf_issuerows` can be used to subset against any dataframe. The only
26		#' requirement is that the original variables specified in the keys argument to diffdf
27		#' are present on the dataframe you are subsetting against. However please note that if
28		#' no keys were specified in diffdf then the row number is used. This means using
29		#' `diffdf_issuerows` without a keys against an arbitrary dataset can easily result in
30		#' nonsense rows being returned. It is always recommended to supply keys to diffdf.
31		#' @export
32		diffdf_issuerows <- function(df, diff, vars = NULL) {
33
34	19x	if (class(diff)[[1]] != "diffdf") {
35	!	stop("diff should be an diffdf object")
36		}
37
38	19x	KEYS_ATT <- attr(diff, "keys")
39
40	19x	if (is.null(KEYS_ATT)) {
41	!	stop("diff is missing the keys attribute")
42		}
43
44	19x	issue_vars <- names(diff)[grep("^VarDiff_", names(diff))]
45
46	19x	if (is.null(vars)) {
47	15x	vars <- issue_vars
48		} else {
49	4x	vars <- paste0("VarDiff_", vars)
50		}
51
52	19x	if (length(issue_vars) == 0 \|\| sum(vars %in% issue_vars) == 0) {
53	7x	return(df[FALSE, ])
54		}
55
56	12x	KEEP <- mapply(
57	12x	FUN = get_issue_dataset,
58	12x	issue = vars,
59	12x	diff = list(diff),
60	12x	SIMPLIFY = FALSE
61		)
62
63	12x	KEEP <- recursive_reduce(KEEP, rbind)
64	12x	KEEP <- KEEP[!duplicated(KEEP), ]
65
66	12x	if (KEYS_ATT$is_derived) {
67	2x	df[[KEYS_ATT$value]] <- seq_len(nrow(df))
68		}
69
70	12x	keys <- KEYS_ATT$value
71
72	12x	if (any(! keys %in% names(df))) {
73	1x	stop("df does not contain all variables specified as keys in diff")
74		}
75
76	11x	RET <- merge(
77	11x	x = df,
78	11x	y = KEEP,
79	11x	sort = TRUE
80		)
81
82	11x	RET <- RET[do.call("order", RET[keys]), ]
83
84	11x	if (KEYS_ATT$is_derived) {
85	2x	keep_vars <- !names(RET) %in% KEYS_ATT$value
86	2x	RET <- RET[, keep_vars, drop = FALSE]
87		}
88
89	11x	return(RET)
90		}
91
92
93
94
95		#' get_issue_dataset
96		#'
97		#' Internal function used by `diffdf_issuerows` to extract the dataframe
98		#' from each a target issue. In particular it also strips off any
99		#' non-key variables
100		#' @param issue name of issue to extract the dataset from diff
101		#' @param diff diffdf object which contains issues
102		get_issue_dataset <- function(issue, diff) {
103	20x	issue_df <- diff[[issue]]
104	20x	keep <- names(issue_df)[!(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
105	20x	issue_df[, keep, drop = FALSE]
106		}

1
2
3		#' Generate unique key name
4		#'
5		#' Function to generate a name for the keys if not provided
6		#'
7		#' @param BASE base dataset
8		#' @param COMP comparison dataset
9		#' @param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity
10		#'
11		generate_keyname <- function(
12		BASE,
13		COMP,
14		replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
15		) {
16	213x	if (class(replace_names) != "character") {
17	!	stop("replace_names is not a character vector")
18		}
19
20	213x	if (length(replace_names) == 0) {
21	8x	stop("All default row names are in use in BASE/COMPARE. Please provide a KEY argument")
22		}
23
24	205x	key_name <- replace_names[1]
25
26	205x	if (!is.null(BASE[[key_name]]) \|\| !is.null(COMP[[key_name]])) {
27	66x	key_name <- generate_keyname(BASE, COMP, replace_names[-1])
28		}
29	173x	return(key_name)
30		}

1		#' diffdf
2		#' @description
3		#' Compares 2 dataframes and outputs any differences.
4		#' @param base input dataframe
5		#' @param compare comparison dataframe
6		#' @param keys vector of variables (as strings) that defines a unique row in
7		#' the base and compare dataframes
8		#' @param strict_numeric Flag for strict numeric to numeric comparisons
9		#' (default = TRUE). If False diffdf will cast integer to double where
10		#' required for comparisons. Note that variables specified in the keys
11		#' will never be casted.
12		#' @param strict_factor Flag for strict factor to character comparisons
13		#' (default = TRUE). If False diffdf will cast factors to characters where
14		#' required for comparisons. Note that variables specified in the keys will
15		#' never be casted.
16		#' @param suppress_warnings Do you want to suppress warnings? (logical)
17		#' @param file Location and name of a text file to output the results to.
18		#' Setting to NULL will cause no file to be produced.
19		#' @param tolerance Set tolerance for numeric comparisons. Note that
20		#' comparisons fail if (x-y)/scale > tolerance.
21		#' @param scale Set scale for numeric comparisons. Note that comparisons fail
22		#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient
23		#' version of scale = 1.
24		#' @examples
25		#' x <- subset( iris, -Species)
26		#' x[1,2] <- 5
27		#' COMPARE <- diffdf( iris, x)
28		#' print( COMPARE )
29		#'
30		#' #### Sample data frames
31		#'
32		#' DF1 <- data.frame(
33		#' id = c(1,2,3,4,5,6),
34		#' v1 = letters[1:6],
35		#' v2 = c(NA , NA , 1 , 2 , 3 , NA)
36		#' )
37		#'
38		#' DF2 <- data.frame(
39		#' id = c(1,2,3,4,5,7),
40		#' v1 = letters[1:6],
41		#' v2 = c(NA , NA , 1 , 2 , NA , NA),
42		#' v3 = c(NA , NA , 1 , 2 , NA , 4)
43		#' )
44		#'
45		#' diffdf(DF1 , DF1 , keys = "id")
46		#'
47		#' # We can control matching with scale/location for example:
48		#'
49		#' DF1 <- data.frame(
50		#' id = c(1,2,3,4,5,6),
51		#' v1 = letters[1:6],
52		#' v2 = c(1,2,3,4,5,6)
53		#' )
54		#' DF2 <- data.frame(
55		#' id = c(1,2,3,4,5,6),
56		#' v1 = letters[1:6],
57		#' v2 = c(1.1,2,3,4,5,6)
58		#' )
59		#'
60		#' diffdf(DF1 , DF2 , keys = "id")
61		#' diffdf(DF1 , DF2 , keys = "id", tolerance = 0.2)
62		#' diffdf(DF1 , DF2 , keys = "id", scale = 10, tolerance = 0.2)
63		#'
64		#' # We can use strict_factor to compare factors with characters for example:
65		#'
66		#' DF1 <- data.frame(
67		#' id = c(1,2,3,4,5,6),
68		#' v1 = letters[1:6],
69		#' v2 = c(NA , NA , 1 , 2 , 3 , NA),
70		#' stringsAsFactors = FALSE
71		#' )
72		#'
73		#' DF2 <- data.frame(
74		#' id = c(1,2,3,4,5,6),
75		#' v1 = letters[1:6],
76		#' v2 = c(NA , NA , 1 , 2 , 3 , NA)
77		#' )
78		#'
79		#' diffdf(DF1 , DF2 , keys = "id", strict_factor = TRUE)
80		#' diffdf(DF1 , DF2 , keys = "id", strict_factor = FALSE)
81		#'
82		#' @export
83		diffdf <- function(
84		base,
85		compare,
86		keys = NULL,
87		suppress_warnings = FALSE,
88		strict_numeric = TRUE,
89		strict_factor = TRUE,
90		file = NULL,
91		tolerance = sqrt(.Machine$double.eps),
92		scale = NULL
93		) {
94
95	136x	BASE <- base
96	136x	COMP <- compare
97	136x	KEYS <- keys
98	136x	SUPWARN <- suppress_warnings
99
100
101		### Initatiate output object
102	136x	COMPARE <- list()
103	136x	class(COMPARE) <- c("diffdf", "list")
104
105	136x	is_derived <- FALSE
106
107		### If no key is suplied match values based upon row number
108	136x	if (is.null(KEYS)) {
109	122x	is_derived <- TRUE
110	122x	keyname <- generate_keyname(BASE, COMP)
111	122x	BASE[[keyname]] <- seq_len(nrow(BASE))
112	122x	COMP[[keyname]] <- seq_len(nrow(COMP))
113	122x	KEYS <- keyname
114		}
115	136x	attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived)
116
117
118
119	136x	if (!is.numeric(tolerance)) {
120	2x	stop("'tolerance' should be numeric")
121		}
122
123	134x	if (!is.numeric(scale) && !is.null(scale)) {
124	2x	stop("'scale' should be numeric or NULL")
125		}
126
127
128
129	132x	if (!has_unique_rows(BASE, KEYS)) {
130	1x	stop("BY variables in BASE do not result in unique observations")
131		}
132
133	131x	if (!has_unique_rows(COMP, KEYS)) {
134	!	stop("BY variables in COMPARE do not result in unique observations")
135		}
136
137
138
139		#### Check essential variable properties (class & mode)
140
141	131x	COMPARE[["UnsupportedColsBase"]] <- construct_issue(
142	131x	value = identify_unsupported_cols(BASE),
143	131x	message = "There are columns in BASE with unsupported modes !!"
144		)
145
146
147	131x	COMPARE[["UnsupportedColsComp"]] <- construct_issue(
148	131x	value = identify_unsupported_cols(COMP),
149	131x	message = "There are columns in COMPARE with unsupported modes !!"
150		)
151
152
153		# cast variables if strict is off
154	131x	if (!strict_factor \|\| !strict_numeric) {
155
156	9x	casted_df <- cast_variables(
157	9x	BASE = BASE,
158	9x	COMPARE = COMP,
159	9x	ignore_vars = KEYS,
160	9x	cast_integers = !strict_numeric,
161	9x	cast_factors = !strict_factor
162		)
163
164	9x	BASE <- casted_df$BASE
165	9x	COMP <- casted_df$COMP
166
167		}
168
169
170	131x	COMPARE[["VarModeDiffs"]] <- construct_issue(
171	131x	value = identify_mode_differences(BASE, COMP),
172	131x	message = "There are columns in BASE and COMPARE with different modes !!"
173		)
174
175
176	131x	COMPARE[["VarClassDiffs"]] <- construct_issue(
177	131x	value = identify_class_differences(BASE, COMP),
178	131x	message = "There are columns in BASE and COMPARE with different classes !!"
179		)
180
181
182	131x	exclude_cols <- c(
183	131x	COMPARE[["UnsupportedColsBase"]]$VARIABLE,
184	131x	COMPARE[["UnsupportedColsComp"]]$VARIABLE,
185	131x	COMPARE[["VarClassDiffs"]]$VARIABLE,
186	131x	COMPARE[["VarModeDiffs"]]$VARIABLE
187		)
188
189
190		##### Check Validity of Keys
191
192	131x	BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
193	131x	COMP_keys <- names(COMP)[names(COMP) %in% KEYS]
194
195
196	131x	if (length(BASE_keys) != length(KEYS)) {
197	!	stop("BASE is missing variables specified in KEYS")
198		}
199
200	131x	if (length(COMP_keys) != length(KEYS)) {
201	!	stop("COMP is missing variables specified in KEYS")
202		}
203
204	131x	if (any(KEYS %in% exclude_cols)) {
205	!	stop("KEYS are either an invalid or contain different modes between BASE and COMP")
206		}
207
208
209		##### Check Attributes
210
211
212	131x	COMPARE[["AttribDiffs"]] <- construct_issue(
213	131x	value = identify_att_differences(BASE, COMP, exclude_cols),
214	131x	message = "There are columns in BASE and COMPARE with differing attributes !!"
215		)
216
217
218		##### Check data
219
220	131x	BASE <- factor_to_character(BASE, KEYS)
221	131x	COMP <- factor_to_character(COMP, KEYS)
222
223
224	131x	COMPARE[["ExtRowsBase"]] <- construct_issue(
225	131x	value = identify_extra_rows(BASE, COMP, KEYS),
226	131x	message = "There are rows in BASE that are not in COMPARE !!"
227		)
228
229
230	131x	COMPARE[["ExtRowsComp"]] <- construct_issue(
231	131x	value = identify_extra_rows(COMP, BASE, KEYS),
232	131x	message = "There are rows in COMPARE that are not in BASE !!"
233		)
234
235
236
237	131x	COMPARE[["ExtColsBase"]] <- construct_issue(
238	131x	value = identify_extra_cols(BASE, COMP),
239	131x	message = "There are columns in BASE that are not in COMPARE !!"
240		)
241
242
243	131x	COMPARE[["ExtColsComp"]] <- construct_issue(
244	131x	value = identify_extra_cols(COMP, BASE),
245	131x	message = "There are columns in COMPARE that are not in BASE !!"
246		)
247
248
249	131x	VALUE_DIFFERENCES <- identify_differences(
250	131x	BASE, COMP, KEYS, exclude_cols,
251	131x	tolerance = tolerance,
252	131x	scale = scale
253		)
254
255
256
257		## Summarise the number of mismatching rows per variable
258
259	131x	if (length(VALUE_DIFFERENCES)) {
260	130x	NDIFF <- sapply(VALUE_DIFFERENCES, nrow)
261	130x	COMPARE[["NumDiff"]] <- construct_issue(
262	130x	value = convert_to_issue(NDIFF),
263	130x	message = "Not all Values Compared Equal"
264		)
265		}
266
267
268	131x	for (i in names(VALUE_DIFFERENCES)){
269	993x	COMPARE[[paste0("VarDiff_", i)]] <- construct_issue(
270	993x	value = VALUE_DIFFERENCES[[i]],
271	993x	message = ""
272		)
273		}
274
275		## Get all issue messages, remove blank message, and collapse into single string
276	131x	ISSUE_MSGS <- sapply(COMPARE, function(x) get_issue_message(x))
277	131x	ISSUE_MSGS <- ISSUE_MSGS[ISSUE_MSGS != ""]
278
279	131x	if (length(ISSUE_MSGS) != 0) {
280	97x	if (!SUPWARN) {
281	36x	ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n")
282	36x	warning(c("\n", ISSUE_MSGS))
283		}
284		}
285
286
287	131x	if (!is.null(file)) {
288	!	x <- print(COMPARE, as_string = TRUE)
289
290	!	tryCatch(
291		{
292	!	sink(file)
293	!	cat(x, sep = "\n")
294	!	sink()
295		},
296	!	warning = function(w) {
297	!	sink()
298	!	warning(w)
299		},
300	!	error = function(e) {
301	!	sink()
302	!	stop(e)
303		}
304		)
305	!	return(invisible(COMPARE))
306		}
307
308	131x	return(COMPARE)
309		}
310
311
312
313
314		#' diffdf_has_issues
315		#'
316		#' Utility function which returns TRUE if an diffdf
317		#' object has issues or FALSE if an diffdf object does not have issues
318		#' @param x diffdf object
319		#' @examples
320		#'
321		#' # Example with no issues
322		#' x <- diffdf( iris, iris )
323		#' diffdf_has_issues(x)
324		#'
325		#' # Example with issues
326		#' iris2 <- iris
327		#' iris2[2,2] <- NA
328		#' x <- diffdf( iris , iris2 , suppress_warnings = TRUE)
329		#' diffdf_has_issues(x)
330		#' @export
331		diffdf_has_issues <- function(x) {
332	!	if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object")
333	21x	return(length(x) != 0)
334		}

1
2
3		#' identify_extra_rows
4		#'
5		#' Identifies rows that are in a baseline dataset but not in a comparator dataset
6		#' @param DS1 Baseline dataset (data frame)
7		#' @param DS2 Comparator dataset (data frame)
8		#' @param KEYS List of variables that define a unique row within the datasets (strings)
9		identify_extra_rows <- function(DS1, DS2, KEYS) {
10	262x	DS2[["..FLAG.."]] <- "Y"
11	262x	dat <- merge(
12	262x	subset(DS1, select = KEYS),
13	262x	subset(DS2, select = c(KEYS, "..FLAG..")),
14	262x	by = KEYS, all.x = TRUE,
15	262x	sort = TRUE
16		)
17	262x	dat <- dat[do.call("order", dat[KEYS]), ]
18
19	262x	dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE]
20		}
21
22
23
24		#' identify_extra_cols
25		#'
26		#' Identifies columns that are in a baseline dataset but not in a comparator dataset
27		#' @param DS1 Baseline dataset (data frame)
28		#' @param DS2 Comparator dataset (data frame)
29		#' @importFrom tibble tibble
30		identify_extra_cols <- function(DS1, DS2) {
31	262x	match.cols <- sapply(names(DS1), "%in%", names(DS2))
32	262x	if (!all(is.logical(match.cols))) {
33	!	stop("Assumption of logical return type is not true")
34		}
35	262x	tibble(
36	262x	COLUMNS = names(DS1)[!match.cols]
37		)
38		}
39
40
41
42
43
44
45
46		#' identify_matching_cols
47		#'
48		#' Identifies columns with the same name in two data frames
49		#' @param DS1 Input dataset 1 (data frame)
50		#' @param DS2 Input dataset 2 (data frame)
51		#' @param EXCLUDE Columns to ignore
52		identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") {
53	834x	match_cols <- sapply(names(DS1), "%in%", names(DS2))
54	834x	exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE)
55	834x	names(DS1)[match_cols & !exclude_cols]
56		}
57
58
59
60
61
62		#' identify_unsupported_cols
63		#'
64		#' Identifies any columns for which the package is not setup to handle
65		#' @param dsin input dataset
66		identify_unsupported_cols <- function(dsin) {
67	262x	dat <- subset(
68	262x	identify_properties(dsin),
69	262x	select = c("VARIABLE", "MODE")
70		)
71
72	262x	dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE]
73
74		}
75
76
77
78		#' identify_mode_differences
79		#'
80		#' Identifies any mode differences between two data frames
81		#' @param BASE Base dataset for comparison (data.frame)
82		#' @param COMP Comparator dataset to compare base against (data.frame)
83		identify_mode_differences <- function(BASE, COMP) {
84
85	431x	matching_cols <- identify_matching_cols(BASE, COMP)
86
87	431x	dat <- merge(
88	431x	x = identify_properties(BASE),
89	431x	y = identify_properties(COMP),
90	431x	by = "VARIABLE",
91	431x	all = TRUE,
92	431x	suffixes = c(".BASE", ".COMP"),
93	431x	sort = TRUE
94		)
95	431x	dat <- subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP"))
96
97	431x	KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
98	431x	KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]
99
100	431x	dat[KEEP1 & KEEP2, , drop = FALSE]
101
102		}
103
104
105
106		#' identify_class_differences
107		#'
108		#' Identifies any class differences between two data frames
109		#' @param BASE Base dataset for comparison (data.frame)
110		#' @param COMP Comparator dataset to compare base against (data.frame)
111		identify_class_differences <- function(BASE, COMP) {
112
113	131x	matching_cols <- identify_matching_cols(BASE, COMP)
114
115	131x	dat <- merge(
116	131x	x = identify_properties(BASE),
117	131x	y = identify_properties(COMP),
118	131x	by = "VARIABLE",
119	131x	all = TRUE,
120	131x	sort = TRUE,
121	131x	suffixes = c(".BASE", ".COMP")
122		)
123
124	131x	dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP"))
125
126	131x	KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
127	131x	KEEP2 <- !mapply(
128	131x	identical,
129	131x	dat[["CLASS.BASE"]],
130	131x	dat[["CLASS.COMP"]]
131		)
132
133	131x	dat[KEEP1 & KEEP2, , drop = FALSE]
134
135		}
136
137
138
139		#' Identify differences in attributes
140		#'
141		#' Identifies any attribute differences between two data frames
142		#' @param BASE Base dataset for comparison (data.frame)
143		#' @param COMP Comparator dataset to compare base against (data.frame)
144		#' @param exclude_cols Columns to exclude from comparison
145		#' @importFrom tibble tibble
146		identify_att_differences <- function(BASE, COMP, exclude_cols = "") {
147
148	141x	matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols)
149
150	141x	PROPS <- merge(
151	141x	x = identify_properties(BASE),
152	141x	y = identify_properties(COMP),
153	141x	by = "VARIABLE",
154	141x	all = TRUE,
155	141x	sort = TRUE,
156	141x	suffixes = c(".BASE", ".COMP")
157		)
158
159	141x	PROPS <- subset(PROPS, select = c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP"))
160
161	141x	PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE]
162
163
164		### Setup dummy return value
165	141x	RETURN <- tibble(
166	141x	VARIABLE = character(),
167	141x	ATTR_NAME = character(),
168	141x	VALUES.BASE = list(),
169	141x	VALUES.COMP = list()
170		)
171
172	141x	for (i in PROPS[["VARIABLE"]]) {
173
174	1241x	PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE]
175
176		### Get a vector of all available attributes across both variables
177	1241x	ATTRIB_NAMES <- unique(c(
178	1241x	names(PROPS_filt[["ATTRIBS.BASE"]][[1]]),
179	1241x	names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
180		))
181
182		### If variable has no attributes move onto the next variable
183	908x	if (is.null(ATTRIB_NAMES)) next()
184
185		### Loop over each attribute checking if they are identical and outputing
186		### anyones that arn't
187	333x	for (j in ATTRIB_NAMES){
188
189	574x	ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
190	574x	ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j]
191
192	574x	if (!identical(ATTRIB_BASE, ATTRIB_COMP)) {
193
194	89x	ATT_DIFFS <- tibble(
195	89x	VARIABLE = i,
196	89x	ATTR_NAME = j,
197	89x	VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE),
198	89x	VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP)
199		)
200
201	89x	RETURN <- rbind(RETURN, ATT_DIFFS)
202		}
203		}
204		}
205	141x	return(RETURN)
206		}
207
208
209
210
211
212		#' identify_differences
213		#'
214		#' Compares each column within 2 datasets to identify any values which they
215		#' mismatch on.
216		#' @param BASE Base dataset for comparison (data.frame)
217		#' @param COMP Comparator dataset to compare base against (data.frame)
218		#' @param KEYS List of variables that define a unique row within the datasets (strings)
219		#' @param exclude_cols Columns to exclude from comparison
220		#' @param tolerance Level of tolerance for numeric differences between two variables
221		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
222		identify_differences <- function(
223		BASE,
224		COMP,
225		KEYS,
226		exclude_cols,
227		tolerance = sqrt(.Machine$double.eps),
228		scale = NULL
229		) {
230
231	131x	matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols))
232
233	1x	if (length(matching_cols) == 0) return(tibble())
234
235	130x	DAT <- merge(
236	130x	x = BASE,
237	130x	y = COMP,
238	130x	by = KEYS,
239	130x	suffix = c(".x", ".y"),
240	130x	sort = TRUE
241		)
242	130x	DAT <- DAT[do.call("order", DAT[KEYS]), ]
243
244	130x	matching_list <- mapply(
245	130x	is_variable_different,
246	130x	matching_cols,
247	130x	MoreArgs = list(
248	130x	keynames = KEYS,
249	130x	datain = DAT,
250	130x	tolerance = tolerance,
251	130x	scale = scale
252		),
253	130x	SIMPLIFY = FALSE
254		)
255
256	130x	matching_list
257		}
258
259
260
261
262
263
264
265
266
267		#' identify_properties
268		#'
269		#' Returns a dataframe of metadata for a given dataset.
270		#' Returned values include variable names , class , mode , type & attributes
271		#' @param dsin input dataframe that you want to get the metadata from
272		#' @importFrom tibble tibble
273		identify_properties <- function(dsin) {
274
275		### If missing or null return empty dataset
276	1668x	if (is.null(dsin)) {
277	!	x <- tibble(
278	!	VARIABLE = character(),
279	!	CLASS = list(),
280	!	MODE = character(),
281	!	TYPE = character(),
282	!	ATTRIBS = list()
283		)
284	!	return(x)
285		}
286
287	1668x	tibble(
288	1668x	VARIABLE = names(dsin),
289	1668x	CLASS = lapply(dsin, class),
290	1668x	MODE = sapply(dsin, mode),
291	1668x	TYPE = sapply(dsin, typeof),
292	1668x	ATTRIBS = lapply(dsin, attributes)
293		)
294		}

1		#' construct_issue
2		#'
3		#' Make an s3 object with class issue and possible additional class,
4		#' and assign other arguments to attributes
5		#' @param value the value of the object
6		#' @param message the value of the message attribute
7		#' @param add_class additional class to add
8		construct_issue <- function(value, message, add_class = NULL) {
9	2302x	x <- value
10
11		### If nothing has been provided return nothing !
12	2092x	if (nrow(x) == 0) return(NULL)
13
14	210x	class(x) <- c(add_class, "issue", class(x))
15	210x	attributes(x)[["message"]] <- message
16	210x	return(x)
17		}
18
19
20		#' get_issue_message
21		#'
22		#' Simple function to grab the issue message
23		#' @param object inputted object of class issue
24		#' @param ... other arguments
25		get_issue_message <- function(object, ...) {
26	210x	return(attr(object, "message"))
27		}
28
29
30		#' get_print_message
31		#'
32		#' Get the required text depending on type of issue
33		#' @param object inputted object of class issue
34		#' @param ... other arguments
35		get_print_message <- function(object, ...) {
36	64x	UseMethod("get_print_message", object)
37		}
38
39
40		#' get_print_message.default
41		#'
42		#' Errors, as this should only ever be given an issue
43		#' @param object issue
44		get_print_message.default <- function(object) {
45	!	stop("Error: An issue has not been provided to this function!")
46		}
47
48
49		#' get_print_message.issue
50		#'
51		#' Get text from a basic issue, based on the class of the value of the issue
52		#'
53		#' @param object an object of class issue_basic
54		#' @inheritParams print.diffdf
55		get_print_message.issue <- function(object, row_limit, ...) {
56	64x	paste(
57	64x	c(attr(object, "message"), get_table(object, row_limit = row_limit)),
58	64x	collapse = "\n"
59		)
60		}

1
2		#' Print diffdf objects
3		#'
4		#' Print nicely formatted version of an diffdf object
5		#' @param x comparison object created by diffdf().
6		#' @param ... Additional arguments (not used)
7		#' @param row_limit Max row limit for difference tables (NULL to show all rows)
8		#' @param as_string Return printed message as an R character vector?
9		#' @examples
10		#' x <- subset( iris , -Species )
11		#' x[1,2] <- 5
12		#' COMPARE <- diffdf( iris, x)
13		#' print( COMPARE )
14		#' print( COMPARE, row_limit = 5 )
15		#' @export
16		print.diffdf <- function(x, row_limit = 10, as_string = FALSE, ...) {
17	31x	if (!is.null(row_limit)) {
18	30x	if (length(row_limit) != 1) {
19	1x	stop("row_limit should have a length of 1")
20		}
21	29x	if (!is.numeric(row_limit)) {
22	2x	stop("row_limit should be a numeric value or NULL")
23		}
24	27x	if (row_limit <= 0) {
25	1x	stop("row_limit should be a positive integer")
26		}
27		}
28	27x	if (!is.logical(as_string) \|\| length(as_string) != 1) {
29	2x	stop("as_string should be a logical of length one")
30		}
31	25x	COMPARE <- x
32
33	25x	if (length(COMPARE) == 0) {
34	2x	outtext <- "No issues were found!\n"
35		} else {
36	23x	start_text <- paste0(
37	23x	"Differences found between the objects!\n\n",
38	23x	"A summary is given below.\n\n"
39		)
40	23x	end_text <- lapply(COMPARE, function(x) get_print_message(x, row_limit))
41	23x	end_text <- paste0(unlist(end_text), collapse = "")
42	23x	outtext <- paste0(start_text, end_text)
43		}
44	25x	if (as_string) {
45	25x	return(strsplit(outtext, "\n")[[1]])
46		} else {
47	!	cat(outtext)
48	!	return(invisible(COMPARE))
49		}
50		}

1
2
3
4		#' factor_to_character
5		#'
6		#' Takes a dataframe and converts any factor variables to character
7		#' @param dsin input dataframe
8		#' @param vars variables to consider for conversion. Default NULL will consider
9		#' every variable within the dataset
10		factor_to_character <- function(dsin, vars = NULL) {
11
12	!	if (is.null(vars)) vars <- names(dsin)
13
14	262x	for (var in vars){
15	276x	if (is.factor(dsin[[var]])) {
16	!	dsin[[var]] <- as.character(dsin[[var]])
17		}
18		}
19	262x	return(dsin)
20		}
21
22
23
24
25		#' has_unique_rows
26		#'
27		#' Check if a data sets rows are unique
28		#' @param DAT input data set (data frame)
29		#' @param KEYS Set of keys which should be unique
30		has_unique_rows <- function(DAT, KEYS) {
31	263x	DUPS <- duplicated(subset(DAT, select = KEYS))
32	263x	NDUPS <- sum(DUPS)
33	263x	return(NDUPS == 0)
34		}
35
36		#'convert_to_issue
37		#'
38		#'converts the count value into the correct issue format
39		#'@param datin data inputted
40		#'@importFrom tibble rownames_to_column
41		convert_to_issue <- function(datin) {
42	130x	datin_tibble <- tibble(
43	130x	`Variable` = names(datin),
44	130x	`No of Differences` = datin
45		)
46
47	130x	datin_tibble_reduced <- datin_tibble[datin_tibble[["No of Differences"]] > 0, , drop = FALSE]
48	130x	return(datin_tibble_reduced)
49		}