diffdf coverage - 91.62%

Files
Source



#' identify_extra_rows
#' 
#' Identifies rows that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
identify_extra_rows <- function(DS1, DS2 , KEYS){
    DS2[["..FLAG.."]] <- "Y"
    dat <- merge(
        subset( DS1 , select = KEYS) , 
        subset( DS2 , select = c(KEYS, "..FLAG..")) , 
        by = KEYS, all.x = T,
        sort = TRUE
    )
    dat <- dat[do.call("order", dat[KEYS]), ]
    
    dat[ is.na(dat[["..FLAG.."]]) , KEYS, drop=FALSE]
}



#' identify_extra_cols
#' 
#' Identifies columns that are in a baseline dataset but not in a comparator dataset
#' @param DS1 Baseline dataset (data frame)
#' @param DS2 Comparator dataset (data frame)
#' @importFrom tibble tibble
identify_extra_cols <- function(DS1 , DS2){
    match.cols <- sapply ( names(DS1), "%in%", names(DS2))
    if (  !all(is.logical(match.cols)) ){
        stop("Assumption of logical return type is not true")
    }
    tibble(
        COLUMNS = names(DS1)[ !match.cols]
    )
}







#' identify_matching_cols
#' 
#' Identifies columns with the same name in two data frames
#' @param DS1 Input dataset 1 (data frame)
#' @param DS2 Input dataset 2 (data frame)
#' @param EXCLUDE Columns to ignore
identify_matching_cols <- function(DS1, DS2 , EXCLUDE = ""){
    match_cols   <- sapply( names(DS1), "%in%" , names(DS2))
    exclude_cols <- sapply( names(DS1), "%in%" , EXCLUDE)
    names(DS1)[ match_cols & !exclude_cols ]
}





#' identify_unsupported_cols
#' 
#' Identifies any columns for which the package is not setup to handle
#' @param dsin input dataset
identify_unsupported_cols <- function(dsin){
    dat <- subset(
        identify_properties(dsin) ,
        select = c("VARIABLE", "MODE")
    ) 
    
    dat[ !dat[["MODE"]] %in% c('numeric', 'character', 'logical') ,, drop=FALSE]

}



#' identify_mode_differences
#' 
#' Identifies any mode differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
identify_mode_differences <- function( BASE, COMP ){

    matching_cols <- identify_matching_cols( BASE , COMP  )
    
    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE,
        suffixes = c(".BASE", ".COMP"),
        sort = TRUE
    ) 
    dat <-  subset( dat, select = c("VARIABLE" , "MODE.BASE" , "MODE.COMP"))
    
    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]
    
    dat[ KEEP1 & KEEP2 ,, drop=FALSE]

}



#' identify_class_differences
#' 
#' Identifies any class differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
identify_class_differences <- function( BASE, COMP ){
    
    matching_cols <- identify_matching_cols( BASE , COMP )
    
    dat <- merge(
        x = identify_properties(BASE),
        y = identify_properties(COMP),
        by = "VARIABLE",
        all = TRUE, 
        sort = TRUE,
        suffixes =  c(".BASE", ".COMP")
    ) 
    
    dat <- subset( dat , select = c("VARIABLE" , "CLASS.BASE" , "CLASS.COMP")) 
    
    KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
    KEEP2 <- !mapply( 
        identical,
        dat[["CLASS.BASE"]] , 
        dat[["CLASS.COMP"]] 
    )
    
    dat[ KEEP1 & KEEP2 ,, drop=FALSE] 

}



#' identify_att_differences
#' 
#' Identifies any attribute differences between two data frames
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param exclude_cols Columns to exclude from comparison
#' @importFrom tibble tibble
identify_att_differences <- function( BASE, COMP , exclude_cols = "" ){
    
    matching_cols <- identify_matching_cols( BASE , COMP , exclude_cols )
    
    PROPS <- merge(
        x = identify_properties(BASE) ,
        y = identify_properties(COMP) , 
        by = "VARIABLE",  
        all = TRUE,
        sort = TRUE,
        suffixes = c(".BASE", ".COMP")
    )
    
    PROPS <- subset( PROPS , select =  c("VARIABLE", "ATTRIBS.BASE" , "ATTRIBS.COMP")) 
    
    PROPS <- PROPS[ PROPS[["VARIABLE"]] %in% matching_cols,, drop = FALSE]
 
    
    ### Setup dummy return value
    RETURN <- tibble(
        VARIABLE = character(),
        ATTR_NAME = character(),
        VALUES.BASE = list(),
        VALUES.COMP = list()
    )
    
    for ( i in  PROPS[["VARIABLE"]] ){
        
        PROPS_filt <- PROPS[ PROPS[["VARIABLE"]] == i ,, drop=FALSE]

        ### Get a vector of all available attributes across both variables
        ATTRIB_NAMES = unique(c( 
            names(PROPS_filt[["ATTRIBS.BASE"]][[1]]) , 
            names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
        ))
        
        ### If variable has no attributes move onto the next variable
        if ( is.null(ATTRIB_NAMES) ) next()
        
        ### Loop over each attribute checking if they are identical and outputing
        ### anyones that arn't
        for ( j in ATTRIB_NAMES){
            
            ATTRIB_BASE = PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
            ATTRIB_COMP = PROPS_filt[["ATTRIBS.COMP"]][[1]][j]
            
            if ( !identical(ATTRIB_BASE , ATTRIB_COMP) ){
                
                ATT_DIFFS <- tibble(
                    VARIABLE = i , 
                    ATTR_NAME = j , 
                    VALUES.BASE = ifelse( is.null(ATTRIB_BASE) , list() , ATTRIB_BASE),  
                    VALUES.COMP = ifelse( is.null(ATTRIB_COMP) , list() , ATTRIB_COMP)
                ) 
                
                RETURN <- rbind(RETURN , ATT_DIFFS)
            }
        }
    }
    return(RETURN)
}





#' identify_differences
#' 
#' Compares each column within 2 datasets to identify any values which they 
#' mismatch on.
#' @param BASE Base dataset for comparison (data.frame)
#' @param COMP Comparator dataset to compare base against (data.frame)
#' @param KEYS List of variables that define a unique row within the datasets (strings)
#' @param exclude_cols Columns to exclude from comparison
#' @param tolerance Level of tolerance for numeric differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
identify_differences <- function( BASE , COMP , KEYS, exclude_cols,  
                                  tolerance = sqrt(.Machine$double.eps),
                                  scale = NULL ) {
    
    matching_cols <- identify_matching_cols( BASE , COMP , c(KEYS, exclude_cols))
    
    if( length(matching_cols) == 0  ) return ( tibble() )
  
    DAT = merge( 
        x = BASE , 
        y = COMP , 
        by = KEYS , 
        suffix = c(".x", ".y"),
        sort = TRUE
    )
    DAT <- DAT[do.call("order", DAT[KEYS]), ]
    

    matching_list <- mapply(
        is_variable_different , 
        matching_cols,
        MoreArgs = list(
            keynames = KEYS, 
            datain = DAT, 
            tolerance = tolerance ,
            scale = scale
        ),
        SIMPLIFY = FALSE
    )
    
    matching_list
}









#' identify_properties
#' 
#' Returns a dataframe of metadata for a given dataset.
#' Returned values include variable names , class , mode , type & attributes
#' @param dsin input dataframe that you want to get the metadata from
#' @importFrom tibble tibble
identify_properties <- function(dsin){
    
    ### If missing or null return empty dataset
    if( is.null(dsin) ) {
        x <- tibble(
            VARIABLE = character(),
            CLASS     = list(),
            MODE      = character(),
            TYPE      = character() ,
            ATTRIBS   = list()
        )
        return(x)
    }
    
    tibble(
        VARIABLE = names(dsin),
        CLASS     = lapply(dsin, class),
        MODE      = sapply(dsin , mode),
        TYPE      = sapply(dsin , typeof) ,
        ATTRIBS   = lapply( dsin , attributes)
    )
}







#' diffdf
#' @description  
#' Compares 2 dataframes and outputs any differences.
#' @param base input dataframe
#' @param compare comparison dataframe
#' @param keys vector of variables (as strings) that defines a unique row in the base and compare dataframes
#' @param strict_numeric Flag for strict numeric to numeric comparisons (default = TRUE). If False diffdf will cast integer to double where required for comparisons. Note that variables specified in the keys will never be casted.
#' @param strict_factor Flag for strict factor to character comparisons (default = TRUE). If False diffdf will cast factors to characters where required for comparisons. Note that variables specified in the keys will never be casted.
#' @param suppress_warnings Do you want to suppress warnings? (logical)
#' @param file Location and name of a text file to output the results to. Setting to NULL will cause no file to be produced.
#' @param tolerance Set tolerance for numeric comparisons. Note that comparisons fail if (x-y)/scale > tolerance.
#' @param scale Set scale for numeric comparisons. Note that comparisons fail if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient version of scale = 1. 
#' @examples
#' x <- subset( iris,  -Species)
#' x[1,2] <- 5
#' COMPARE <- diffdf( iris, x)
#' print( COMPARE )
#' print( COMPARE , "Sepal.Length" )
#' 
#' #### Sample data frames
#' 
#' DF1 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(NA , NA , 1 , 2 , 3 , NA)
#' )
#' 
#' DF2 <- data.frame(
#'     id = c(1,2,3,4,5,7),
#'     v1 = letters[1:6],
#'     v2 = c(NA , NA , 1 , 2 , NA , NA),
#'     v3 = c(NA , NA , 1 , 2 , NA , 4)
#' )
#' 
#' diffdf(DF1 , DF1 , keys = "id")
#' 
#' # We can control matching with scale/location for example:
#' 
#' DF1 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(1,2,3,4,5,6)
#' )
#' DF2 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(1.1,2,3,4,5,6)
#' )
#' 
#' diffdf(DF1 , DF2 , keys = "id")
#' diffdf(DF1 , DF2 , keys = "id", tolerance = 0.2)
#' diffdf(DF1 , DF2 , keys = "id", scale = 10, tolerance = 0.2)
#'  
#' # We can use strict_factor to compare factors with characters for example:
#' 
#' DF1 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(NA , NA , 1 , 2 , 3 , NA), 
#'     stringsAsFactors = FALSE
#' )
#' 
#' DF2 <- data.frame(
#'     id = c(1,2,3,4,5,6),
#'     v1 = letters[1:6],
#'     v2 = c(NA , NA , 1 , 2 , 3 , NA)
#' )
#' 
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = TRUE)
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = FALSE)
#'  
#' @export
diffdf <- function (
    base , 
    compare , 
    keys = NULL, 
    suppress_warnings = FALSE, 
    strict_numeric = TRUE,
    strict_factor = TRUE,
    file = NULL,
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
){
    
    BASE = base
    COMP = compare
    KEYS = keys
    SUPWARN = suppress_warnings
    
    
    ### Initatiate output object
    COMPARE <- list()
    class(COMPARE) <- c("diffdf" , "list") 
    
    is_derived <- FALSE
    
    ### If no key is suplied match values based upon row number
    if (is.null(KEYS)){
        is_derived <- TRUE
        keyname <- generate_keyname(BASE, COMP)
        BASE[[keyname]] <-  1:nrow(BASE) 
        COMP[[keyname]] <-  1:nrow(COMP) 
        KEYS  <- keyname
    }
    attr(COMPARE, 'keys') <- list(value = KEYS, is_derived = is_derived)
    
    
    
    if (!is.numeric(tolerance)) {
        stop("'tolerance' should be numeric")
    }
    
    if (!is.numeric(scale) && !is.null(scale)) {
        stop("'scale' should be numeric or NULL")
    }
    

    
    if ( !has_unique_rows(BASE , KEYS) ){
        stop( "BY variables in BASE do not result in unique observations")
    }
    
    if ( !has_unique_rows(COMP , KEYS) ){
        stop( "BY variables in COMPARE do not result in unique observations")
    }
    
    

    #### Check essential variable properties (class & mode)
    
    COMPARE[["UnsupportedColsBase"]] <- construct_issue(
        value = identify_unsupported_cols(BASE) , 
        message  = "There are columns in BASE with unsupported modes !!" 
    )
    
    
    COMPARE[["UnsupportedColsComp"]] <- construct_issue(
        value = identify_unsupported_cols(COMP) , 
        message  = "There are columns in COMPARE with unsupported modes !!" 
    )

    
    # cast variables if strict is off
    if ( !strict_factor | !strict_numeric ){
        
        casted_df <- cast_variables( 
            BASE = BASE, 
            COMPARE = COMP, 
            ignore_vars = KEYS, 
            cast_integers = !strict_numeric , 
            cast_factors = !strict_factor
        )
        
        BASE <- casted_df$BASE
        COMP <- casted_df$COMP
        
    }
    
    
    COMPARE[["VarModeDiffs"]] <- construct_issue(
        value = identify_mode_differences( BASE, COMP ) ,
        message = "There are columns in BASE and COMPARE with different modes !!"
    )
    
    
    COMPARE[["VarClassDiffs"]] <- construct_issue(
        value = identify_class_differences(BASE, COMP) ,
        message = "There are columns in BASE and COMPARE with different classes !!"
    )
    

    exclude_cols <- c(
        COMPARE[["UnsupportedColsBase"]]$VARIABLE , 
        COMPARE[["UnsupportedColsComp"]]$VARIABLE,
        COMPARE[["VarClassDiffs"]]$VARIABLE,
        COMPARE[["VarModeDiffs"]]$VARIABLE
    )
    

    ##### Check Validity of Keys
    
    BASE_keys <- names(BASE)[names(BASE) %in% KEYS] 
    COMP_keys <- names(COMP)[names(COMP) %in% KEYS] 
    
    
    if ( length(BASE_keys) != length(KEYS)  ){
        stop( "BASE is missing variables specified in KEYS")
    }
    
    if ( length(COMP_keys) != length(KEYS) ){
        stop( "COMP is missing variables specified in KEYS")
    }
    
    if( any(KEYS %in% exclude_cols)){
        stop("KEYS are either an invalid or contain different modes between BASE and COMP")
    }
    

    ##### Check Attributes

    
    COMPARE[["AttribDiffs"]] <- construct_issue(
        value = identify_att_differences(BASE,  COMP ,  exclude_cols)  ,
        message = "There are columns in BASE and COMPARE with differing attributes !!"
    )
    
    
    ##### Check data
    
    BASE <- factor_to_character(BASE , KEYS)
    COMP <- factor_to_character(COMP , KEYS)
    
    
    COMPARE[["ExtRowsBase"]] <- construct_issue(
        value = identify_extra_rows(  BASE, COMP,   KEYS )   ,
        message = "There are rows in BASE that are not in COMPARE !!"
    )
    
    
    COMPARE[["ExtRowsComp"]] <- construct_issue(
        value = identify_extra_rows(  COMP, BASE,   KEYS )   ,
        message = "There are rows in COMPARE that are not in BASE !!"
    )
    
    
    
    COMPARE[["ExtColsBase"]] <- construct_issue(
        value =  identify_extra_cols(BASE,  COMP)   ,
        message = "There are columns in BASE that are not in COMPARE !!"
    )
    
    
    COMPARE[["ExtColsComp"]] <- construct_issue(
        value =  identify_extra_cols(COMP,  BASE)   ,
        message = "There are columns in COMPARE that are not in BASE !!"
    )
    
    
    VALUE_DIFFERENCES <- identify_differences(
        BASE, COMP , KEYS, exclude_cols,
        tolerance = tolerance,
        scale = scale
    )

    
    
    ## Summarise the number of mismatching rows per variable

    if ( length(VALUE_DIFFERENCES) ){
        NDIFF  <- sapply( VALUE_DIFFERENCES , nrow )
        COMPARE[["NumDiff"]] <- construct_issue(
            value = convert_to_issue(NDIFF),
            message = "Not all Values Compared Equal"
        )
    }


    for ( i in names(VALUE_DIFFERENCES) ){
        COMPARE[[ paste0( "VarDiff_", i)]] <- construct_issue(
            value = VALUE_DIFFERENCES[[i]] ,
            message = ""
        )
    }
    
    ## Get all issue messages, remove blank message, and collapse into single string
    ISSUE_MSGS <- sapply(COMPARE, function(x) get_issue_message(x))
    ISSUE_MSGS <- ISSUE_MSGS[ ISSUE_MSGS != ""]
    
    if( length(ISSUE_MSGS) != 0 ){
        if(!SUPWARN) {
            ISSUE_MSGS <- paste(ISSUE_MSGS, collapse ='\n' )
            warning( c("\n" , ISSUE_MSGS))
        }
    } 
    
    
    if (!is.null(file)){
        x <- print(COMPARE , as_string = TRUE)
        
        tryCatch(
            {
                sink(file)
                cat(x, sep = "\n")
                sink()
            },
            warning = function(w){
                sink() 
                warning(w)
            },
            error = function(e){
                sink()
                stop(e)
            }
        )
        return(invisible(COMPARE))
        
    }
    
    return(COMPARE)
}




#' diffdf_has_issues
#' 
#' Utility function which returns TRUE if an diffdf
#' object has issues or FALSE if an diffdf object does not have issues
#' @param x diffdf object
#' @examples
#' 
#' # Example with no issues
#' x <- diffdf( iris, iris )
#' diffdf_has_issues(x)
#' 
#' # Example with issues
#' iris2 <- iris
#' iris2[2,2] <- NA
#' x <- diffdf( iris , iris2 , suppress_warnings = TRUE)
#' diffdf_has_issues(x)
#' @export
diffdf_has_issues <- function(x){
    if (  class(x)[[1]] != "diffdf" )  stop( "x is not an diffdf object")
    return( length(x) != 0 ) 
}







#' diffdf_issuerows
#' 
#' This function takes a diffdf object and a dataframe and subsets
#' the dataframe for problem rows as identified in the comparison object.
#' If \code{vars} has been specified only issue rows associated with those
#' variable(s) will be returned. 
#' @param df dataframe to be subsetted
#' @param diff diffdf object
#' @param vars (optional) character vector containing names of issue variables to subset dataframe 
#' on. A value of NULL (default) will be taken to mean available issue variables.
#' @examples 
#' iris2 <- iris
#' for ( i in 1:3) iris2[i,i] <- 99
#' x <- diffdf( iris , iris2, suppress_warnings = TRUE)
#' diffdf_issuerows( iris , x)
#' diffdf_issuerows( iris2 , x)
#' diffdf_issuerows( iris2 , x , vars = "Sepal.Length")
#' diffdf_issuerows( iris2 , x , vars = c("Sepal.Length" , "Sepal.Width"))
#' @details 
#' Note that diffdf_issuerows can be used to subset against any dataframe. The only
#' requirement is that the original variables specified in the keys argument to diffdf
#' are present on the dataframe you are subsetting against. However please note that if 
#' no keys were specified in diffdf then the row number is used. This means using 
#' diffdf_issuerows without a keys against an arbitrary dataset can easily result in 
#' nonsense rows being returned. It is always recommended to supply keys to diffdf.
#' @export
diffdf_issuerows <- function( df , diff, vars = NULL){    

    if ( class(diff)[[1]] != "diffdf") {
        stop("diff should be an diffdf object")
    }
    
    KEYS_ATT = attr(diff, "keys")
    
    if( is.null(KEYS_ATT) ) {
        stop("diff is missing the keys attribute")
    }
    
    issue_vars <- names(diff)[grep( "^VarDiff_", names(diff))]
    
    if ( is.null(vars)){
        vars <- issue_vars
    }  else {
        vars <- paste0("VarDiff_" , vars)
    }
    
    if ( length(issue_vars) == 0 | sum( vars %in% issue_vars) == 0){
        return(df[FALSE,])
    }
    
    KEEP <- mapply(
        FUN      = get_issue_dataset, 
        issue    = vars, 
        diff     = list(diff) , 
        SIMPLIFY = F
    )
    
    KEEP <- recursive_reduce( KEEP , rbind)
    KEEP <- KEEP[!duplicated(KEEP),]
    
    if ( KEYS_ATT$is_derived ){
        df[[KEYS_ATT$value]] <- 1:nrow(df)
    }

    keys <- KEYS_ATT$value
    
    if ( any( ! keys %in% names(df))){
        stop("df does not contain all variables specified as keys in diff")
    }
    
    RET <- merge( 
        x = df,
        y = KEEP,
        sort = TRUE
    )  
    
    RET <- RET[do.call("order", RET[keys]), ]
    
    if ( KEYS_ATT$is_derived ){
        keep_vars <- !names(RET) %in% KEYS_ATT$value
        RET <- RET[, keep_vars , drop = FALSE]
    }
    
    return(RET)
}




#' get_issue_dataset
#' 
#' Internal function used by diffdf_issuerows to extract the dataframe
#' from each a target issue. In particular it also strips off any 
#' non-key variables
#' @param issue name of issue to extract the dataset from diff 
#' @param diff diffdf object which contains issues
get_issue_dataset <- function(issue, diff){
    issue_df <- diff[[issue]]
    keep <- names(issue_df)[ !(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
    issue_df[,keep , drop=FALSE]
}



#' string_pad
#'  
#' Utility function used to replicate str_pad. Adds white space to either end 
#' of a string to get it to equal the desired length
#' @param x string
#' @param width desired length 
string_pad <- function(x , width){
    if ( nchar(x) >= width) return(x)
    width <- width - nchar(x)
    left <- paste0( rep ( " " ,  floor( width/2) )  , collapse= "") 
    right <- paste0( rep ( " " ,  ceiling( width/2) )  , collapse= "") 
    paste0(  left , x ,right , collapse = "")
}


#' recursive_reduce
#'  
#' Utility function used to replicated purrr::reduce. Recursively applies a 
#' function to a list of elements until only 1 element remains
#' @param .l list of values to apply a function to
#' @param .f function to apply to each each element of the list in turn
#' i.e. .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]])
recursive_reduce <- function(.l , .f){
    if (length(.l) != 1){
        .l[[2]] <- .f( .l[[1]] , .l[[2]])  
        return( recursive_reduce( .l[-1] , .f))
    } else {
        return(.l[[1]])
    }
}

#' invert
#'
#' Utility function used to replicated purrr::transpose. Turns a list inside
#' out. 
#' @param x list 
invert <- function(x){
    x2 <- list() 
    cnames <- names(x)
    tnames <- names(x[[1]])
    for ( i in tnames ){
        x2[[i]] <- list()
        for (j in cnames){
            x2[[i]][[j]] <- x[[j]][[i]]
        }
    }
    return(x2)
}




#' as_ascii_table
#' 
#' This function takes a data.frame and attempts to convert it into
#' a simple ascii format suitable for printing to the screen
#' It is assumed all variable values have a as.character() method
#' in order to cast them to character. 
#' @param dat Input dataset to convert into a ascii table
#' @param line_prefix Symbols to prefix infront of every line of the table
as_ascii_table <- function(dat, line_prefix = "  "){
 
    
    ## Convert every value to character and crop to a suitable length
    dat  <- as_tibble(apply(dat, c(1, 2), as_cropped_char))
    
    hold <- list()
    COLS <- colnames(dat)
    
    ### For each column extract core elements (width, values , title) and pad out
    ### each string to be a suitable length
    for ( i in 1:ncol(dat)){
        COL <- COLS[i]
        VALUES <- dat[[i]]
        
        JOINT <- c(COL , VALUES)
        WIDTH <- max( sapply(JOINT, nchar)) + 2
        
        hold[[COL]] <- list() 
        hold[[COL]]$WIDTH <- WIDTH 
        hold[[COL]]$VALUES <- sapply( VALUES ,string_pad,  width = WIDTH )  
        hold[[COL]]$HEADER <- sapply( COL ,string_pad,  width = WIDTH )
    }
    
    ### Collapse into a single value per component ( title , values, width )
    thold  <- invert(hold)
    tvals  <- recursive_reduce( thold$VALUES , paste0 ) 
    thead  <- recursive_reduce( thold$HEADER , paste0)
    twidth <- recursive_reduce( thold$WIDTH , sum)
    
    ### Create header and footer lines
    TLINE <- paste0(rep("=" , twidth), collapse = "")
    LINE  <- paste0(rep("-" , twidth), collapse = "")
    FVALS <- paste0(line_prefix, tvals , collapse = "\n")
    
    ### Output table
    paste0( 
        "\n",
        line_prefix, TLINE, "\n",
        line_prefix, thead, "\n",
        line_prefix, LINE,  "\n",
        FVALS, "\n",
        line_prefix, LINE
    )     
}








#' as_cropped_char
#'
#' Makes any character string above x chars
#' Reduce down to a x char string with ...
#' @param inval a single element value
#' @param crop_at character limit
as_cropped_char <- function(inval, crop_at = 30 ){
    
    if ( is.null(inval) ){
        
        inval <- "<NULL>" 
        
    } else if ( is.na(inval)){
        
        inval <- "<NA>"
        
    } else {
        
        inval <- as.character(inval)
        
    }
    
    charlength <- sapply(inval, nchar)
    
    if (charlength > crop_at ){
        
        outval <- substr(inval, 1, crop_at )
        outval <- paste0(outval, '...')
        
    } else {
        
        outval <- inval
        
    }
    
    outval
}



#' get_table
#' 
#' Generate nice looking table from a data frame
#' @param dsin dataset 
#' @param row_limit Maximum number of rows displayed in dataset
get_table <- function(dsin , row_limit = 10){
    
    if( nrow(dsin) == 0 ) {
        return("")
    }
    
    display_table <- subset(dsin ,  1:nrow(dsin) < (row_limit + 1) )
    
    if ( nrow(dsin) > row_limit ){
        
        add_message <- paste0(
            'First ',
            row_limit, 
            " of " ,
            nrow(dsin),
            ' rows are shown in table below'
        )
        
    } else {
        add_message <- 'All rows are shown in table below'
    } 
    
    msg <- paste(
        c(
            add_message,
            as_ascii_table(display_table),
            '\n'
        ),
        collapse = '\n'
    )
    
    return(msg)
}




#' Print diffdf objects
#' 
#' Print nicely formatted version of an diffdf object
#' @param x comparison object created by diffdf().
#' @param ... Additional arguments (not used)
#' @param as_string Return printed message as an R character vector? 
#' @examples
#' x <- subset( iris , -Species )
#' x[1,2] <- 5
#' COMPARE <- diffdf( iris, x)
#' print( COMPARE )
#' print( COMPARE , "Sepal.Length" )
#' @export 
print.diffdf <- function(x, ..., as_string = FALSE){
    COMPARE <- x

    if ( length(COMPARE) == 0 ){
        outtext <- "No issues were found!\n"
        
    } else {
        
        start_text <- paste0(
            'Differences found between the objects!\n\n',
            'A summary is given below.\n\n'
        )

        end_text <- lapply(COMPARE, function(x) get_print_message(x) ) 
        end_text <- paste0(unlist(end_text), collapse = "")

        outtext <- paste0(start_text, end_text)
    }
    
    if ( as_string){
        return(strsplit(outtext, '\n')[[1]])
    } else {
        cat(outtext)
        return(invisible(COMPARE))
    }
}




#' construct_issue
#' 
#' Make an s3 object with class issue and possible additional class,
#' and assign other arguments to attributes
#' @param value the value of the object
#' @param message the value of the message attribute
#' @param add_class additional class to add
construct_issue <- function(value, message, add_class = NULL){
    x <- value
    
    ### If nothing has been provided return nothing !
    if ( nrow(x) == 0 ) return(NULL)
    
    class(x) <- c(add_class, "issue", class(x))
    attributes(x)[["message"]] <- message
    return(x)
}


#' get_issue_message
#' 
#' Simple function to grab the issue message
#' @param object inputted object of class issue
#' @param ... other arguments
get_issue_message <- function (object, ...) {
    return( attr(object,"message"))
}


#' get_print_message
#' 
#' Get the required text depending on type of issue
#' @param object inputted object of class issue
#' @param ... other arguments
get_print_message <- function (object, ...) {
    UseMethod("get_print_message", object)
}


#' get_print_message.default
#' 
#' Errors, as this should only ever be given an issue
#' @param object issue
get_print_message.default <- function(object) {
    stop("Error: An issue has not been provided to this function!")
}


#' get_print_message.issue
#'
#' Get text from a basic issue, based on the class of the value of the issue
#'
#' @param object an object of class issue_basic
get_print_message.issue <- function(object){
    paste(
        c(attr(object, "message"),  get_table(object) ),
        collapse = '\n'
    )
}











#' is_variable_different 
#' 
#' This subsets the data set on the variable name, picks out differences and returns a tibble
#' of differences for the given variable
#' @importFrom tibble as_tibble
#' @param variablename name of variable being compared
#' @param keynames name of keys
#' @param datain Inputted dataset with base and compare vectors
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
#' @return A boolean vector which is T if target and current are different
is_variable_different <- function (variablename, keynames, datain, ...) {

    xvar <- paste0(variablename,'.x')
    yvar <- paste0(variablename,'.y')
    
    if ( ! xvar %in% names(datain) | ! yvar %in% names(datain)){
        stop("Variable does not exist within input dataset")
    }
    
    target  <- datain[[xvar]]
    current <- datain[[yvar]]
    outvect <- find_difference(target, current, ...)
    
    datain[["VARIABLE"]] <- variablename
    
    names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")
    
    x <- as_tibble(
        subset(
            datain, 
            outvect, 
            select = c("VARIABLE", keynames, "BASE", "COMPARE")
        )
    )
    
    return(x)
}

#' compare_vectors
#' 
#' Compare two vectors looking for differences
#' 
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
compare_vectors <- function (target, current, ...) {
    UseMethod("compare_vectors")
}


#' find_difference
#' 
#' This determines if two vectors are different. It expects vectors of the same
#' length and type, and is intended to be used after checks have already been done
#' Initially picks out any nas (matching nas count as a match)
#' Then compares remaining vector
#' 
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
find_difference <- function (target, current, ...) {
    
    if( length(target) != length(current)){
        warning("Inputs are not of the same length")
        return(NULL)
    }
    
    if( is.null(target) | is.null(current) ){
        return( is.null(target) != is.null(current) )
    } 
    
    ### Initalise output, assume problem unless evidence otherwise
    return_vector <- rep(TRUE, length(target))
    
    nas_t <- is.na(target) 
    nas_c <- is.na(current)
    
    ## compare missing values
    nacompare <- nas_t != nas_c
    naselect <- nas_t|nas_c
    return_vector[naselect]  <- nacompare[naselect]
    
    ## compare non-missing values
    selectvector <- as.logical( (!nas_t) * (!nas_c) )
    
    comparevect <- compare_vectors( 
        target[selectvector] ,
        current[selectvector], 
        ...
    )
    
    return_vector[selectvector] <- comparevect
    
    return(return_vector)
}







#' compare_vectors.default
#' 
#' Default method, if the vector is not numeric or factor. Basic comparison
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
compare_vectors.default <- function(target, current, ...){
    target != current 
}




#' compare_vectors.factor
#' 
#' Compares factors. Sets them as character and then compares
#' @param target the base vector
#' @param current a vector to compare target to
#' @param ...  Additional arguments which might be passed through (numerical accuracy)
compare_vectors.factor <- function(target, current, ...){
    as.character(target) != as.character(current) 
}





#' compare_vectors.numeric
#' 
#' This is a modified version of the all.equal function
#' which returns a vector rather than a message
#' @param target the base vector
#' @param current a vector to compare target to
#' @param tolerance Level of tolerance for differences between two variables
#' @param scale Scale that tolerance should be set on. If NULL assume absolute
compare_vectors.numeric <- function(
    target, 
    current, 
    tolerance = sqrt(.Machine$double.eps),
    scale = NULL
){

    out <- target == current
    
    if (all(out)) {
        return(!out)
    }
    
    if (is.integer(target) || is.integer(current)){ 
        target <- as.double(target)
        current <- as.double(current)
    }
    
    xy <- abs(target - current)
    
    if (!is.null(scale)) {
        xy <- xy/scale
    }
    
    return(xy > tolerance)

}





#'generate_keyname
#'
#'Function to generate a name for the keys if not provided
#'
#'@param BASE base dataset
#'@param COMP comparison dataset
#'@param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity
#'
generate_keyname <- function(
    BASE, 
    COMP, 
    replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
){
    
    if ( class(replace_names) != "character"){
        stop( "replace_names is not a character vector")
    }
    
    if (length(replace_names) == 0) {
        stop("All default row names are in use in BASE/COMPARE. Please provide a KEY argument")
    }
    
    key_name <- replace_names[1]
    
    if (!is.null(BASE[[key_name]]) | !is.null( COMP[[key_name]])){
        key_name <- generate_keyname(BASE, COMP, replace_names[-1])
    }
    
    return(key_name)
    
}


#' sort_then_join
#' 
#' Convenience function to sort two strings and paste them together
#' @param string1 first string
#' @param string2 second string
sort_then_join <- function(string1, string2){
    paste0(sort(c(string1, string2)), collapse = '')
}


#' class_merge
#' 
#' Convenience function to put all classes an object has into one string
#' @param x an object
class_merge <- function(x){
    paste(class(x), collapse = "_")
}


get_message <- function( colname , whichdat, totype){
    message(paste0(
        "NOTE: Variable " , colname, " in " , tolower(whichdat) , " was casted to " , totype
    ))
}


#' get_casted_vector
#' 
#' casts a vector depending on its type and input
#' @param colin column to cast
#' @param colname name of vector
#' @param whichdat whether base or compare is being casted (used for messages)
get_casted_vector <- function(colin, colname,  whichdat){
    
    if ( class(colin) == "factor"){
        get_message( colname , whichdat , "character")
        return(as.character(colin))
    }
    
    if ( class(colin) == "integer"){
        get_message( colname , whichdat , "numeric")
        return(as.numeric(colin))
    }
    
    colin
}



#' get_casted_dataset
#' 
#' Internal utility function to loop across a dataset casting all target 
#' variables 
#' @param df dataset to be casted
#' @param columns columns to be casted
#' @param whichdat whether base or compare is being casted (used for messages)
get_casted_dataset <- function(df , columns , whichdat){
    for ( col in  columns ){
        df[[col]] <- get_casted_vector( df[[col]]  , col , whichdat )
    }
    return(df)
}






#' cast_variables
#' 
#' Function to cast datasets columns if they have differing types
#' Restricted to specific cases, currently integer and double, and character and factor
#' @param BASE base dataset
#' @param COMPARE comparison dataset
#' @param ignore_vars Variables not to be considered for casting
#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles
#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters
cast_variables <- function(
    BASE, 
    COMPARE, 
    ignore_vars = NULL, 
    cast_integers = FALSE , 
    cast_factors = FALSE 
){

    allowed_class_casts <- c( "integernumeric" , "characterfactor")[c(cast_integers, cast_factors)]
    
    BASE_class <- data.frame(
        class_BASE = sapply(BASE, class_merge), 
        colname = names(BASE),
        stringsAsFactors = FALSE
    )
    BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars,, drop=FALSE] 
    
    
    COMPARE_class <- data.frame(
        class_COMPARE = sapply(COMPARE, class_merge), 
        colname = names(COMPARE),
        stringsAsFactors = FALSE
    )
    COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars ,, drop=FALSE] 
    
    common_class <- merge(
        x = BASE_class, 
        y = COMPARE_class, 
        by = "colname"
    )
    
    
    diff_class <- common_class[ common_class[["class_BASE"]] != common_class[["class_COMPARE"]] ,,drop=FALSE]

    
    diff_class$classmerge <- mapply(
        sort_then_join, 
        diff_class$class_COMPARE, 
        diff_class$class_BASE
    )
    
    
    cast_columns <- diff_class[  diff_class[["classmerge"]] %in% allowed_class_casts ,,drop=FALSE] 
    
    
    DATASETS <- list(
        "BASE" = BASE,
        "COMPARE" = COMPARE
    )
    

    if( nrow(cast_columns) == 0 ){
        return( DATASETS )
    }
    
    
    for ( i in names(DATASETS)){
        DATASETS[[i]] <- get_casted_dataset( DATASETS[[i]] , cast_columns$colname , i)
    }
        
    
    return(DATASETS)
}




































#' factor_to_character
#' 
#' Takes a dataframe and converts any factor variables to character
#' @param dsin input dataframe 
#' @param vars variables to consider for conversion. Default NULL will consider 
#' every variable within the dataset
factor_to_character <- function( dsin , vars = NULL){
    
    if ( is.null(vars) ) vars = names(dsin)
    
    for (var in vars){
        if(  is.factor(dsin[[var]])){
            dsin[[var]] <- as.character(dsin[[var]])
        }
    }
    return(dsin)
}




#' has_unique_rows
#' 
#' Check if a data sets rows are unique
#' @param DAT input data set (data frame)
#' @param KEYS Set of keys which should be unique
has_unique_rows <- function(DAT , KEYS){
    DUPS <- duplicated( subset(DAT , select= KEYS) ) 
    NDUPS <- sum( DUPS)
    return( NDUPS == 0 )
}

#'convert_to_issue
#'
#'converts the count value into the correct issue format
#'@param datin data inputted
#'@importFrom tibble rownames_to_column
convert_to_issue <- function(datin){
    datin_tibble <- tibble(
        `Variable` = names(datin),
        `No of Differences` = datin
    )
    
    datin_tibble_reduced <- datin_tibble[ datin_tibble[["No of Differences"]] > 0, , drop = FALSE]
    return(datin_tibble_reduced)
}


1
2
3
4
5
6		#' diffdf_issuerows
7		#'
8		#' This function takes a diffdf object and a dataframe and subsets
9		#' the dataframe for problem rows as identified in the comparison object.
10		#' If \code{vars} has been specified only issue rows associated with those
11		#' variable(s) will be returned.
12		#' @param df dataframe to be subsetted
13		#' @param diff diffdf object
14		#' @param vars (optional) character vector containing names of issue variables to subset dataframe
15		#' on. A value of NULL (default) will be taken to mean available issue variables.
16		#' @examples
17		#' iris2 <- iris
18		#' for ( i in 1:3) iris2[i,i] <- 99
19		#' x <- diffdf( iris , iris2, suppress_warnings = TRUE)
20		#' diffdf_issuerows( iris , x)
21		#' diffdf_issuerows( iris2 , x)
22		#' diffdf_issuerows( iris2 , x , vars = "Sepal.Length")
23		#' diffdf_issuerows( iris2 , x , vars = c("Sepal.Length" , "Sepal.Width"))
24		#' @details
25		#' Note that diffdf_issuerows can be used to subset against any dataframe. The only
26		#' requirement is that the original variables specified in the keys argument to diffdf
27		#' are present on the dataframe you are subsetting against. However please note that if
28		#' no keys were specified in diffdf then the row number is used. This means using
29		#' diffdf_issuerows without a keys against an arbitrary dataset can easily result in
30		#' nonsense rows being returned. It is always recommended to supply keys to diffdf.
31		#' @export
32		diffdf_issuerows <- function( df , diff, vars = NULL){
33
34	19x	if ( class(diff)[[1]] != "diffdf") {
35	!	stop("diff should be an diffdf object")
36		}
37
38	19x	KEYS_ATT = attr(diff, "keys")
39
40	19x	if( is.null(KEYS_ATT) ) {
41	!	stop("diff is missing the keys attribute")
42		}
43
44	19x	issue_vars <- names(diff)[grep( "^VarDiff_", names(diff))]
45
46	19x	if ( is.null(vars)){
47	15x	vars <- issue_vars
48		} else {
49	4x	vars <- paste0("VarDiff_" , vars)
50		}
51
52	19x	if ( length(issue_vars) == 0 \| sum( vars %in% issue_vars) == 0){
53	7x	return(df[FALSE,])
54		}
55
56	12x	KEEP <- mapply(
57	12x	FUN = get_issue_dataset,
58	12x	issue = vars,
59	12x	diff = list(diff) ,
60	12x	SIMPLIFY = F
61		)
62
63	12x	KEEP <- recursive_reduce( KEEP , rbind)
64	12x	KEEP <- KEEP[!duplicated(KEEP),]
65
66	12x	if ( KEYS_ATT$is_derived ){
67	2x	df[[KEYS_ATT$value]] <- 1:nrow(df)
68		}
69
70	12x	keys <- KEYS_ATT$value
71
72	12x	if ( any( ! keys %in% names(df))){
73	1x	stop("df does not contain all variables specified as keys in diff")
74		}
75
76	11x	RET <- merge(
77	11x	x = df,
78	11x	y = KEEP,
79	11x	sort = TRUE
80		)
81
82	11x	RET <- RET[do.call("order", RET[keys]), ]
83
84	11x	if ( KEYS_ATT$is_derived ){
85	2x	keep_vars <- !names(RET) %in% KEYS_ATT$value
86	2x	RET <- RET[, keep_vars , drop = FALSE]
87		}
88
89	11x	return(RET)
90		}
91
92
93
94
95		#' get_issue_dataset
96		#'
97		#' Internal function used by diffdf_issuerows to extract the dataframe
98		#' from each a target issue. In particular it also strips off any
99		#' non-key variables
100		#' @param issue name of issue to extract the dataset from diff
101		#' @param diff diffdf object which contains issues
102		get_issue_dataset <- function(issue, diff){
103	20x	issue_df <- diff[[issue]]
104	20x	keep <- names(issue_df)[ !(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
105	20x	issue_df[,keep , drop=FALSE]
106		}

1
2
3		#' string_pad
4		#'
5		#' Utility function used to replicate str_pad. Adds white space to either end
6		#' of a string to get it to equal the desired length
7		#' @param x string
8		#' @param width desired length
9		string_pad <- function(x , width){
10	!	if ( nchar(x) >= width) return(x)
11	814x	width <- width - nchar(x)
12	814x	left <- paste0( rep ( " " , floor( width/2) ) , collapse= "")
13	814x	right <- paste0( rep ( " " , ceiling( width/2) ) , collapse= "")
14	814x	paste0( left , x ,right , collapse = "")
15		}
16
17
18		#' recursive_reduce
19		#'
20		#' Utility function used to replicated purrr::reduce. Recursively applies a
21		#' function to a list of elements until only 1 element remains
22		#' @param .l list of values to apply a function to
23		#' @param .f function to apply to each each element of the list in turn
24		#' i.e. .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]])
25		recursive_reduce <- function(.l , .f){
26	557x	if (length(.l) != 1){
27	377x	.l[[2]] <- .f( .l[[1]] , .l[[2]])
28	377x	return( recursive_reduce( .l[-1] , .f))
29		} else {
30	180x	return(.l[[1]])
31		}
32		}
33
34		#' invert
35		#'
36		#' Utility function used to replicated purrr::transpose. Turns a list inside
37		#' out.
38		#' @param x list
39		invert <- function(x){
40	56x	x2 <- list()
41	56x	cnames <- names(x)
42	56x	tnames <- names(x[[1]])
43	56x	for ( i in tnames ){
44	168x	x2[[i]] <- list()
45	168x	for (j in cnames){
46	537x	x2[[i]][[j]] <- x[[j]][[i]]
47		}
48		}
49	56x	return(x2)
50		}
51
52
53
54
55		#' as_ascii_table
56		#'
57		#' This function takes a data.frame and attempts to convert it into
58		#' a simple ascii format suitable for printing to the screen
59		#' It is assumed all variable values have a as.character() method
60		#' in order to cast them to character.
61		#' @param dat Input dataset to convert into a ascii table
62		#' @param line_prefix Symbols to prefix infront of every line of the table
63		as_ascii_table <- function(dat, line_prefix = " "){
64
65
66		## Convert every value to character and crop to a suitable length
67	56x	dat <- as_tibble(apply(dat, c(1, 2), as_cropped_char))
68
69	56x	hold <- list()
70	56x	COLS <- colnames(dat)
71
72		### For each column extract core elements (width, values , title) and pad out
73		### each string to be a suitable length
74	56x	for ( i in 1:ncol(dat)){
75	179x	COL <- COLS[i]
76	179x	VALUES <- dat[[i]]
77
78	179x	JOINT <- c(COL , VALUES)
79	179x	WIDTH <- max( sapply(JOINT, nchar)) + 2
80
81	179x	hold[[COL]] <- list()
82	179x	hold[[COL]]$WIDTH <- WIDTH
83	179x	hold[[COL]]$VALUES <- sapply( VALUES ,string_pad, width = WIDTH )
84	179x	hold[[COL]]$HEADER <- sapply( COL ,string_pad, width = WIDTH )
85		}
86
87		### Collapse into a single value per component ( title , values, width )
88	56x	thold <- invert(hold)
89	56x	tvals <- recursive_reduce( thold$VALUES , paste0 )
90	56x	thead <- recursive_reduce( thold$HEADER , paste0)
91	56x	twidth <- recursive_reduce( thold$WIDTH , sum)
92
93		### Create header and footer lines
94	56x	TLINE <- paste0(rep("=" , twidth), collapse = "")
95	56x	LINE <- paste0(rep("-" , twidth), collapse = "")
96	56x	FVALS <- paste0(line_prefix, tvals , collapse = "\n")
97
98		### Output table
99	56x	paste0(
100	56x	"\n",
101	56x	line_prefix, TLINE, "\n",
102	56x	line_prefix, thead, "\n",
103	56x	line_prefix, LINE, "\n",
104	56x	FVALS, "\n",
105	56x	line_prefix, LINE
106		)
107		}
108
109
110
111
112
113
114
115
116		#' as_cropped_char
117		#'
118		#' Makes any character string above x chars
119		#' Reduce down to a x char string with ...
120		#' @param inval a single element value
121		#' @param crop_at character limit
122		as_cropped_char <- function(inval, crop_at = 30 ){
123
124	635x	if ( is.null(inval) ){
125
126	!	inval <- "<NULL>"
127
128	635x	} else if ( is.na(inval)){
129
130	1x	inval <- "<NA>"
131
132		} else {
133
134	634x	inval <- as.character(inval)
135
136		}
137
138	635x	charlength <- sapply(inval, nchar)
139
140	635x	if (charlength > crop_at ){
141
142	10x	outval <- substr(inval, 1, crop_at )
143	10x	outval <- paste0(outval, '...')
144
145		} else {
146
147	625x	outval <- inval
148
149		}
150
151	635x	outval
152		}
153
154
155
156		#' get_table
157		#'
158		#' Generate nice looking table from a data frame
159		#' @param dsin dataset
160		#' @param row_limit Maximum number of rows displayed in dataset
161		get_table <- function(dsin , row_limit = 10){
162
163	56x	if( nrow(dsin) == 0 ) {
164	!	return("")
165		}
166
167	56x	display_table <- subset(dsin , 1:nrow(dsin) < (row_limit + 1) )
168
169	56x	if ( nrow(dsin) > row_limit ){
170
171	4x	add_message <- paste0(
172	4x	'First ',
173	4x	row_limit,
174	4x	" of " ,
175	4x	nrow(dsin),
176	4x	' rows are shown in table below'
177		)
178
179		} else {
180	52x	add_message <- 'All rows are shown in table below'
181		}
182
183	56x	msg <- paste(
184	56x	c(
185	56x	add_message,
186	56x	as_ascii_table(display_table),
187	56x	'\n'
188		),
189	56x	collapse = '\n'
190		)
191
192	56x	return(msg)
193		}
194
195

1
2		#' Print diffdf objects
3		#'
4		#' Print nicely formatted version of an diffdf object
5		#' @param x comparison object created by diffdf().
6		#' @param ... Additional arguments (not used)
7		#' @param as_string Return printed message as an R character vector?
8		#' @examples
9		#' x <- subset( iris , -Species )
10		#' x[1,2] <- 5
11		#' COMPARE <- diffdf( iris, x)
12		#' print( COMPARE )
13		#' print( COMPARE , "Sepal.Length" )
14		#' @export
15		print.diffdf <- function(x, ..., as_string = FALSE){
16	21x	COMPARE <- x
17
18	21x	if ( length(COMPARE) == 0 ){
19	2x	outtext <- "No issues were found!\n"
20
21		} else {
22
23	19x	start_text <- paste0(
24	19x	'Differences found between the objects!\n\n',
25	19x	'A summary is given below.\n\n'
26		)
27
28	19x	end_text <- lapply(COMPARE, function(x) get_print_message(x) )
29	19x	end_text <- paste0(unlist(end_text), collapse = "")
30
31	19x	outtext <- paste0(start_text, end_text)
32		}
33
34	21x	if ( as_string){
35	21x	return(strsplit(outtext, '\n')[[1]])
36		} else {
37	!	cat(outtext)
38	!	return(invisible(COMPARE))
39		}
40		}
41
42
43

1		#' construct_issue
2		#'
3		#' Make an s3 object with class issue and possible additional class,
4		#' and assign other arguments to attributes
5		#' @param value the value of the object
6		#' @param message the value of the message attribute
7		#' @param add_class additional class to add
8		construct_issue <- function(value, message, add_class = NULL){
9	2280x	x <- value
10
11		### If nothing has been provided return nothing !
12	2074x	if ( nrow(x) == 0 ) return(NULL)
13
14	206x	class(x) <- c(add_class, "issue", class(x))
15	206x	attributes(x)[["message"]] <- message
16	206x	return(x)
17		}
18
19
20		#' get_issue_message
21		#'
22		#' Simple function to grab the issue message
23		#' @param object inputted object of class issue
24		#' @param ... other arguments
25		get_issue_message <- function (object, ...) {
26	206x	return( attr(object,"message"))
27		}
28
29
30		#' get_print_message
31		#'
32		#' Get the required text depending on type of issue
33		#' @param object inputted object of class issue
34		#' @param ... other arguments
35		get_print_message <- function (object, ...) {
36	56x	UseMethod("get_print_message", object)
37		}
38
39
40		#' get_print_message.default
41		#'
42		#' Errors, as this should only ever be given an issue
43		#' @param object issue
44		get_print_message.default <- function(object) {
45	!	stop("Error: An issue has not been provided to this function!")
46		}
47
48
49		#' get_print_message.issue
50		#'
51		#' Get text from a basic issue, based on the class of the value of the issue
52		#'
53		#' @param object an object of class issue_basic
54		get_print_message.issue <- function(object){
55	56x	paste(
56	56x	c(attr(object, "message"), get_table(object) ),
57	56x	collapse = '\n'
58		)
59		}
60
61
62
63
64
65
66
67
68
69

1		#' is_variable_different
2		#'
3		#' This subsets the data set on the variable name, picks out differences and returns a tibble
4		#' of differences for the given variable
5		#' @importFrom tibble as_tibble
6		#' @param variablename name of variable being compared
7		#' @param keynames name of keys
8		#' @param datain Inputted dataset with base and compare vectors
9		#' @param ... Additional arguments which might be passed through (numerical accuracy)
10		#' @return A boolean vector which is T if target and current are different
11		is_variable_different <- function (variablename, keynames, datain, ...) {
12
13	991x	xvar <- paste0(variablename,'.x')
14	991x	yvar <- paste0(variablename,'.y')
15
16	991x	if ( ! xvar %in% names(datain) \| ! yvar %in% names(datain)){
17	!	stop("Variable does not exist within input dataset")
18		}
19
20	991x	target <- datain[[xvar]]
21	991x	current <- datain[[yvar]]
22	991x	outvect <- find_difference(target, current, ...)
23
24	991x	datain[["VARIABLE"]] <- variablename
25
26	991x	names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")
27
28	991x	x <- as_tibble(
29	991x	subset(
30	991x	datain,
31	991x	outvect,
32	991x	select = c("VARIABLE", keynames, "BASE", "COMPARE")
33		)
34		)
35
36	991x	return(x)
37		}
38
39		#' compare_vectors
40		#'
41		#' Compare two vectors looking for differences
42		#'
43		#' @param target the base vector
44		#' @param current a vector to compare target to
45		#' @param ... Additional arguments which might be passed through (numerical accuracy)
46		compare_vectors <- function (target, current, ...) {
47	1020x	UseMethod("compare_vectors")
48		}
49
50
51		#' find_difference
52		#'
53		#' This determines if two vectors are different. It expects vectors of the same
54		#' length and type, and is intended to be used after checks have already been done
55		#' Initially picks out any nas (matching nas count as a match)
56		#' Then compares remaining vector
57		#'
58		#' @param target the base vector
59		#' @param current a vector to compare target to
60		#' @param ... Additional arguments which might be passed through (numerical accuracy)
61		find_difference <- function (target, current, ...) {
62
63	1023x	if( length(target) != length(current)){
64	2x	warning("Inputs are not of the same length")
65	2x	return(NULL)
66		}
67
68	1021x	if( is.null(target) \| is.null(current) ){
69	1x	return( is.null(target) != is.null(current) )
70		}
71
72		### Initalise output, assume problem unless evidence otherwise
73	1020x	return_vector <- rep(TRUE, length(target))
74
75	1020x	nas_t <- is.na(target)
76	1020x	nas_c <- is.na(current)
77
78		## compare missing values
79	1020x	nacompare <- nas_t != nas_c
80	1020x	naselect <- nas_t\|nas_c
81	1020x	return_vector[naselect] <- nacompare[naselect]
82
83		## compare non-missing values
84	1020x	selectvector <- as.logical( (!nas_t) * (!nas_c) )
85
86	1020x	comparevect <- compare_vectors(
87	1020x	target[selectvector] ,
88	1020x	current[selectvector],
89		...
90		)
91
92	1020x	return_vector[selectvector] <- comparevect
93
94	1020x	return(return_vector)
95		}
96
97
98
99
100
101
102
103		#' compare_vectors.default
104		#'
105		#' Default method, if the vector is not numeric or factor. Basic comparison
106		#' @param target the base vector
107		#' @param current a vector to compare target to
108		#' @param ... Additional arguments which might be passed through (numerical accuracy)
109		compare_vectors.default <- function(target, current, ...){
110	445x	target != current
111		}
112
113
114
115
116		#' compare_vectors.factor
117		#'
118		#' Compares factors. Sets them as character and then compares
119		#' @param target the base vector
120		#' @param current a vector to compare target to
121		#' @param ... Additional arguments which might be passed through (numerical accuracy)
122		compare_vectors.factor <- function(target, current, ...){
123	90x	as.character(target) != as.character(current)
124		}
125
126
127
128
129
130		#' compare_vectors.numeric
131		#'
132		#' This is a modified version of the all.equal function
133		#' which returns a vector rather than a message
134		#' @param target the base vector
135		#' @param current a vector to compare target to
136		#' @param tolerance Level of tolerance for differences between two variables
137		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
138		compare_vectors.numeric <- function(
139		target,
140		current,
141		tolerance = sqrt(.Machine$double.eps),
142		scale = NULL
143		){
144
145	485x	out <- target == current
146
147	485x	if (all(out)) {
148	451x	return(!out)
149		}
150
151	34x	if (is.integer(target) \|\| is.integer(current)){
152	11x	target <- as.double(target)
153	11x	current <- as.double(current)
154		}
155
156	34x	xy <- abs(target - current)
157
158	34x	if (!is.null(scale)) {
159	2x	xy <- xy/scale
160		}
161
162	34x	return(xy > tolerance)
163
164		}
165
166
167
168

1
2
3		#' identify_extra_rows
4		#'
5		#' Identifies rows that are in a baseline dataset but not in a comparator dataset
6		#' @param DS1 Baseline dataset (data frame)
7		#' @param DS2 Comparator dataset (data frame)
8		#' @param KEYS List of variables that define a unique row within the datasets (strings)
9		identify_extra_rows <- function(DS1, DS2 , KEYS){
10	258x	DS2[["..FLAG.."]] <- "Y"
11	258x	dat <- merge(
12	258x	subset( DS1 , select = KEYS) ,
13	258x	subset( DS2 , select = c(KEYS, "..FLAG..")) ,
14	258x	by = KEYS, all.x = T,
15	258x	sort = TRUE
16		)
17	258x	dat <- dat[do.call("order", dat[KEYS]), ]
18
19	258x	dat[ is.na(dat[["..FLAG.."]]) , KEYS, drop=FALSE]
20		}
21
22
23
24		#' identify_extra_cols
25		#'
26		#' Identifies columns that are in a baseline dataset but not in a comparator dataset
27		#' @param DS1 Baseline dataset (data frame)
28		#' @param DS2 Comparator dataset (data frame)
29		#' @importFrom tibble tibble
30		identify_extra_cols <- function(DS1 , DS2){
31	258x	match.cols <- sapply ( names(DS1), "%in%", names(DS2))
32	258x	if ( !all(is.logical(match.cols)) ){
33	!	stop("Assumption of logical return type is not true")
34		}
35	258x	tibble(
36	258x	COLUMNS = names(DS1)[ !match.cols]
37		)
38		}
39
40
41
42
43
44
45
46		#' identify_matching_cols
47		#'
48		#' Identifies columns with the same name in two data frames
49		#' @param DS1 Input dataset 1 (data frame)
50		#' @param DS2 Input dataset 2 (data frame)
51		#' @param EXCLUDE Columns to ignore
52		identify_matching_cols <- function(DS1, DS2 , EXCLUDE = ""){
53	826x	match_cols <- sapply( names(DS1), "%in%" , names(DS2))
54	826x	exclude_cols <- sapply( names(DS1), "%in%" , EXCLUDE)
55	826x	names(DS1)[ match_cols & !exclude_cols ]
56		}
57
58
59
60
61
62		#' identify_unsupported_cols
63		#'
64		#' Identifies any columns for which the package is not setup to handle
65		#' @param dsin input dataset
66		identify_unsupported_cols <- function(dsin){
67	258x	dat <- subset(
68	258x	identify_properties(dsin) ,
69	258x	select = c("VARIABLE", "MODE")
70		)
71
72	258x	dat[ !dat[["MODE"]] %in% c('numeric', 'character', 'logical') ,, drop=FALSE]
73
74		}
75
76
77
78		#' identify_mode_differences
79		#'
80		#' Identifies any mode differences between two data frames
81		#' @param BASE Base dataset for comparison (data.frame)
82		#' @param COMP Comparator dataset to compare base against (data.frame)
83		identify_mode_differences <- function( BASE, COMP ){
84
85	429x	matching_cols <- identify_matching_cols( BASE , COMP )
86
87	429x	dat <- merge(
88	429x	x = identify_properties(BASE),
89	429x	y = identify_properties(COMP),
90	429x	by = "VARIABLE",
91	429x	all = TRUE,
92	429x	suffixes = c(".BASE", ".COMP"),
93	429x	sort = TRUE
94		)
95	429x	dat <- subset( dat, select = c("VARIABLE" , "MODE.BASE" , "MODE.COMP"))
96
97	429x	KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
98	429x	KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]]
99
100	429x	dat[ KEEP1 & KEEP2 ,, drop=FALSE]
101
102		}
103
104
105
106		#' identify_class_differences
107		#'
108		#' Identifies any class differences between two data frames
109		#' @param BASE Base dataset for comparison (data.frame)
110		#' @param COMP Comparator dataset to compare base against (data.frame)
111		identify_class_differences <- function( BASE, COMP ){
112
113	129x	matching_cols <- identify_matching_cols( BASE , COMP )
114
115	129x	dat <- merge(
116	129x	x = identify_properties(BASE),
117	129x	y = identify_properties(COMP),
118	129x	by = "VARIABLE",
119	129x	all = TRUE,
120	129x	sort = TRUE,
121	129x	suffixes = c(".BASE", ".COMP")
122		)
123
124	129x	dat <- subset( dat , select = c("VARIABLE" , "CLASS.BASE" , "CLASS.COMP"))
125
126	129x	KEEP1 <- dat[["VARIABLE"]] %in% matching_cols
127	129x	KEEP2 <- !mapply(
128	129x	identical,
129	129x	dat[["CLASS.BASE"]] ,
130	129x	dat[["CLASS.COMP"]]
131		)
132
133	129x	dat[ KEEP1 & KEEP2 ,, drop=FALSE]
134
135		}
136
137
138
139		#' identify_att_differences
140		#'
141		#' Identifies any attribute differences between two data frames
142		#' @param BASE Base dataset for comparison (data.frame)
143		#' @param COMP Comparator dataset to compare base against (data.frame)
144		#' @param exclude_cols Columns to exclude from comparison
145		#' @importFrom tibble tibble
146		identify_att_differences <- function( BASE, COMP , exclude_cols = "" ){
147
148	139x	matching_cols <- identify_matching_cols( BASE , COMP , exclude_cols )
149
150	139x	PROPS <- merge(
151	139x	x = identify_properties(BASE) ,
152	139x	y = identify_properties(COMP) ,
153	139x	by = "VARIABLE",
154	139x	all = TRUE,
155	139x	sort = TRUE,
156	139x	suffixes = c(".BASE", ".COMP")
157		)
158
159	139x	PROPS <- subset( PROPS , select = c("VARIABLE", "ATTRIBS.BASE" , "ATTRIBS.COMP"))
160
161	139x	PROPS <- PROPS[ PROPS[["VARIABLE"]] %in% matching_cols,, drop = FALSE]
162
163
164		### Setup dummy return value
165	139x	RETURN <- tibble(
166	139x	VARIABLE = character(),
167	139x	ATTR_NAME = character(),
168	139x	VALUES.BASE = list(),
169	139x	VALUES.COMP = list()
170		)
171
172	139x	for ( i in PROPS[["VARIABLE"]] ){
173
174	1237x	PROPS_filt <- PROPS[ PROPS[["VARIABLE"]] == i ,, drop=FALSE]
175
176		### Get a vector of all available attributes across both variables
177	1237x	ATTRIB_NAMES = unique(c(
178	1237x	names(PROPS_filt[["ATTRIBS.BASE"]][[1]]) ,
179	1237x	names(PROPS_filt[["ATTRIBS.COMP"]][[1]])
180		))
181
182		### If variable has no attributes move onto the next variable
183	904x	if ( is.null(ATTRIB_NAMES) ) next()
184
185		### Loop over each attribute checking if they are identical and outputing
186		### anyones that arn't
187	333x	for ( j in ATTRIB_NAMES){
188
189	574x	ATTRIB_BASE = PROPS_filt[["ATTRIBS.BASE"]][[1]][j]
190	574x	ATTRIB_COMP = PROPS_filt[["ATTRIBS.COMP"]][[1]][j]
191
192	574x	if ( !identical(ATTRIB_BASE , ATTRIB_COMP) ){
193
194	89x	ATT_DIFFS <- tibble(
195	89x	VARIABLE = i ,
196	89x	ATTR_NAME = j ,
197	89x	VALUES.BASE = ifelse( is.null(ATTRIB_BASE) , list() , ATTRIB_BASE),
198	89x	VALUES.COMP = ifelse( is.null(ATTRIB_COMP) , list() , ATTRIB_COMP)
199		)
200
201	89x	RETURN <- rbind(RETURN , ATT_DIFFS)
202		}
203		}
204		}
205	139x	return(RETURN)
206		}
207
208
209
210
211
212		#' identify_differences
213		#'
214		#' Compares each column within 2 datasets to identify any values which they
215		#' mismatch on.
216		#' @param BASE Base dataset for comparison (data.frame)
217		#' @param COMP Comparator dataset to compare base against (data.frame)
218		#' @param KEYS List of variables that define a unique row within the datasets (strings)
219		#' @param exclude_cols Columns to exclude from comparison
220		#' @param tolerance Level of tolerance for numeric differences between two variables
221		#' @param scale Scale that tolerance should be set on. If NULL assume absolute
222		identify_differences <- function( BASE , COMP , KEYS, exclude_cols,
223		tolerance = sqrt(.Machine$double.eps),
224		scale = NULL ) {
225
226	129x	matching_cols <- identify_matching_cols( BASE , COMP , c(KEYS, exclude_cols))
227
228	1x	if( length(matching_cols) == 0 ) return ( tibble() )
229
230	128x	DAT = merge(
231	128x	x = BASE ,
232	128x	y = COMP ,
233	128x	by = KEYS ,
234	128x	suffix = c(".x", ".y"),
235	128x	sort = TRUE
236		)
237	128x	DAT <- DAT[do.call("order", DAT[KEYS]), ]
238
239
240	128x	matching_list <- mapply(
241	128x	is_variable_different ,
242	128x	matching_cols,
243	128x	MoreArgs = list(
244	128x	keynames = KEYS,
245	128x	datain = DAT,
246	128x	tolerance = tolerance ,
247	128x	scale = scale
248		),
249	128x	SIMPLIFY = FALSE
250		)
251
252	128x	matching_list
253		}
254
255
256
257
258
259
260
261
262
263		#' identify_properties
264		#'
265		#' Returns a dataframe of metadata for a given dataset.
266		#' Returned values include variable names , class , mode , type & attributes
267		#' @param dsin input dataframe that you want to get the metadata from
268		#' @importFrom tibble tibble
269		identify_properties <- function(dsin){
270
271		### If missing or null return empty dataset
272	1652x	if( is.null(dsin) ) {
273	!	x <- tibble(
274	!	VARIABLE = character(),
275	!	CLASS = list(),
276	!	MODE = character(),
277	!	TYPE = character() ,
278	!	ATTRIBS = list()
279		)
280	!	return(x)
281		}
282
283	1652x	tibble(
284	1652x	VARIABLE = names(dsin),
285	1652x	CLASS = lapply(dsin, class),
286	1652x	MODE = sapply(dsin , mode),
287	1652x	TYPE = sapply(dsin , typeof) ,
288	1652x	ATTRIBS = lapply( dsin , attributes)
289		)
290		}
291
292
293
294
295
296

1		#' diffdf
2		#' @description
3		#' Compares 2 dataframes and outputs any differences.
4		#' @param base input dataframe
5		#' @param compare comparison dataframe
6		#' @param keys vector of variables (as strings) that defines a unique row in the base and compare dataframes
7		#' @param strict_numeric Flag for strict numeric to numeric comparisons (default = TRUE). If False diffdf will cast integer to double where required for comparisons. Note that variables specified in the keys will never be casted.
8		#' @param strict_factor Flag for strict factor to character comparisons (default = TRUE). If False diffdf will cast factors to characters where required for comparisons. Note that variables specified in the keys will never be casted.
9		#' @param suppress_warnings Do you want to suppress warnings? (logical)
10		#' @param file Location and name of a text file to output the results to. Setting to NULL will cause no file to be produced.
11		#' @param tolerance Set tolerance for numeric comparisons. Note that comparisons fail if (x-y)/scale > tolerance.
12		#' @param scale Set scale for numeric comparisons. Note that comparisons fail if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient version of scale = 1.
13		#' @examples
14		#' x <- subset( iris, -Species)
15		#' x[1,2] <- 5
16		#' COMPARE <- diffdf( iris, x)
17		#' print( COMPARE )
18		#' print( COMPARE , "Sepal.Length" )
19		#'
20		#' #### Sample data frames
21		#'
22		#' DF1 <- data.frame(
23		#' id = c(1,2,3,4,5,6),
24		#' v1 = letters[1:6],
25		#' v2 = c(NA , NA , 1 , 2 , 3 , NA)
26		#' )
27		#'
28		#' DF2 <- data.frame(
29		#' id = c(1,2,3,4,5,7),
30		#' v1 = letters[1:6],
31		#' v2 = c(NA , NA , 1 , 2 , NA , NA),
32		#' v3 = c(NA , NA , 1 , 2 , NA , 4)
33		#' )
34		#'
35		#' diffdf(DF1 , DF1 , keys = "id")
36		#'
37		#' # We can control matching with scale/location for example:
38		#'
39		#' DF1 <- data.frame(
40		#' id = c(1,2,3,4,5,6),
41		#' v1 = letters[1:6],
42		#' v2 = c(1,2,3,4,5,6)
43		#' )
44		#' DF2 <- data.frame(
45		#' id = c(1,2,3,4,5,6),
46		#' v1 = letters[1:6],
47		#' v2 = c(1.1,2,3,4,5,6)
48		#' )
49		#'
50		#' diffdf(DF1 , DF2 , keys = "id")
51		#' diffdf(DF1 , DF2 , keys = "id", tolerance = 0.2)
52		#' diffdf(DF1 , DF2 , keys = "id", scale = 10, tolerance = 0.2)
53		#'
54		#' # We can use strict_factor to compare factors with characters for example:
55		#'
56		#' DF1 <- data.frame(
57		#' id = c(1,2,3,4,5,6),
58		#' v1 = letters[1:6],
59		#' v2 = c(NA , NA , 1 , 2 , 3 , NA),
60		#' stringsAsFactors = FALSE
61		#' )
62		#'
63		#' DF2 <- data.frame(
64		#' id = c(1,2,3,4,5,6),
65		#' v1 = letters[1:6],
66		#' v2 = c(NA , NA , 1 , 2 , 3 , NA)
67		#' )
68		#'
69		#' diffdf(DF1 , DF2 , keys = "id", strict_factor = TRUE)
70		#' diffdf(DF1 , DF2 , keys = "id", strict_factor = FALSE)
71		#'
72		#' @export
73		diffdf <- function (
74		base ,
75		compare ,
76		keys = NULL,
77		suppress_warnings = FALSE,
78		strict_numeric = TRUE,
79		strict_factor = TRUE,
80		file = NULL,
81		tolerance = sqrt(.Machine$double.eps),
82		scale = NULL
83		){
84
85	134x	BASE = base
86	134x	COMP = compare
87	134x	KEYS = keys
88	134x	SUPWARN = suppress_warnings
89
90
91		### Initatiate output object
92	134x	COMPARE <- list()
93	134x	class(COMPARE) <- c("diffdf" , "list")
94
95	134x	is_derived <- FALSE
96
97		### If no key is suplied match values based upon row number
98	134x	if (is.null(KEYS)){
99	122x	is_derived <- TRUE
100	122x	keyname <- generate_keyname(BASE, COMP)
101	122x	BASE[[keyname]] <- 1:nrow(BASE)
102	122x	COMP[[keyname]] <- 1:nrow(COMP)
103	122x	KEYS <- keyname
104		}
105	134x	attr(COMPARE, 'keys') <- list(value = KEYS, is_derived = is_derived)
106
107
108
109	134x	if (!is.numeric(tolerance)) {
110	2x	stop("'tolerance' should be numeric")
111		}
112
113	132x	if (!is.numeric(scale) && !is.null(scale)) {
114	2x	stop("'scale' should be numeric or NULL")
115		}
116
117
118
119	130x	if ( !has_unique_rows(BASE , KEYS) ){
120	1x	stop( "BY variables in BASE do not result in unique observations")
121		}
122
123	129x	if ( !has_unique_rows(COMP , KEYS) ){
124	!	stop( "BY variables in COMPARE do not result in unique observations")
125		}
126
127
128
129		#### Check essential variable properties (class & mode)
130
131	129x	COMPARE[["UnsupportedColsBase"]] <- construct_issue(
132	129x	value = identify_unsupported_cols(BASE) ,
133	129x	message = "There are columns in BASE with unsupported modes !!"
134		)
135
136
137	129x	COMPARE[["UnsupportedColsComp"]] <- construct_issue(
138	129x	value = identify_unsupported_cols(COMP) ,
139	129x	message = "There are columns in COMPARE with unsupported modes !!"
140		)
141
142
143		# cast variables if strict is off
144	129x	if ( !strict_factor \| !strict_numeric ){
145
146	9x	casted_df <- cast_variables(
147	9x	BASE = BASE,
148	9x	COMPARE = COMP,
149	9x	ignore_vars = KEYS,
150	9x	cast_integers = !strict_numeric ,
151	9x	cast_factors = !strict_factor
152		)
153
154	9x	BASE <- casted_df$BASE
155	9x	COMP <- casted_df$COMP
156
157		}
158
159
160	129x	COMPARE[["VarModeDiffs"]] <- construct_issue(
161	129x	value = identify_mode_differences( BASE, COMP ) ,
162	129x	message = "There are columns in BASE and COMPARE with different modes !!"
163		)
164
165
166	129x	COMPARE[["VarClassDiffs"]] <- construct_issue(
167	129x	value = identify_class_differences(BASE, COMP) ,
168	129x	message = "There are columns in BASE and COMPARE with different classes !!"
169		)
170
171
172	129x	exclude_cols <- c(
173	129x	COMPARE[["UnsupportedColsBase"]]$VARIABLE ,
174	129x	COMPARE[["UnsupportedColsComp"]]$VARIABLE,
175	129x	COMPARE[["VarClassDiffs"]]$VARIABLE,
176	129x	COMPARE[["VarModeDiffs"]]$VARIABLE
177		)
178
179
180		##### Check Validity of Keys
181
182	129x	BASE_keys <- names(BASE)[names(BASE) %in% KEYS]
183	129x	COMP_keys <- names(COMP)[names(COMP) %in% KEYS]
184
185
186	129x	if ( length(BASE_keys) != length(KEYS) ){
187	!	stop( "BASE is missing variables specified in KEYS")
188		}
189
190	129x	if ( length(COMP_keys) != length(KEYS) ){
191	!	stop( "COMP is missing variables specified in KEYS")
192		}
193
194	129x	if( any(KEYS %in% exclude_cols)){
195	!	stop("KEYS are either an invalid or contain different modes between BASE and COMP")
196		}
197
198
199		##### Check Attributes
200
201
202	129x	COMPARE[["AttribDiffs"]] <- construct_issue(
203	129x	value = identify_att_differences(BASE, COMP , exclude_cols) ,
204	129x	message = "There are columns in BASE and COMPARE with differing attributes !!"
205		)
206
207
208		##### Check data
209
210	129x	BASE <- factor_to_character(BASE , KEYS)
211	129x	COMP <- factor_to_character(COMP , KEYS)
212
213
214	129x	COMPARE[["ExtRowsBase"]] <- construct_issue(
215	129x	value = identify_extra_rows( BASE, COMP, KEYS ) ,
216	129x	message = "There are rows in BASE that are not in COMPARE !!"
217		)
218
219
220	129x	COMPARE[["ExtRowsComp"]] <- construct_issue(
221	129x	value = identify_extra_rows( COMP, BASE, KEYS ) ,
222	129x	message = "There are rows in COMPARE that are not in BASE !!"
223		)
224
225
226
227	129x	COMPARE[["ExtColsBase"]] <- construct_issue(
228	129x	value = identify_extra_cols(BASE, COMP) ,
229	129x	message = "There are columns in BASE that are not in COMPARE !!"
230		)
231
232
233	129x	COMPARE[["ExtColsComp"]] <- construct_issue(
234	129x	value = identify_extra_cols(COMP, BASE) ,
235	129x	message = "There are columns in COMPARE that are not in BASE !!"
236		)
237
238
239	129x	VALUE_DIFFERENCES <- identify_differences(
240	129x	BASE, COMP , KEYS, exclude_cols,
241	129x	tolerance = tolerance,
242	129x	scale = scale
243		)
244
245
246
247		## Summarise the number of mismatching rows per variable
248
249	129x	if ( length(VALUE_DIFFERENCES) ){
250	128x	NDIFF <- sapply( VALUE_DIFFERENCES , nrow )
251	128x	COMPARE[["NumDiff"]] <- construct_issue(
252	128x	value = convert_to_issue(NDIFF),
253	128x	message = "Not all Values Compared Equal"
254		)
255		}
256
257
258	129x	for ( i in names(VALUE_DIFFERENCES) ){
259	991x	COMPARE[[ paste0( "VarDiff_", i)]] <- construct_issue(
260	991x	value = VALUE_DIFFERENCES[[i]] ,
261	991x	message = ""
262		)
263		}
264
265		## Get all issue messages, remove blank message, and collapse into single string
266	129x	ISSUE_MSGS <- sapply(COMPARE, function(x) get_issue_message(x))
267	129x	ISSUE_MSGS <- ISSUE_MSGS[ ISSUE_MSGS != ""]
268
269	129x	if( length(ISSUE_MSGS) != 0 ){
270	95x	if(!SUPWARN) {
271	36x	ISSUE_MSGS <- paste(ISSUE_MSGS, collapse ='\n' )
272	36x	warning( c("\n" , ISSUE_MSGS))
273		}
274		}
275
276
277	129x	if (!is.null(file)){
278	!	x <- print(COMPARE , as_string = TRUE)
279
280	!	tryCatch(
281		{
282	!	sink(file)
283	!	cat(x, sep = "\n")
284	!	sink()
285		},
286	!	warning = function(w){
287	!	sink()
288	!	warning(w)
289		},
290	!	error = function(e){
291	!	sink()
292	!	stop(e)
293		}
294		)
295	!	return(invisible(COMPARE))
296
297		}
298
299	129x	return(COMPARE)
300		}
301
302
303
304
305		#' diffdf_has_issues
306		#'
307		#' Utility function which returns TRUE if an diffdf
308		#' object has issues or FALSE if an diffdf object does not have issues
309		#' @param x diffdf object
310		#' @examples
311		#'
312		#' # Example with no issues
313		#' x <- diffdf( iris, iris )
314		#' diffdf_has_issues(x)
315		#'
316		#' # Example with issues
317		#' iris2 <- iris
318		#' iris2[2,2] <- NA
319		#' x <- diffdf( iris , iris2 , suppress_warnings = TRUE)
320		#' diffdf_has_issues(x)
321		#' @export
322		diffdf_has_issues <- function(x){
323	!	if ( class(x)[[1]] != "diffdf" ) stop( "x is not an diffdf object")
324	21x	return( length(x) != 0 )
325		}
326

1		#'generate_keyname
2		#'
3		#'Function to generate a name for the keys if not provided
4		#'
5		#'@param BASE base dataset
6		#'@param COMP comparison dataset
7		#'@param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity
8		#'
9		generate_keyname <- function(
10		BASE,
11		COMP,
12		replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
13		){
14
15	213x	if ( class(replace_names) != "character"){
16	!	stop( "replace_names is not a character vector")
17		}
18
19	213x	if (length(replace_names) == 0) {
20	8x	stop("All default row names are in use in BASE/COMPARE. Please provide a KEY argument")
21		}
22
23	205x	key_name <- replace_names[1]
24
25	205x	if (!is.null(BASE[[key_name]]) \| !is.null( COMP[[key_name]])){
26	66x	key_name <- generate_keyname(BASE, COMP, replace_names[-1])
27		}
28
29	173x	return(key_name)
30
31		}

1
2		#' sort_then_join
3		#'
4		#' Convenience function to sort two strings and paste them together
5		#' @param string1 first string
6		#' @param string2 second string
7		sort_then_join <- function(string1, string2){
8	14x	paste0(sort(c(string1, string2)), collapse = '')
9		}
10
11
12		#' class_merge
13		#'
14		#' Convenience function to put all classes an object has into one string
15		#' @param x an object
16		class_merge <- function(x){
17	304x	paste(class(x), collapse = "_")
18		}
19
20
21		get_message <- function( colname , whichdat, totype){
22	12x	message(paste0(
23	12x	"NOTE: Variable " , colname, " in " , tolower(whichdat) , " was casted to " , totype
24		))
25		}
26
27
28		#' get_casted_vector
29		#'
30		#' casts a vector depending on its type and input
31		#' @param colin column to cast
32		#' @param colname name of vector
33		#' @param whichdat whether base or compare is being casted (used for messages)
34		get_casted_vector <- function(colin, colname, whichdat){
35
36	24x	if ( class(colin) == "factor"){
37	6x	get_message( colname , whichdat , "character")
38	6x	return(as.character(colin))
39		}
40
41	18x	if ( class(colin) == "integer"){
42	6x	get_message( colname , whichdat , "numeric")
43	6x	return(as.numeric(colin))
44		}
45
46	12x	colin
47		}
48
49
50
51		#' get_casted_dataset
52		#'
53		#' Internal utility function to loop across a dataset casting all target
54		#' variables
55		#' @param df dataset to be casted
56		#' @param columns columns to be casted
57		#' @param whichdat whether base or compare is being casted (used for messages)
58		get_casted_dataset <- function(df , columns , whichdat){
59	22x	for ( col in columns ){
60	24x	df[[col]] <- get_casted_vector( df[[col]] , col , whichdat )
61		}
62	22x	return(df)
63		}
64
65
66
67
68
69
70		#' cast_variables
71		#'
72		#' Function to cast datasets columns if they have differing types
73		#' Restricted to specific cases, currently integer and double, and character and factor
74		#' @param BASE base dataset
75		#' @param COMPARE comparison dataset
76		#' @param ignore_vars Variables not to be considered for casting
77		#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles
78		#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters
79		cast_variables <- function(
80		BASE,
81		COMPARE,
82		ignore_vars = NULL,
83		cast_integers = FALSE ,
84		cast_factors = FALSE
85		){
86
87	13x	allowed_class_casts <- c( "integernumeric" , "characterfactor")[c(cast_integers, cast_factors)]
88
89	13x	BASE_class <- data.frame(
90	13x	class_BASE = sapply(BASE, class_merge),
91	13x	colname = names(BASE),
92	13x	stringsAsFactors = FALSE
93		)
94	13x	BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars,, drop=FALSE]
95
96
97	13x	COMPARE_class <- data.frame(
98	13x	class_COMPARE = sapply(COMPARE, class_merge),
99	13x	colname = names(COMPARE),
100	13x	stringsAsFactors = FALSE
101		)
102	13x	COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars ,, drop=FALSE]
103
104	13x	common_class <- merge(
105	13x	x = BASE_class,
106	13x	y = COMPARE_class,
107	13x	by = "colname"
108		)
109
110
111	13x	diff_class <- common_class[ common_class[["class_BASE"]] != common_class[["class_COMPARE"]] ,,drop=FALSE]
112
113
114	13x	diff_class$classmerge <- mapply(
115	13x	sort_then_join,
116	13x	diff_class$class_COMPARE,
117	13x	diff_class$class_BASE
118		)
119
120
121	13x	cast_columns <- diff_class[ diff_class[["classmerge"]] %in% allowed_class_casts ,,drop=FALSE]
122
123
124	13x	DATASETS <- list(
125	13x	"BASE" = BASE,
126	13x	"COMPARE" = COMPARE
127		)
128
129
130	13x	if( nrow(cast_columns) == 0 ){
131	2x	return( DATASETS )
132		}
133
134
135	11x	for ( i in names(DATASETS)){
136	22x	DATASETS[[i]] <- get_casted_dataset( DATASETS[[i]] , cast_columns$colname , i)
137		}
138
139
140	11x	return(DATASETS)
141		}
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173

1
2
3
4		#' factor_to_character
5		#'
6		#' Takes a dataframe and converts any factor variables to character
7		#' @param dsin input dataframe
8		#' @param vars variables to consider for conversion. Default NULL will consider
9		#' every variable within the dataset
10		factor_to_character <- function( dsin , vars = NULL){
11
12	!	if ( is.null(vars) ) vars = names(dsin)
13
14	258x	for (var in vars){
15	272x	if( is.factor(dsin[[var]])){
16	!	dsin[[var]] <- as.character(dsin[[var]])
17		}
18		}
19	258x	return(dsin)
20		}
21
22
23
24
25		#' has_unique_rows
26		#'
27		#' Check if a data sets rows are unique
28		#' @param DAT input data set (data frame)
29		#' @param KEYS Set of keys which should be unique
30		has_unique_rows <- function(DAT , KEYS){
31	259x	DUPS <- duplicated( subset(DAT , select= KEYS) )
32	259x	NDUPS <- sum( DUPS)
33	259x	return( NDUPS == 0 )
34		}
35
36		#'convert_to_issue
37		#'
38		#'converts the count value into the correct issue format
39		#'@param datin data inputted
40		#'@importFrom tibble rownames_to_column
41		convert_to_issue <- function(datin){
42	128x	datin_tibble <- tibble(
43	128x	`Variable` = names(datin),
44	128x	`No of Differences` = datin
45		)
46
47	128x	datin_tibble_reduced <- datin_tibble[ datin_tibble[["No of Differences"]] > 0, , drop = FALSE]
48	128x	return(datin_tibble_reduced)
49		}
50