| 1 | ||
| 2 | ||
| 3 |
#' identify_extra_rows |
|
| 4 |
#' |
|
| 5 |
#' Identifies rows that are in a baseline dataset but not in a comparator dataset |
|
| 6 |
#' @param DS1 Baseline dataset (data frame) |
|
| 7 |
#' @param DS2 Comparator dataset (data frame) |
|
| 8 |
#' @param KEYS List of variables that define a unique row within the datasets (strings) |
|
| 9 |
identify_extra_rows <- function(DS1, DS2 , KEYS){
|
|
| 10 | 258x |
DS2[["..FLAG.."]] <- "Y" |
| 11 | 258x |
dat <- merge( |
| 12 | 258x |
subset( DS1 , select = KEYS) , |
| 13 | 258x |
subset( DS2 , select = c(KEYS, "..FLAG..")) , |
| 14 | 258x |
by = KEYS, all.x = T, |
| 15 | 258x |
sort = TRUE |
| 16 |
) |
|
| 17 | 258x |
dat <- dat[do.call("order", dat[KEYS]), ]
|
| 18 |
|
|
| 19 | 258x |
dat[ is.na(dat[["..FLAG.."]]) , KEYS, drop=FALSE] |
| 20 |
} |
|
| 21 | ||
| 22 | ||
| 23 | ||
| 24 |
#' identify_extra_cols |
|
| 25 |
#' |
|
| 26 |
#' Identifies columns that are in a baseline dataset but not in a comparator dataset |
|
| 27 |
#' @param DS1 Baseline dataset (data frame) |
|
| 28 |
#' @param DS2 Comparator dataset (data frame) |
|
| 29 |
#' @importFrom tibble tibble |
|
| 30 |
identify_extra_cols <- function(DS1 , DS2){
|
|
| 31 | 258x |
match.cols <- sapply ( names(DS1), "%in%", names(DS2)) |
| 32 | 258x |
if ( !all(is.logical(match.cols)) ){
|
| 33 | ! |
stop("Assumption of logical return type is not true")
|
| 34 |
} |
|
| 35 | 258x |
tibble( |
| 36 | 258x |
COLUMNS = names(DS1)[ !match.cols] |
| 37 |
) |
|
| 38 |
} |
|
| 39 | ||
| 40 | ||
| 41 | ||
| 42 | ||
| 43 | ||
| 44 | ||
| 45 | ||
| 46 |
#' identify_matching_cols |
|
| 47 |
#' |
|
| 48 |
#' Identifies columns with the same name in two data frames |
|
| 49 |
#' @param DS1 Input dataset 1 (data frame) |
|
| 50 |
#' @param DS2 Input dataset 2 (data frame) |
|
| 51 |
#' @param EXCLUDE Columns to ignore |
|
| 52 |
identify_matching_cols <- function(DS1, DS2 , EXCLUDE = ""){
|
|
| 53 | 826x |
match_cols <- sapply( names(DS1), "%in%" , names(DS2)) |
| 54 | 826x |
exclude_cols <- sapply( names(DS1), "%in%" , EXCLUDE) |
| 55 | 826x |
names(DS1)[ match_cols & !exclude_cols ] |
| 56 |
} |
|
| 57 | ||
| 58 | ||
| 59 | ||
| 60 | ||
| 61 | ||
| 62 |
#' identify_unsupported_cols |
|
| 63 |
#' |
|
| 64 |
#' Identifies any columns for which the package is not setup to handle |
|
| 65 |
#' @param dsin input dataset |
|
| 66 |
identify_unsupported_cols <- function(dsin){
|
|
| 67 | 258x |
dat <- subset( |
| 68 | 258x |
identify_properties(dsin) , |
| 69 | 258x |
select = c("VARIABLE", "MODE")
|
| 70 |
) |
|
| 71 |
|
|
| 72 | 258x |
dat[ !dat[["MODE"]] %in% c('numeric', 'character', 'logical') ,, drop=FALSE]
|
| 73 | ||
| 74 |
} |
|
| 75 | ||
| 76 | ||
| 77 | ||
| 78 |
#' identify_mode_differences |
|
| 79 |
#' |
|
| 80 |
#' Identifies any mode differences between two data frames |
|
| 81 |
#' @param BASE Base dataset for comparison (data.frame) |
|
| 82 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
| 83 |
identify_mode_differences <- function( BASE, COMP ){
|
|
| 84 | ||
| 85 | 429x |
matching_cols <- identify_matching_cols( BASE , COMP ) |
| 86 |
|
|
| 87 | 429x |
dat <- merge( |
| 88 | 429x |
x = identify_properties(BASE), |
| 89 | 429x |
y = identify_properties(COMP), |
| 90 | 429x |
by = "VARIABLE", |
| 91 | 429x |
all = TRUE, |
| 92 | 429x |
suffixes = c(".BASE", ".COMP"),
|
| 93 | 429x |
sort = TRUE |
| 94 |
) |
|
| 95 | 429x |
dat <- subset( dat, select = c("VARIABLE" , "MODE.BASE" , "MODE.COMP"))
|
| 96 |
|
|
| 97 | 429x |
KEEP1 <- dat[["VARIABLE"]] %in% matching_cols |
| 98 | 429x |
KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]] |
| 99 |
|
|
| 100 | 429x |
dat[ KEEP1 & KEEP2 ,, drop=FALSE] |
| 101 | ||
| 102 |
} |
|
| 103 | ||
| 104 | ||
| 105 | ||
| 106 |
#' identify_class_differences |
|
| 107 |
#' |
|
| 108 |
#' Identifies any class differences between two data frames |
|
| 109 |
#' @param BASE Base dataset for comparison (data.frame) |
|
| 110 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
| 111 |
identify_class_differences <- function( BASE, COMP ){
|
|
| 112 |
|
|
| 113 | 129x |
matching_cols <- identify_matching_cols( BASE , COMP ) |
| 114 |
|
|
| 115 | 129x |
dat <- merge( |
| 116 | 129x |
x = identify_properties(BASE), |
| 117 | 129x |
y = identify_properties(COMP), |
| 118 | 129x |
by = "VARIABLE", |
| 119 | 129x |
all = TRUE, |
| 120 | 129x |
sort = TRUE, |
| 121 | 129x |
suffixes = c(".BASE", ".COMP")
|
| 122 |
) |
|
| 123 |
|
|
| 124 | 129x |
dat <- subset( dat , select = c("VARIABLE" , "CLASS.BASE" , "CLASS.COMP"))
|
| 125 |
|
|
| 126 | 129x |
KEEP1 <- dat[["VARIABLE"]] %in% matching_cols |
| 127 | 129x |
KEEP2 <- !mapply( |
| 128 | 129x |
identical, |
| 129 | 129x |
dat[["CLASS.BASE"]] , |
| 130 | 129x |
dat[["CLASS.COMP"]] |
| 131 |
) |
|
| 132 |
|
|
| 133 | 129x |
dat[ KEEP1 & KEEP2 ,, drop=FALSE] |
| 134 | ||
| 135 |
} |
|
| 136 | ||
| 137 | ||
| 138 | ||
| 139 |
#' identify_att_differences |
|
| 140 |
#' |
|
| 141 |
#' Identifies any attribute differences between two data frames |
|
| 142 |
#' @param BASE Base dataset for comparison (data.frame) |
|
| 143 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
| 144 |
#' @param exclude_cols Columns to exclude from comparison |
|
| 145 |
#' @importFrom tibble tibble |
|
| 146 |
identify_att_differences <- function( BASE, COMP , exclude_cols = "" ){
|
|
| 147 |
|
|
| 148 | 139x |
matching_cols <- identify_matching_cols( BASE , COMP , exclude_cols ) |
| 149 |
|
|
| 150 | 139x |
PROPS <- merge( |
| 151 | 139x |
x = identify_properties(BASE) , |
| 152 | 139x |
y = identify_properties(COMP) , |
| 153 | 139x |
by = "VARIABLE", |
| 154 | 139x |
all = TRUE, |
| 155 | 139x |
sort = TRUE, |
| 156 | 139x |
suffixes = c(".BASE", ".COMP")
|
| 157 |
) |
|
| 158 |
|
|
| 159 | 139x |
PROPS <- subset( PROPS , select = c("VARIABLE", "ATTRIBS.BASE" , "ATTRIBS.COMP"))
|
| 160 |
|
|
| 161 | 139x |
PROPS <- PROPS[ PROPS[["VARIABLE"]] %in% matching_cols,, drop = FALSE] |
| 162 |
|
|
| 163 |
|
|
| 164 |
### Setup dummy return value |
|
| 165 | 139x |
RETURN <- tibble( |
| 166 | 139x |
VARIABLE = character(), |
| 167 | 139x |
ATTR_NAME = character(), |
| 168 | 139x |
VALUES.BASE = list(), |
| 169 | 139x |
VALUES.COMP = list() |
| 170 |
) |
|
| 171 |
|
|
| 172 | 139x |
for ( i in PROPS[["VARIABLE"]] ){
|
| 173 |
|
|
| 174 | 1237x |
PROPS_filt <- PROPS[ PROPS[["VARIABLE"]] == i ,, drop=FALSE] |
| 175 | ||
| 176 |
### Get a vector of all available attributes across both variables |
|
| 177 | 1237x |
ATTRIB_NAMES = unique(c( |
| 178 | 1237x |
names(PROPS_filt[["ATTRIBS.BASE"]][[1]]) , |
| 179 | 1237x |
names(PROPS_filt[["ATTRIBS.COMP"]][[1]]) |
| 180 |
)) |
|
| 181 |
|
|
| 182 |
### If variable has no attributes move onto the next variable |
|
| 183 | 904x |
if ( is.null(ATTRIB_NAMES) ) next() |
| 184 |
|
|
| 185 |
### Loop over each attribute checking if they are identical and outputing |
|
| 186 |
### anyones that arn't |
|
| 187 | 333x |
for ( j in ATTRIB_NAMES){
|
| 188 |
|
|
| 189 | 574x |
ATTRIB_BASE = PROPS_filt[["ATTRIBS.BASE"]][[1]][j] |
| 190 | 574x |
ATTRIB_COMP = PROPS_filt[["ATTRIBS.COMP"]][[1]][j] |
| 191 |
|
|
| 192 | 574x |
if ( !identical(ATTRIB_BASE , ATTRIB_COMP) ){
|
| 193 |
|
|
| 194 | 89x |
ATT_DIFFS <- tibble( |
| 195 | 89x |
VARIABLE = i , |
| 196 | 89x |
ATTR_NAME = j , |
| 197 | 89x |
VALUES.BASE = ifelse( is.null(ATTRIB_BASE) , list() , ATTRIB_BASE), |
| 198 | 89x |
VALUES.COMP = ifelse( is.null(ATTRIB_COMP) , list() , ATTRIB_COMP) |
| 199 |
) |
|
| 200 |
|
|
| 201 | 89x |
RETURN <- rbind(RETURN , ATT_DIFFS) |
| 202 |
} |
|
| 203 |
} |
|
| 204 |
} |
|
| 205 | 139x |
return(RETURN) |
| 206 |
} |
|
| 207 | ||
| 208 | ||
| 209 | ||
| 210 | ||
| 211 | ||
| 212 |
#' identify_differences |
|
| 213 |
#' |
|
| 214 |
#' Compares each column within 2 datasets to identify any values which they |
|
| 215 |
#' mismatch on. |
|
| 216 |
#' @param BASE Base dataset for comparison (data.frame) |
|
| 217 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
| 218 |
#' @param KEYS List of variables that define a unique row within the datasets (strings) |
|
| 219 |
#' @param exclude_cols Columns to exclude from comparison |
|
| 220 |
#' @param tolerance Level of tolerance for numeric differences between two variables |
|
| 221 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
| 222 |
identify_differences <- function( BASE , COMP , KEYS, exclude_cols, |
|
| 223 |
tolerance = sqrt(.Machine$double.eps), |
|
| 224 |
scale = NULL ) {
|
|
| 225 |
|
|
| 226 | 129x |
matching_cols <- identify_matching_cols( BASE , COMP , c(KEYS, exclude_cols)) |
| 227 |
|
|
| 228 | 1x |
if( length(matching_cols) == 0 ) return ( tibble() ) |
| 229 |
|
|
| 230 | 128x |
DAT = merge( |
| 231 | 128x |
x = BASE , |
| 232 | 128x |
y = COMP , |
| 233 | 128x |
by = KEYS , |
| 234 | 128x |
suffix = c(".x", ".y"),
|
| 235 | 128x |
sort = TRUE |
| 236 |
) |
|
| 237 | 128x |
DAT <- DAT[do.call("order", DAT[KEYS]), ]
|
| 238 |
|
|
| 239 | ||
| 240 | 128x |
matching_list <- mapply( |
| 241 | 128x |
is_variable_different , |
| 242 | 128x |
matching_cols, |
| 243 | 128x |
MoreArgs = list( |
| 244 | 128x |
keynames = KEYS, |
| 245 | 128x |
datain = DAT, |
| 246 | 128x |
tolerance = tolerance , |
| 247 | 128x |
scale = scale |
| 248 |
), |
|
| 249 | 128x |
SIMPLIFY = FALSE |
| 250 |
) |
|
| 251 |
|
|
| 252 | 128x |
matching_list |
| 253 |
} |
|
| 254 | ||
| 255 | ||
| 256 | ||
| 257 | ||
| 258 | ||
| 259 | ||
| 260 | ||
| 261 | ||
| 262 | ||
| 263 |
#' identify_properties |
|
| 264 |
#' |
|
| 265 |
#' Returns a dataframe of metadata for a given dataset. |
|
| 266 |
#' Returned values include variable names , class , mode , type & attributes |
|
| 267 |
#' @param dsin input dataframe that you want to get the metadata from |
|
| 268 |
#' @importFrom tibble tibble |
|
| 269 |
identify_properties <- function(dsin){
|
|
| 270 |
|
|
| 271 |
### If missing or null return empty dataset |
|
| 272 | 1652x |
if( is.null(dsin) ) {
|
| 273 | ! |
x <- tibble( |
| 274 | ! |
VARIABLE = character(), |
| 275 | ! |
CLASS = list(), |
| 276 | ! |
MODE = character(), |
| 277 | ! |
TYPE = character() , |
| 278 | ! |
ATTRIBS = list() |
| 279 |
) |
|
| 280 | ! |
return(x) |
| 281 |
} |
|
| 282 |
|
|
| 283 | 1652x |
tibble( |
| 284 | 1652x |
VARIABLE = names(dsin), |
| 285 | 1652x |
CLASS = lapply(dsin, class), |
| 286 | 1652x |
MODE = sapply(dsin , mode), |
| 287 | 1652x |
TYPE = sapply(dsin , typeof) , |
| 288 | 1652x |
ATTRIBS = lapply( dsin , attributes) |
| 289 |
) |
|
| 290 |
} |
|
| 291 | ||
| 292 | ||
| 293 | ||
| 294 | ||
| 295 | ||
| 296 |
| 1 |
#' diffdf |
|
| 2 |
#' @description |
|
| 3 |
#' Compares 2 dataframes and outputs any differences. |
|
| 4 |
#' @param base input dataframe |
|
| 5 |
#' @param compare comparison dataframe |
|
| 6 |
#' @param keys vector of variables (as strings) that defines a unique row in the base and compare dataframes |
|
| 7 |
#' @param strict_numeric Flag for strict numeric to numeric comparisons (default = TRUE). If False diffdf will cast integer to double where required for comparisons. Note that variables specified in the keys will never be casted. |
|
| 8 |
#' @param strict_factor Flag for strict factor to character comparisons (default = TRUE). If False diffdf will cast factors to characters where required for comparisons. Note that variables specified in the keys will never be casted. |
|
| 9 |
#' @param suppress_warnings Do you want to suppress warnings? (logical) |
|
| 10 |
#' @param file Location and name of a text file to output the results to. Setting to NULL will cause no file to be produced. |
|
| 11 |
#' @param tolerance Set tolerance for numeric comparisons. Note that comparisons fail if (x-y)/scale > tolerance. |
|
| 12 |
#' @param scale Set scale for numeric comparisons. Note that comparisons fail if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient version of scale = 1. |
|
| 13 |
#' @examples |
|
| 14 |
#' x <- subset( iris, -Species) |
|
| 15 |
#' x[1,2] <- 5 |
|
| 16 |
#' COMPARE <- diffdf( iris, x) |
|
| 17 |
#' print( COMPARE ) |
|
| 18 |
#' print( COMPARE , "Sepal.Length" ) |
|
| 19 |
#' |
|
| 20 |
#' #### Sample data frames |
|
| 21 |
#' |
|
| 22 |
#' DF1 <- data.frame( |
|
| 23 |
#' id = c(1,2,3,4,5,6), |
|
| 24 |
#' v1 = letters[1:6], |
|
| 25 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA) |
|
| 26 |
#' ) |
|
| 27 |
#' |
|
| 28 |
#' DF2 <- data.frame( |
|
| 29 |
#' id = c(1,2,3,4,5,7), |
|
| 30 |
#' v1 = letters[1:6], |
|
| 31 |
#' v2 = c(NA , NA , 1 , 2 , NA , NA), |
|
| 32 |
#' v3 = c(NA , NA , 1 , 2 , NA , 4) |
|
| 33 |
#' ) |
|
| 34 |
#' |
|
| 35 |
#' diffdf(DF1 , DF1 , keys = "id") |
|
| 36 |
#' |
|
| 37 |
#' # We can control matching with scale/location for example: |
|
| 38 |
#' |
|
| 39 |
#' DF1 <- data.frame( |
|
| 40 |
#' id = c(1,2,3,4,5,6), |
|
| 41 |
#' v1 = letters[1:6], |
|
| 42 |
#' v2 = c(1,2,3,4,5,6) |
|
| 43 |
#' ) |
|
| 44 |
#' DF2 <- data.frame( |
|
| 45 |
#' id = c(1,2,3,4,5,6), |
|
| 46 |
#' v1 = letters[1:6], |
|
| 47 |
#' v2 = c(1.1,2,3,4,5,6) |
|
| 48 |
#' ) |
|
| 49 |
#' |
|
| 50 |
#' diffdf(DF1 , DF2 , keys = "id") |
|
| 51 |
#' diffdf(DF1 , DF2 , keys = "id", tolerance = 0.2) |
|
| 52 |
#' diffdf(DF1 , DF2 , keys = "id", scale = 10, tolerance = 0.2) |
|
| 53 |
#' |
|
| 54 |
#' # We can use strict_factor to compare factors with characters for example: |
|
| 55 |
#' |
|
| 56 |
#' DF1 <- data.frame( |
|
| 57 |
#' id = c(1,2,3,4,5,6), |
|
| 58 |
#' v1 = letters[1:6], |
|
| 59 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA), |
|
| 60 |
#' stringsAsFactors = FALSE |
|
| 61 |
#' ) |
|
| 62 |
#' |
|
| 63 |
#' DF2 <- data.frame( |
|
| 64 |
#' id = c(1,2,3,4,5,6), |
|
| 65 |
#' v1 = letters[1:6], |
|
| 66 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA) |
|
| 67 |
#' ) |
|
| 68 |
#' |
|
| 69 |
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = TRUE) |
|
| 70 |
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = FALSE) |
|
| 71 |
#' |
|
| 72 |
#' @export |
|
| 73 |
diffdf <- function ( |
|
| 74 |
base , |
|
| 75 |
compare , |
|
| 76 |
keys = NULL, |
|
| 77 |
suppress_warnings = FALSE, |
|
| 78 |
strict_numeric = TRUE, |
|
| 79 |
strict_factor = TRUE, |
|
| 80 |
file = NULL, |
|
| 81 |
tolerance = sqrt(.Machine$double.eps), |
|
| 82 |
scale = NULL |
|
| 83 |
){
|
|
| 84 |
|
|
| 85 | 134x |
BASE = base |
| 86 | 134x |
COMP = compare |
| 87 | 134x |
KEYS = keys |
| 88 | 134x |
SUPWARN = suppress_warnings |
| 89 |
|
|
| 90 |
|
|
| 91 |
### Initatiate output object |
|
| 92 | 134x |
COMPARE <- list() |
| 93 | 134x |
class(COMPARE) <- c("diffdf" , "list")
|
| 94 |
|
|
| 95 | 134x |
is_derived <- FALSE |
| 96 |
|
|
| 97 |
### If no key is suplied match values based upon row number |
|
| 98 | 134x |
if (is.null(KEYS)){
|
| 99 | 122x |
is_derived <- TRUE |
| 100 | 122x |
keyname <- generate_keyname(BASE, COMP) |
| 101 | 122x |
BASE[[keyname]] <- 1:nrow(BASE) |
| 102 | 122x |
COMP[[keyname]] <- 1:nrow(COMP) |
| 103 | 122x |
KEYS <- keyname |
| 104 |
} |
|
| 105 | 134x |
attr(COMPARE, 'keys') <- list(value = KEYS, is_derived = is_derived) |
| 106 |
|
|
| 107 |
|
|
| 108 |
|
|
| 109 | 134x |
if (!is.numeric(tolerance)) {
|
| 110 | 2x |
stop("'tolerance' should be numeric")
|
| 111 |
} |
|
| 112 |
|
|
| 113 | 132x |
if (!is.numeric(scale) && !is.null(scale)) {
|
| 114 | 2x |
stop("'scale' should be numeric or NULL")
|
| 115 |
} |
|
| 116 |
|
|
| 117 | ||
| 118 |
|
|
| 119 | 130x |
if ( !has_unique_rows(BASE , KEYS) ){
|
| 120 | 1x |
stop( "BY variables in BASE do not result in unique observations") |
| 121 |
} |
|
| 122 |
|
|
| 123 | 129x |
if ( !has_unique_rows(COMP , KEYS) ){
|
| 124 | ! |
stop( "BY variables in COMPARE do not result in unique observations") |
| 125 |
} |
|
| 126 |
|
|
| 127 |
|
|
| 128 | ||
| 129 |
#### Check essential variable properties (class & mode) |
|
| 130 |
|
|
| 131 | 129x |
COMPARE[["UnsupportedColsBase"]] <- construct_issue( |
| 132 | 129x |
value = identify_unsupported_cols(BASE) , |
| 133 | 129x |
message = "There are columns in BASE with unsupported modes !!" |
| 134 |
) |
|
| 135 |
|
|
| 136 |
|
|
| 137 | 129x |
COMPARE[["UnsupportedColsComp"]] <- construct_issue( |
| 138 | 129x |
value = identify_unsupported_cols(COMP) , |
| 139 | 129x |
message = "There are columns in COMPARE with unsupported modes !!" |
| 140 |
) |
|
| 141 | ||
| 142 |
|
|
| 143 |
# cast variables if strict is off |
|
| 144 | 129x |
if ( !strict_factor | !strict_numeric ){
|
| 145 |
|
|
| 146 | 9x |
casted_df <- cast_variables( |
| 147 | 9x |
BASE = BASE, |
| 148 | 9x |
COMPARE = COMP, |
| 149 | 9x |
ignore_vars = KEYS, |
| 150 | 9x |
cast_integers = !strict_numeric , |
| 151 | 9x |
cast_factors = !strict_factor |
| 152 |
) |
|
| 153 |
|
|
| 154 | 9x |
BASE <- casted_df$BASE |
| 155 | 9x |
COMP <- casted_df$COMP |
| 156 |
|
|
| 157 |
} |
|
| 158 |
|
|
| 159 |
|
|
| 160 | 129x |
COMPARE[["VarModeDiffs"]] <- construct_issue( |
| 161 | 129x |
value = identify_mode_differences( BASE, COMP ) , |
| 162 | 129x |
message = "There are columns in BASE and COMPARE with different modes !!" |
| 163 |
) |
|
| 164 |
|
|
| 165 |
|
|
| 166 | 129x |
COMPARE[["VarClassDiffs"]] <- construct_issue( |
| 167 | 129x |
value = identify_class_differences(BASE, COMP) , |
| 168 | 129x |
message = "There are columns in BASE and COMPARE with different classes !!" |
| 169 |
) |
|
| 170 |
|
|
| 171 | ||
| 172 | 129x |
exclude_cols <- c( |
| 173 | 129x |
COMPARE[["UnsupportedColsBase"]]$VARIABLE , |
| 174 | 129x |
COMPARE[["UnsupportedColsComp"]]$VARIABLE, |
| 175 | 129x |
COMPARE[["VarClassDiffs"]]$VARIABLE, |
| 176 | 129x |
COMPARE[["VarModeDiffs"]]$VARIABLE |
| 177 |
) |
|
| 178 |
|
|
| 179 | ||
| 180 |
##### Check Validity of Keys |
|
| 181 |
|
|
| 182 | 129x |
BASE_keys <- names(BASE)[names(BASE) %in% KEYS] |
| 183 | 129x |
COMP_keys <- names(COMP)[names(COMP) %in% KEYS] |
| 184 |
|
|
| 185 |
|
|
| 186 | 129x |
if ( length(BASE_keys) != length(KEYS) ){
|
| 187 | ! |
stop( "BASE is missing variables specified in KEYS") |
| 188 |
} |
|
| 189 |
|
|
| 190 | 129x |
if ( length(COMP_keys) != length(KEYS) ){
|
| 191 | ! |
stop( "COMP is missing variables specified in KEYS") |
| 192 |
} |
|
| 193 |
|
|
| 194 | 129x |
if( any(KEYS %in% exclude_cols)){
|
| 195 | ! |
stop("KEYS are either an invalid or contain different modes between BASE and COMP")
|
| 196 |
} |
|
| 197 |
|
|
| 198 | ||
| 199 |
##### Check Attributes |
|
| 200 | ||
| 201 |
|
|
| 202 | 129x |
COMPARE[["AttribDiffs"]] <- construct_issue( |
| 203 | 129x |
value = identify_att_differences(BASE, COMP , exclude_cols) , |
| 204 | 129x |
message = "There are columns in BASE and COMPARE with differing attributes !!" |
| 205 |
) |
|
| 206 |
|
|
| 207 |
|
|
| 208 |
##### Check data |
|
| 209 |
|
|
| 210 | 129x |
BASE <- factor_to_character(BASE , KEYS) |
| 211 | 129x |
COMP <- factor_to_character(COMP , KEYS) |
| 212 |
|
|
| 213 |
|
|
| 214 | 129x |
COMPARE[["ExtRowsBase"]] <- construct_issue( |
| 215 | 129x |
value = identify_extra_rows( BASE, COMP, KEYS ) , |
| 216 | 129x |
message = "There are rows in BASE that are not in COMPARE !!" |
| 217 |
) |
|
| 218 |
|
|
| 219 |
|
|
| 220 | 129x |
COMPARE[["ExtRowsComp"]] <- construct_issue( |
| 221 | 129x |
value = identify_extra_rows( COMP, BASE, KEYS ) , |
| 222 | 129x |
message = "There are rows in COMPARE that are not in BASE !!" |
| 223 |
) |
|
| 224 |
|
|
| 225 |
|
|
| 226 |
|
|
| 227 | 129x |
COMPARE[["ExtColsBase"]] <- construct_issue( |
| 228 | 129x |
value = identify_extra_cols(BASE, COMP) , |
| 229 | 129x |
message = "There are columns in BASE that are not in COMPARE !!" |
| 230 |
) |
|
| 231 |
|
|
| 232 |
|
|
| 233 | 129x |
COMPARE[["ExtColsComp"]] <- construct_issue( |
| 234 | 129x |
value = identify_extra_cols(COMP, BASE) , |
| 235 | 129x |
message = "There are columns in COMPARE that are not in BASE !!" |
| 236 |
) |
|
| 237 |
|
|
| 238 |
|
|
| 239 | 129x |
VALUE_DIFFERENCES <- identify_differences( |
| 240 | 129x |
BASE, COMP , KEYS, exclude_cols, |
| 241 | 129x |
tolerance = tolerance, |
| 242 | 129x |
scale = scale |
| 243 |
) |
|
| 244 | ||
| 245 |
|
|
| 246 |
|
|
| 247 |
## Summarise the number of mismatching rows per variable |
|
| 248 | ||
| 249 | 129x |
if ( length(VALUE_DIFFERENCES) ){
|
| 250 | 128x |
NDIFF <- sapply( VALUE_DIFFERENCES , nrow ) |
| 251 | 128x |
COMPARE[["NumDiff"]] <- construct_issue( |
| 252 | 128x |
value = convert_to_issue(NDIFF), |
| 253 | 128x |
message = "Not all Values Compared Equal" |
| 254 |
) |
|
| 255 |
} |
|
| 256 | ||
| 257 | ||
| 258 | 129x |
for ( i in names(VALUE_DIFFERENCES) ){
|
| 259 | 991x |
COMPARE[[ paste0( "VarDiff_", i)]] <- construct_issue( |
| 260 | 991x |
value = VALUE_DIFFERENCES[[i]] , |
| 261 | 991x |
message = "" |
| 262 |
) |
|
| 263 |
} |
|
| 264 |
|
|
| 265 |
## Get all issue messages, remove blank message, and collapse into single string |
|
| 266 | 129x |
ISSUE_MSGS <- sapply(COMPARE, function(x) get_issue_message(x)) |
| 267 | 129x |
ISSUE_MSGS <- ISSUE_MSGS[ ISSUE_MSGS != ""] |
| 268 |
|
|
| 269 | 129x |
if( length(ISSUE_MSGS) != 0 ){
|
| 270 | 95x |
if(!SUPWARN) {
|
| 271 | 36x |
ISSUE_MSGS <- paste(ISSUE_MSGS, collapse ='\n' ) |
| 272 | 36x |
warning( c("\n" , ISSUE_MSGS))
|
| 273 |
} |
|
| 274 |
} |
|
| 275 |
|
|
| 276 |
|
|
| 277 | 129x |
if (!is.null(file)){
|
| 278 | ! |
x <- print(COMPARE , as_string = TRUE) |
| 279 |
|
|
| 280 | ! |
tryCatch( |
| 281 |
{
|
|
| 282 | ! |
sink(file) |
| 283 | ! |
cat(x, sep = "\n") |
| 284 | ! |
sink() |
| 285 |
}, |
|
| 286 | ! |
warning = function(w){
|
| 287 | ! |
sink() |
| 288 | ! |
warning(w) |
| 289 |
}, |
|
| 290 | ! |
error = function(e){
|
| 291 | ! |
sink() |
| 292 | ! |
stop(e) |
| 293 |
} |
|
| 294 |
) |
|
| 295 | ! |
return(invisible(COMPARE)) |
| 296 |
|
|
| 297 |
} |
|
| 298 |
|
|
| 299 | 129x |
return(COMPARE) |
| 300 |
} |
|
| 301 | ||
| 302 | ||
| 303 | ||
| 304 | ||
| 305 |
#' diffdf_has_issues |
|
| 306 |
#' |
|
| 307 |
#' Utility function which returns TRUE if an diffdf |
|
| 308 |
#' object has issues or FALSE if an diffdf object does not have issues |
|
| 309 |
#' @param x diffdf object |
|
| 310 |
#' @examples |
|
| 311 |
#' |
|
| 312 |
#' # Example with no issues |
|
| 313 |
#' x <- diffdf( iris, iris ) |
|
| 314 |
#' diffdf_has_issues(x) |
|
| 315 |
#' |
|
| 316 |
#' # Example with issues |
|
| 317 |
#' iris2 <- iris |
|
| 318 |
#' iris2[2,2] <- NA |
|
| 319 |
#' x <- diffdf( iris , iris2 , suppress_warnings = TRUE) |
|
| 320 |
#' diffdf_has_issues(x) |
|
| 321 |
#' @export |
|
| 322 |
diffdf_has_issues <- function(x){
|
|
| 323 | ! |
if ( class(x)[[1]] != "diffdf" ) stop( "x is not an diffdf object") |
| 324 | 21x |
return( length(x) != 0 ) |
| 325 |
} |
|
| 326 |
| 1 | ||
| 2 | ||
| 3 | ||
| 4 | ||
| 5 | ||
| 6 |
#' diffdf_issuerows |
|
| 7 |
#' |
|
| 8 |
#' This function takes a diffdf object and a dataframe and subsets |
|
| 9 |
#' the dataframe for problem rows as identified in the comparison object. |
|
| 10 |
#' If \code{vars} has been specified only issue rows associated with those
|
|
| 11 |
#' variable(s) will be returned. |
|
| 12 |
#' @param df dataframe to be subsetted |
|
| 13 |
#' @param diff diffdf object |
|
| 14 |
#' @param vars (optional) character vector containing names of issue variables to subset dataframe |
|
| 15 |
#' on. A value of NULL (default) will be taken to mean available issue variables. |
|
| 16 |
#' @examples |
|
| 17 |
#' iris2 <- iris |
|
| 18 |
#' for ( i in 1:3) iris2[i,i] <- 99 |
|
| 19 |
#' x <- diffdf( iris , iris2, suppress_warnings = TRUE) |
|
| 20 |
#' diffdf_issuerows( iris , x) |
|
| 21 |
#' diffdf_issuerows( iris2 , x) |
|
| 22 |
#' diffdf_issuerows( iris2 , x , vars = "Sepal.Length") |
|
| 23 |
#' diffdf_issuerows( iris2 , x , vars = c("Sepal.Length" , "Sepal.Width"))
|
|
| 24 |
#' @details |
|
| 25 |
#' Note that diffdf_issuerows can be used to subset against any dataframe. The only |
|
| 26 |
#' requirement is that the original variables specified in the keys argument to diffdf |
|
| 27 |
#' are present on the dataframe you are subsetting against. However please note that if |
|
| 28 |
#' no keys were specified in diffdf then the row number is used. This means using |
|
| 29 |
#' diffdf_issuerows without a keys against an arbitrary dataset can easily result in |
|
| 30 |
#' nonsense rows being returned. It is always recommended to supply keys to diffdf. |
|
| 31 |
#' @export |
|
| 32 |
diffdf_issuerows <- function( df , diff, vars = NULL){
|
|
| 33 | ||
| 34 | 19x |
if ( class(diff)[[1]] != "diffdf") {
|
| 35 | ! |
stop("diff should be an diffdf object")
|
| 36 |
} |
|
| 37 |
|
|
| 38 | 19x |
KEYS_ATT = attr(diff, "keys") |
| 39 |
|
|
| 40 | 19x |
if( is.null(KEYS_ATT) ) {
|
| 41 | ! |
stop("diff is missing the keys attribute")
|
| 42 |
} |
|
| 43 |
|
|
| 44 | 19x |
issue_vars <- names(diff)[grep( "^VarDiff_", names(diff))] |
| 45 |
|
|
| 46 | 19x |
if ( is.null(vars)){
|
| 47 | 15x |
vars <- issue_vars |
| 48 |
} else {
|
|
| 49 | 4x |
vars <- paste0("VarDiff_" , vars)
|
| 50 |
} |
|
| 51 |
|
|
| 52 | 19x |
if ( length(issue_vars) == 0 | sum( vars %in% issue_vars) == 0){
|
| 53 | 7x |
return(df[FALSE,]) |
| 54 |
} |
|
| 55 |
|
|
| 56 | 12x |
KEEP <- mapply( |
| 57 | 12x |
FUN = get_issue_dataset, |
| 58 | 12x |
issue = vars, |
| 59 | 12x |
diff = list(diff) , |
| 60 | 12x |
SIMPLIFY = F |
| 61 |
) |
|
| 62 |
|
|
| 63 | 12x |
KEEP <- recursive_reduce( KEEP , rbind) |
| 64 | 12x |
KEEP <- KEEP[!duplicated(KEEP),] |
| 65 |
|
|
| 66 | 12x |
if ( KEYS_ATT$is_derived ){
|
| 67 | 2x |
df[[KEYS_ATT$value]] <- 1:nrow(df) |
| 68 |
} |
|
| 69 | ||
| 70 | 12x |
keys <- KEYS_ATT$value |
| 71 |
|
|
| 72 | 12x |
if ( any( ! keys %in% names(df))){
|
| 73 | 1x |
stop("df does not contain all variables specified as keys in diff")
|
| 74 |
} |
|
| 75 |
|
|
| 76 | 11x |
RET <- merge( |
| 77 | 11x |
x = df, |
| 78 | 11x |
y = KEEP, |
| 79 | 11x |
sort = TRUE |
| 80 |
) |
|
| 81 |
|
|
| 82 | 11x |
RET <- RET[do.call("order", RET[keys]), ]
|
| 83 |
|
|
| 84 | 11x |
if ( KEYS_ATT$is_derived ){
|
| 85 | 2x |
keep_vars <- !names(RET) %in% KEYS_ATT$value |
| 86 | 2x |
RET <- RET[, keep_vars , drop = FALSE] |
| 87 |
} |
|
| 88 |
|
|
| 89 | 11x |
return(RET) |
| 90 |
} |
|
| 91 | ||
| 92 | ||
| 93 | ||
| 94 | ||
| 95 |
#' get_issue_dataset |
|
| 96 |
#' |
|
| 97 |
#' Internal function used by diffdf_issuerows to extract the dataframe |
|
| 98 |
#' from each a target issue. In particular it also strips off any |
|
| 99 |
#' non-key variables |
|
| 100 |
#' @param issue name of issue to extract the dataset from diff |
|
| 101 |
#' @param diff diffdf object which contains issues |
|
| 102 |
get_issue_dataset <- function(issue, diff){
|
|
| 103 | 20x |
issue_df <- diff[[issue]] |
| 104 | 20x |
keep <- names(issue_df)[ !(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
|
| 105 | 20x |
issue_df[,keep , drop=FALSE] |
| 106 |
} |
| 1 | ||
| 2 | ||
| 3 |
#' string_pad |
|
| 4 |
#' |
|
| 5 |
#' Utility function used to replicate str_pad. Adds white space to either end |
|
| 6 |
#' of a string to get it to equal the desired length |
|
| 7 |
#' @param x string |
|
| 8 |
#' @param width desired length |
|
| 9 |
string_pad <- function(x , width){
|
|
| 10 | ! |
if ( nchar(x) >= width) return(x) |
| 11 | 814x |
width <- width - nchar(x) |
| 12 | 814x |
left <- paste0( rep ( " " , floor( width/2) ) , collapse= "") |
| 13 | 814x |
right <- paste0( rep ( " " , ceiling( width/2) ) , collapse= "") |
| 14 | 814x |
paste0( left , x ,right , collapse = "") |
| 15 |
} |
|
| 16 | ||
| 17 | ||
| 18 |
#' recursive_reduce |
|
| 19 |
#' |
|
| 20 |
#' Utility function used to replicated purrr::reduce. Recursively applies a |
|
| 21 |
#' function to a list of elements until only 1 element remains |
|
| 22 |
#' @param .l list of values to apply a function to |
|
| 23 |
#' @param .f function to apply to each each element of the list in turn |
|
| 24 |
#' i.e. .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]]) |
|
| 25 |
recursive_reduce <- function(.l , .f){
|
|
| 26 | 557x |
if (length(.l) != 1){
|
| 27 | 377x |
.l[[2]] <- .f( .l[[1]] , .l[[2]]) |
| 28 | 377x |
return( recursive_reduce( .l[-1] , .f)) |
| 29 |
} else {
|
|
| 30 | 180x |
return(.l[[1]]) |
| 31 |
} |
|
| 32 |
} |
|
| 33 | ||
| 34 |
#' invert |
|
| 35 |
#' |
|
| 36 |
#' Utility function used to replicated purrr::transpose. Turns a list inside |
|
| 37 |
#' out. |
|
| 38 |
#' @param x list |
|
| 39 |
invert <- function(x){
|
|
| 40 | 56x |
x2 <- list() |
| 41 | 56x |
cnames <- names(x) |
| 42 | 56x |
tnames <- names(x[[1]]) |
| 43 | 56x |
for ( i in tnames ){
|
| 44 | 168x |
x2[[i]] <- list() |
| 45 | 168x |
for (j in cnames){
|
| 46 | 537x |
x2[[i]][[j]] <- x[[j]][[i]] |
| 47 |
} |
|
| 48 |
} |
|
| 49 | 56x |
return(x2) |
| 50 |
} |
|
| 51 | ||
| 52 | ||
| 53 | ||
| 54 | ||
| 55 |
#' as_ascii_table |
|
| 56 |
#' |
|
| 57 |
#' This function takes a data.frame and attempts to convert it into |
|
| 58 |
#' a simple ascii format suitable for printing to the screen |
|
| 59 |
#' It is assumed all variable values have a as.character() method |
|
| 60 |
#' in order to cast them to character. |
|
| 61 |
#' @param dat Input dataset to convert into a ascii table |
|
| 62 |
#' @param line_prefix Symbols to prefix infront of every line of the table |
|
| 63 |
as_ascii_table <- function(dat, line_prefix = " "){
|
|
| 64 |
|
|
| 65 |
|
|
| 66 |
## Convert every value to character and crop to a suitable length |
|
| 67 | 56x |
dat <- as_tibble(apply(dat, c(1, 2), as_cropped_char)) |
| 68 |
|
|
| 69 | 56x |
hold <- list() |
| 70 | 56x |
COLS <- colnames(dat) |
| 71 |
|
|
| 72 |
### For each column extract core elements (width, values , title) and pad out |
|
| 73 |
### each string to be a suitable length |
|
| 74 | 56x |
for ( i in 1:ncol(dat)){
|
| 75 | 179x |
COL <- COLS[i] |
| 76 | 179x |
VALUES <- dat[[i]] |
| 77 |
|
|
| 78 | 179x |
JOINT <- c(COL , VALUES) |
| 79 | 179x |
WIDTH <- max( sapply(JOINT, nchar)) + 2 |
| 80 |
|
|
| 81 | 179x |
hold[[COL]] <- list() |
| 82 | 179x |
hold[[COL]]$WIDTH <- WIDTH |
| 83 | 179x |
hold[[COL]]$VALUES <- sapply( VALUES ,string_pad, width = WIDTH ) |
| 84 | 179x |
hold[[COL]]$HEADER <- sapply( COL ,string_pad, width = WIDTH ) |
| 85 |
} |
|
| 86 |
|
|
| 87 |
### Collapse into a single value per component ( title , values, width ) |
|
| 88 | 56x |
thold <- invert(hold) |
| 89 | 56x |
tvals <- recursive_reduce( thold$VALUES , paste0 ) |
| 90 | 56x |
thead <- recursive_reduce( thold$HEADER , paste0) |
| 91 | 56x |
twidth <- recursive_reduce( thold$WIDTH , sum) |
| 92 |
|
|
| 93 |
### Create header and footer lines |
|
| 94 | 56x |
TLINE <- paste0(rep("=" , twidth), collapse = "")
|
| 95 | 56x |
LINE <- paste0(rep("-" , twidth), collapse = "")
|
| 96 | 56x |
FVALS <- paste0(line_prefix, tvals , collapse = "\n") |
| 97 |
|
|
| 98 |
### Output table |
|
| 99 | 56x |
paste0( |
| 100 | 56x |
"\n", |
| 101 | 56x |
line_prefix, TLINE, "\n", |
| 102 | 56x |
line_prefix, thead, "\n", |
| 103 | 56x |
line_prefix, LINE, "\n", |
| 104 | 56x |
FVALS, "\n", |
| 105 | 56x |
line_prefix, LINE |
| 106 |
) |
|
| 107 |
} |
|
| 108 | ||
| 109 | ||
| 110 | ||
| 111 | ||
| 112 | ||
| 113 | ||
| 114 | ||
| 115 | ||
| 116 |
#' as_cropped_char |
|
| 117 |
#' |
|
| 118 |
#' Makes any character string above x chars |
|
| 119 |
#' Reduce down to a x char string with ... |
|
| 120 |
#' @param inval a single element value |
|
| 121 |
#' @param crop_at character limit |
|
| 122 |
as_cropped_char <- function(inval, crop_at = 30 ){
|
|
| 123 |
|
|
| 124 | 635x |
if ( is.null(inval) ){
|
| 125 |
|
|
| 126 | ! |
inval <- "<NULL>" |
| 127 |
|
|
| 128 | 635x |
} else if ( is.na(inval)){
|
| 129 |
|
|
| 130 | 1x |
inval <- "<NA>" |
| 131 |
|
|
| 132 |
} else {
|
|
| 133 |
|
|
| 134 | 634x |
inval <- as.character(inval) |
| 135 |
|
|
| 136 |
} |
|
| 137 |
|
|
| 138 | 635x |
charlength <- sapply(inval, nchar) |
| 139 |
|
|
| 140 | 635x |
if (charlength > crop_at ){
|
| 141 |
|
|
| 142 | 10x |
outval <- substr(inval, 1, crop_at ) |
| 143 | 10x |
outval <- paste0(outval, '...') |
| 144 |
|
|
| 145 |
} else {
|
|
| 146 |
|
|
| 147 | 625x |
outval <- inval |
| 148 |
|
|
| 149 |
} |
|
| 150 |
|
|
| 151 | 635x |
outval |
| 152 |
} |
|
| 153 | ||
| 154 | ||
| 155 | ||
| 156 |
#' get_table |
|
| 157 |
#' |
|
| 158 |
#' Generate nice looking table from a data frame |
|
| 159 |
#' @param dsin dataset |
|
| 160 |
#' @param row_limit Maximum number of rows displayed in dataset |
|
| 161 |
get_table <- function(dsin , row_limit = 10){
|
|
| 162 |
|
|
| 163 | 56x |
if( nrow(dsin) == 0 ) {
|
| 164 | ! |
return("")
|
| 165 |
} |
|
| 166 |
|
|
| 167 | 56x |
display_table <- subset(dsin , 1:nrow(dsin) < (row_limit + 1) ) |
| 168 |
|
|
| 169 | 56x |
if ( nrow(dsin) > row_limit ){
|
| 170 |
|
|
| 171 | 4x |
add_message <- paste0( |
| 172 | 4x |
'First ', |
| 173 | 4x |
row_limit, |
| 174 | 4x |
" of " , |
| 175 | 4x |
nrow(dsin), |
| 176 | 4x |
' rows are shown in table below' |
| 177 |
) |
|
| 178 |
|
|
| 179 |
} else {
|
|
| 180 | 52x |
add_message <- 'All rows are shown in table below' |
| 181 |
} |
|
| 182 |
|
|
| 183 | 56x |
msg <- paste( |
| 184 | 56x |
c( |
| 185 | 56x |
add_message, |
| 186 | 56x |
as_ascii_table(display_table), |
| 187 | 56x |
'\n' |
| 188 |
), |
|
| 189 | 56x |
collapse = '\n' |
| 190 |
) |
|
| 191 |
|
|
| 192 | 56x |
return(msg) |
| 193 |
} |
|
| 194 | ||
| 195 |
| 1 | ||
| 2 |
#' Print diffdf objects |
|
| 3 |
#' |
|
| 4 |
#' Print nicely formatted version of an diffdf object |
|
| 5 |
#' @param x comparison object created by diffdf(). |
|
| 6 |
#' @param ... Additional arguments (not used) |
|
| 7 |
#' @param as_string Return printed message as an R character vector? |
|
| 8 |
#' @examples |
|
| 9 |
#' x <- subset( iris , -Species ) |
|
| 10 |
#' x[1,2] <- 5 |
|
| 11 |
#' COMPARE <- diffdf( iris, x) |
|
| 12 |
#' print( COMPARE ) |
|
| 13 |
#' print( COMPARE , "Sepal.Length" ) |
|
| 14 |
#' @export |
|
| 15 |
print.diffdf <- function(x, ..., as_string = FALSE){
|
|
| 16 | 21x |
COMPARE <- x |
| 17 | ||
| 18 | 21x |
if ( length(COMPARE) == 0 ){
|
| 19 | 2x |
outtext <- "No issues were found!\n" |
| 20 |
|
|
| 21 |
} else {
|
|
| 22 |
|
|
| 23 | 19x |
start_text <- paste0( |
| 24 | 19x |
'Differences found between the objects!\n\n', |
| 25 | 19x |
'A summary is given below.\n\n' |
| 26 |
) |
|
| 27 | ||
| 28 | 19x |
end_text <- lapply(COMPARE, function(x) get_print_message(x) ) |
| 29 | 19x |
end_text <- paste0(unlist(end_text), collapse = "") |
| 30 | ||
| 31 | 19x |
outtext <- paste0(start_text, end_text) |
| 32 |
} |
|
| 33 |
|
|
| 34 | 21x |
if ( as_string){
|
| 35 | 21x |
return(strsplit(outtext, '\n')[[1]]) |
| 36 |
} else {
|
|
| 37 | ! |
cat(outtext) |
| 38 | ! |
return(invisible(COMPARE)) |
| 39 |
} |
|
| 40 |
} |
|
| 41 | ||
| 42 | ||
| 43 |
| 1 |
#' construct_issue |
|
| 2 |
#' |
|
| 3 |
#' Make an s3 object with class issue and possible additional class, |
|
| 4 |
#' and assign other arguments to attributes |
|
| 5 |
#' @param value the value of the object |
|
| 6 |
#' @param message the value of the message attribute |
|
| 7 |
#' @param add_class additional class to add |
|
| 8 |
construct_issue <- function(value, message, add_class = NULL){
|
|
| 9 | 2280x |
x <- value |
| 10 |
|
|
| 11 |
### If nothing has been provided return nothing ! |
|
| 12 | 2074x |
if ( nrow(x) == 0 ) return(NULL) |
| 13 |
|
|
| 14 | 206x |
class(x) <- c(add_class, "issue", class(x)) |
| 15 | 206x |
attributes(x)[["message"]] <- message |
| 16 | 206x |
return(x) |
| 17 |
} |
|
| 18 | ||
| 19 | ||
| 20 |
#' get_issue_message |
|
| 21 |
#' |
|
| 22 |
#' Simple function to grab the issue message |
|
| 23 |
#' @param object inputted object of class issue |
|
| 24 |
#' @param ... other arguments |
|
| 25 |
get_issue_message <- function (object, ...) {
|
|
| 26 | 206x |
return( attr(object,"message")) |
| 27 |
} |
|
| 28 | ||
| 29 | ||
| 30 |
#' get_print_message |
|
| 31 |
#' |
|
| 32 |
#' Get the required text depending on type of issue |
|
| 33 |
#' @param object inputted object of class issue |
|
| 34 |
#' @param ... other arguments |
|
| 35 |
get_print_message <- function (object, ...) {
|
|
| 36 | 56x |
UseMethod("get_print_message", object)
|
| 37 |
} |
|
| 38 | ||
| 39 | ||
| 40 |
#' get_print_message.default |
|
| 41 |
#' |
|
| 42 |
#' Errors, as this should only ever be given an issue |
|
| 43 |
#' @param object issue |
|
| 44 |
get_print_message.default <- function(object) {
|
|
| 45 | ! |
stop("Error: An issue has not been provided to this function!")
|
| 46 |
} |
|
| 47 | ||
| 48 | ||
| 49 |
#' get_print_message.issue |
|
| 50 |
#' |
|
| 51 |
#' Get text from a basic issue, based on the class of the value of the issue |
|
| 52 |
#' |
|
| 53 |
#' @param object an object of class issue_basic |
|
| 54 |
get_print_message.issue <- function(object){
|
|
| 55 | 56x |
paste( |
| 56 | 56x |
c(attr(object, "message"), get_table(object) ), |
| 57 | 56x |
collapse = '\n' |
| 58 |
) |
|
| 59 |
} |
|
| 60 | ||
| 61 | ||
| 62 | ||
| 63 | ||
| 64 | ||
| 65 | ||
| 66 | ||
| 67 | ||
| 68 | ||
| 69 |
| 1 |
#' is_variable_different |
|
| 2 |
#' |
|
| 3 |
#' This subsets the data set on the variable name, picks out differences and returns a tibble |
|
| 4 |
#' of differences for the given variable |
|
| 5 |
#' @importFrom tibble as_tibble |
|
| 6 |
#' @param variablename name of variable being compared |
|
| 7 |
#' @param keynames name of keys |
|
| 8 |
#' @param datain Inputted dataset with base and compare vectors |
|
| 9 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 10 |
#' @return A boolean vector which is T if target and current are different |
|
| 11 |
is_variable_different <- function (variablename, keynames, datain, ...) {
|
|
| 12 | ||
| 13 | 991x |
xvar <- paste0(variablename,'.x') |
| 14 | 991x |
yvar <- paste0(variablename,'.y') |
| 15 |
|
|
| 16 | 991x |
if ( ! xvar %in% names(datain) | ! yvar %in% names(datain)){
|
| 17 | ! |
stop("Variable does not exist within input dataset")
|
| 18 |
} |
|
| 19 |
|
|
| 20 | 991x |
target <- datain[[xvar]] |
| 21 | 991x |
current <- datain[[yvar]] |
| 22 | 991x |
outvect <- find_difference(target, current, ...) |
| 23 |
|
|
| 24 | 991x |
datain[["VARIABLE"]] <- variablename |
| 25 |
|
|
| 26 | 991x |
names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")
|
| 27 |
|
|
| 28 | 991x |
x <- as_tibble( |
| 29 | 991x |
subset( |
| 30 | 991x |
datain, |
| 31 | 991x |
outvect, |
| 32 | 991x |
select = c("VARIABLE", keynames, "BASE", "COMPARE")
|
| 33 |
) |
|
| 34 |
) |
|
| 35 |
|
|
| 36 | 991x |
return(x) |
| 37 |
} |
|
| 38 | ||
| 39 |
#' compare_vectors |
|
| 40 |
#' |
|
| 41 |
#' Compare two vectors looking for differences |
|
| 42 |
#' |
|
| 43 |
#' @param target the base vector |
|
| 44 |
#' @param current a vector to compare target to |
|
| 45 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 46 |
compare_vectors <- function (target, current, ...) {
|
|
| 47 | 1020x |
UseMethod("compare_vectors")
|
| 48 |
} |
|
| 49 | ||
| 50 | ||
| 51 |
#' find_difference |
|
| 52 |
#' |
|
| 53 |
#' This determines if two vectors are different. It expects vectors of the same |
|
| 54 |
#' length and type, and is intended to be used after checks have already been done |
|
| 55 |
#' Initially picks out any nas (matching nas count as a match) |
|
| 56 |
#' Then compares remaining vector |
|
| 57 |
#' |
|
| 58 |
#' @param target the base vector |
|
| 59 |
#' @param current a vector to compare target to |
|
| 60 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 61 |
find_difference <- function (target, current, ...) {
|
|
| 62 |
|
|
| 63 | 1023x |
if( length(target) != length(current)){
|
| 64 | 2x |
warning("Inputs are not of the same length")
|
| 65 | 2x |
return(NULL) |
| 66 |
} |
|
| 67 |
|
|
| 68 | 1021x |
if( is.null(target) | is.null(current) ){
|
| 69 | 1x |
return( is.null(target) != is.null(current) ) |
| 70 |
} |
|
| 71 |
|
|
| 72 |
### Initalise output, assume problem unless evidence otherwise |
|
| 73 | 1020x |
return_vector <- rep(TRUE, length(target)) |
| 74 |
|
|
| 75 | 1020x |
nas_t <- is.na(target) |
| 76 | 1020x |
nas_c <- is.na(current) |
| 77 |
|
|
| 78 |
## compare missing values |
|
| 79 | 1020x |
nacompare <- nas_t != nas_c |
| 80 | 1020x |
naselect <- nas_t|nas_c |
| 81 | 1020x |
return_vector[naselect] <- nacompare[naselect] |
| 82 |
|
|
| 83 |
## compare non-missing values |
|
| 84 | 1020x |
selectvector <- as.logical( (!nas_t) * (!nas_c) ) |
| 85 |
|
|
| 86 | 1020x |
comparevect <- compare_vectors( |
| 87 | 1020x |
target[selectvector] , |
| 88 | 1020x |
current[selectvector], |
| 89 |
... |
|
| 90 |
) |
|
| 91 |
|
|
| 92 | 1020x |
return_vector[selectvector] <- comparevect |
| 93 |
|
|
| 94 | 1020x |
return(return_vector) |
| 95 |
} |
|
| 96 | ||
| 97 | ||
| 98 | ||
| 99 | ||
| 100 | ||
| 101 | ||
| 102 | ||
| 103 |
#' compare_vectors.default |
|
| 104 |
#' |
|
| 105 |
#' Default method, if the vector is not numeric or factor. Basic comparison |
|
| 106 |
#' @param target the base vector |
|
| 107 |
#' @param current a vector to compare target to |
|
| 108 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 109 |
compare_vectors.default <- function(target, current, ...){
|
|
| 110 | 445x |
target != current |
| 111 |
} |
|
| 112 | ||
| 113 | ||
| 114 | ||
| 115 | ||
| 116 |
#' compare_vectors.factor |
|
| 117 |
#' |
|
| 118 |
#' Compares factors. Sets them as character and then compares |
|
| 119 |
#' @param target the base vector |
|
| 120 |
#' @param current a vector to compare target to |
|
| 121 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 122 |
compare_vectors.factor <- function(target, current, ...){
|
|
| 123 | 90x |
as.character(target) != as.character(current) |
| 124 |
} |
|
| 125 | ||
| 126 | ||
| 127 | ||
| 128 | ||
| 129 | ||
| 130 |
#' compare_vectors.numeric |
|
| 131 |
#' |
|
| 132 |
#' This is a modified version of the all.equal function |
|
| 133 |
#' which returns a vector rather than a message |
|
| 134 |
#' @param target the base vector |
|
| 135 |
#' @param current a vector to compare target to |
|
| 136 |
#' @param tolerance Level of tolerance for differences between two variables |
|
| 137 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
| 138 |
compare_vectors.numeric <- function( |
|
| 139 |
target, |
|
| 140 |
current, |
|
| 141 |
tolerance = sqrt(.Machine$double.eps), |
|
| 142 |
scale = NULL |
|
| 143 |
){
|
|
| 144 | ||
| 145 | 485x |
out <- target == current |
| 146 |
|
|
| 147 | 485x |
if (all(out)) {
|
| 148 | 451x |
return(!out) |
| 149 |
} |
|
| 150 |
|
|
| 151 | 34x |
if (is.integer(target) || is.integer(current)){
|
| 152 | 11x |
target <- as.double(target) |
| 153 | 11x |
current <- as.double(current) |
| 154 |
} |
|
| 155 |
|
|
| 156 | 34x |
xy <- abs(target - current) |
| 157 |
|
|
| 158 | 34x |
if (!is.null(scale)) {
|
| 159 | 2x |
xy <- xy/scale |
| 160 |
} |
|
| 161 |
|
|
| 162 | 34x |
return(xy > tolerance) |
| 163 | ||
| 164 |
} |
|
| 165 | ||
| 166 | ||
| 167 | ||
| 168 |
| 1 |
#'generate_keyname |
|
| 2 |
#' |
|
| 3 |
#'Function to generate a name for the keys if not provided |
|
| 4 |
#' |
|
| 5 |
#'@param BASE base dataset |
|
| 6 |
#'@param COMP comparison dataset |
|
| 7 |
#'@param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity |
|
| 8 |
#' |
|
| 9 |
generate_keyname <- function( |
|
| 10 |
BASE, |
|
| 11 |
COMP, |
|
| 12 |
replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
|
|
| 13 |
){
|
|
| 14 |
|
|
| 15 | 213x |
if ( class(replace_names) != "character"){
|
| 16 | ! |
stop( "replace_names is not a character vector") |
| 17 |
} |
|
| 18 |
|
|
| 19 | 213x |
if (length(replace_names) == 0) {
|
| 20 | 8x |
stop("All default row names are in use in BASE/COMPARE. Please provide a KEY argument")
|
| 21 |
} |
|
| 22 |
|
|
| 23 | 205x |
key_name <- replace_names[1] |
| 24 |
|
|
| 25 | 205x |
if (!is.null(BASE[[key_name]]) | !is.null( COMP[[key_name]])){
|
| 26 | 66x |
key_name <- generate_keyname(BASE, COMP, replace_names[-1]) |
| 27 |
} |
|
| 28 |
|
|
| 29 | 173x |
return(key_name) |
| 30 |
|
|
| 31 |
} |
| 1 | ||
| 2 |
#' sort_then_join |
|
| 3 |
#' |
|
| 4 |
#' Convenience function to sort two strings and paste them together |
|
| 5 |
#' @param string1 first string |
|
| 6 |
#' @param string2 second string |
|
| 7 |
sort_then_join <- function(string1, string2){
|
|
| 8 | 14x |
paste0(sort(c(string1, string2)), collapse = '') |
| 9 |
} |
|
| 10 | ||
| 11 | ||
| 12 |
#' class_merge |
|
| 13 |
#' |
|
| 14 |
#' Convenience function to put all classes an object has into one string |
|
| 15 |
#' @param x an object |
|
| 16 |
class_merge <- function(x){
|
|
| 17 | 304x |
paste(class(x), collapse = "_") |
| 18 |
} |
|
| 19 | ||
| 20 | ||
| 21 |
get_message <- function( colname , whichdat, totype){
|
|
| 22 | 12x |
message(paste0( |
| 23 | 12x |
"NOTE: Variable " , colname, " in " , tolower(whichdat) , " was casted to " , totype |
| 24 |
)) |
|
| 25 |
} |
|
| 26 | ||
| 27 | ||
| 28 |
#' get_casted_vector |
|
| 29 |
#' |
|
| 30 |
#' casts a vector depending on its type and input |
|
| 31 |
#' @param colin column to cast |
|
| 32 |
#' @param colname name of vector |
|
| 33 |
#' @param whichdat whether base or compare is being casted (used for messages) |
|
| 34 |
get_casted_vector <- function(colin, colname, whichdat){
|
|
| 35 |
|
|
| 36 | 24x |
if ( class(colin) == "factor"){
|
| 37 | 6x |
get_message( colname , whichdat , "character") |
| 38 | 6x |
return(as.character(colin)) |
| 39 |
} |
|
| 40 |
|
|
| 41 | 18x |
if ( class(colin) == "integer"){
|
| 42 | 6x |
get_message( colname , whichdat , "numeric") |
| 43 | 6x |
return(as.numeric(colin)) |
| 44 |
} |
|
| 45 |
|
|
| 46 | 12x |
colin |
| 47 |
} |
|
| 48 | ||
| 49 | ||
| 50 | ||
| 51 |
#' get_casted_dataset |
|
| 52 |
#' |
|
| 53 |
#' Internal utility function to loop across a dataset casting all target |
|
| 54 |
#' variables |
|
| 55 |
#' @param df dataset to be casted |
|
| 56 |
#' @param columns columns to be casted |
|
| 57 |
#' @param whichdat whether base or compare is being casted (used for messages) |
|
| 58 |
get_casted_dataset <- function(df , columns , whichdat){
|
|
| 59 | 22x |
for ( col in columns ){
|
| 60 | 24x |
df[[col]] <- get_casted_vector( df[[col]] , col , whichdat ) |
| 61 |
} |
|
| 62 | 22x |
return(df) |
| 63 |
} |
|
| 64 | ||
| 65 | ||
| 66 | ||
| 67 | ||
| 68 | ||
| 69 | ||
| 70 |
#' cast_variables |
|
| 71 |
#' |
|
| 72 |
#' Function to cast datasets columns if they have differing types |
|
| 73 |
#' Restricted to specific cases, currently integer and double, and character and factor |
|
| 74 |
#' @param BASE base dataset |
|
| 75 |
#' @param COMPARE comparison dataset |
|
| 76 |
#' @param ignore_vars Variables not to be considered for casting |
|
| 77 |
#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles |
|
| 78 |
#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters |
|
| 79 |
cast_variables <- function( |
|
| 80 |
BASE, |
|
| 81 |
COMPARE, |
|
| 82 |
ignore_vars = NULL, |
|
| 83 |
cast_integers = FALSE , |
|
| 84 |
cast_factors = FALSE |
|
| 85 |
){
|
|
| 86 | ||
| 87 | 13x |
allowed_class_casts <- c( "integernumeric" , "characterfactor")[c(cast_integers, cast_factors)] |
| 88 |
|
|
| 89 | 13x |
BASE_class <- data.frame( |
| 90 | 13x |
class_BASE = sapply(BASE, class_merge), |
| 91 | 13x |
colname = names(BASE), |
| 92 | 13x |
stringsAsFactors = FALSE |
| 93 |
) |
|
| 94 | 13x |
BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars,, drop=FALSE] |
| 95 |
|
|
| 96 |
|
|
| 97 | 13x |
COMPARE_class <- data.frame( |
| 98 | 13x |
class_COMPARE = sapply(COMPARE, class_merge), |
| 99 | 13x |
colname = names(COMPARE), |
| 100 | 13x |
stringsAsFactors = FALSE |
| 101 |
) |
|
| 102 | 13x |
COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars ,, drop=FALSE] |
| 103 |
|
|
| 104 | 13x |
common_class <- merge( |
| 105 | 13x |
x = BASE_class, |
| 106 | 13x |
y = COMPARE_class, |
| 107 | 13x |
by = "colname" |
| 108 |
) |
|
| 109 |
|
|
| 110 |
|
|
| 111 | 13x |
diff_class <- common_class[ common_class[["class_BASE"]] != common_class[["class_COMPARE"]] ,,drop=FALSE] |
| 112 | ||
| 113 |
|
|
| 114 | 13x |
diff_class$classmerge <- mapply( |
| 115 | 13x |
sort_then_join, |
| 116 | 13x |
diff_class$class_COMPARE, |
| 117 | 13x |
diff_class$class_BASE |
| 118 |
) |
|
| 119 |
|
|
| 120 |
|
|
| 121 | 13x |
cast_columns <- diff_class[ diff_class[["classmerge"]] %in% allowed_class_casts ,,drop=FALSE] |
| 122 |
|
|
| 123 |
|
|
| 124 | 13x |
DATASETS <- list( |
| 125 | 13x |
"BASE" = BASE, |
| 126 | 13x |
"COMPARE" = COMPARE |
| 127 |
) |
|
| 128 |
|
|
| 129 | ||
| 130 | 13x |
if( nrow(cast_columns) == 0 ){
|
| 131 | 2x |
return( DATASETS ) |
| 132 |
} |
|
| 133 |
|
|
| 134 |
|
|
| 135 | 11x |
for ( i in names(DATASETS)){
|
| 136 | 22x |
DATASETS[[i]] <- get_casted_dataset( DATASETS[[i]] , cast_columns$colname , i) |
| 137 |
} |
|
| 138 |
|
|
| 139 |
|
|
| 140 | 11x |
return(DATASETS) |
| 141 |
} |
|
| 142 | ||
| 143 | ||
| 144 | ||
| 145 | ||
| 146 | ||
| 147 | ||
| 148 | ||
| 149 | ||
| 150 | ||
| 151 | ||
| 152 | ||
| 153 | ||
| 154 | ||
| 155 | ||
| 156 | ||
| 157 | ||
| 158 | ||
| 159 | ||
| 160 | ||
| 161 | ||
| 162 | ||
| 163 | ||
| 164 | ||
| 165 | ||
| 166 | ||
| 167 | ||
| 168 | ||
| 169 | ||
| 170 | ||
| 171 | ||
| 172 | ||
| 173 |
| 1 | ||
| 2 | ||
| 3 | ||
| 4 |
#' factor_to_character |
|
| 5 |
#' |
|
| 6 |
#' Takes a dataframe and converts any factor variables to character |
|
| 7 |
#' @param dsin input dataframe |
|
| 8 |
#' @param vars variables to consider for conversion. Default NULL will consider |
|
| 9 |
#' every variable within the dataset |
|
| 10 |
factor_to_character <- function( dsin , vars = NULL){
|
|
| 11 |
|
|
| 12 | ! |
if ( is.null(vars) ) vars = names(dsin) |
| 13 |
|
|
| 14 | 258x |
for (var in vars){
|
| 15 | 272x |
if( is.factor(dsin[[var]])){
|
| 16 | ! |
dsin[[var]] <- as.character(dsin[[var]]) |
| 17 |
} |
|
| 18 |
} |
|
| 19 | 258x |
return(dsin) |
| 20 |
} |
|
| 21 | ||
| 22 | ||
| 23 | ||
| 24 | ||
| 25 |
#' has_unique_rows |
|
| 26 |
#' |
|
| 27 |
#' Check if a data sets rows are unique |
|
| 28 |
#' @param DAT input data set (data frame) |
|
| 29 |
#' @param KEYS Set of keys which should be unique |
|
| 30 |
has_unique_rows <- function(DAT , KEYS){
|
|
| 31 | 259x |
DUPS <- duplicated( subset(DAT , select= KEYS) ) |
| 32 | 259x |
NDUPS <- sum( DUPS) |
| 33 | 259x |
return( NDUPS == 0 ) |
| 34 |
} |
|
| 35 | ||
| 36 |
#'convert_to_issue |
|
| 37 |
#' |
|
| 38 |
#'converts the count value into the correct issue format |
|
| 39 |
#'@param datin data inputted |
|
| 40 |
#'@importFrom tibble rownames_to_column |
|
| 41 |
convert_to_issue <- function(datin){
|
|
| 42 | 128x |
datin_tibble <- tibble( |
| 43 | 128x |
`Variable` = names(datin), |
| 44 | 128x |
`No of Differences` = datin |
| 45 |
) |
|
| 46 |
|
|
| 47 | 128x |
datin_tibble_reduced <- datin_tibble[ datin_tibble[["No of Differences"]] > 0, , drop = FALSE] |
| 48 | 128x |
return(datin_tibble_reduced) |
| 49 |
} |
|
| 50 |