| 1 |
#' Generate unique key name |
|
| 2 |
#' |
|
| 3 |
#' Function to generate a name for the keys if not provided |
|
| 4 |
#' |
|
| 5 |
#' @param BASE base dataset |
|
| 6 |
#' @param COMP comparison dataset |
|
| 7 |
#' @param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity |
|
| 8 |
#' |
|
| 9 |
#' @keywords internal |
|
| 10 |
generate_keyname <- function( |
|
| 11 |
BASE, |
|
| 12 |
COMP, |
|
| 13 |
replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..")
|
|
| 14 |
) {
|
|
| 15 | 199x |
assertthat::assert_that( |
| 16 | 199x |
is(replace_names, "character"), |
| 17 | 199x |
msg = "replace_names is not a character vector" |
| 18 |
) |
|
| 19 | ||
| 20 | 199x |
assertthat::assert_that( |
| 21 | 199x |
length(replace_names) != 0, |
| 22 | 199x |
msg = "All default row names are in use in BASE/COMPARE. Please provide a KEY argument" |
| 23 |
) |
|
| 24 | ||
| 25 | ||
| 26 | 191x |
key_name <- replace_names[1] |
| 27 | ||
| 28 | 191x |
if (!is.null(BASE[[key_name]]) || !is.null(COMP[[key_name]])) {
|
| 29 | 66x |
key_name <- generate_keyname(BASE, COMP, replace_names[-1]) |
| 30 |
} |
|
| 31 | 159x |
return(key_name) |
| 32 |
} |
| 1 |
#' diffdf |
|
| 2 |
#' @description |
|
| 3 |
#' Compares 2 dataframes and outputs any differences. |
|
| 4 |
#' @param base input dataframe |
|
| 5 |
#' @param compare comparison dataframe |
|
| 6 |
#' @param keys vector of variables (as strings) that defines a unique row in |
|
| 7 |
#' the base and compare dataframes |
|
| 8 |
#' @param strict_numeric Flag for strict numeric to numeric comparisons |
|
| 9 |
#' (default = TRUE). If False diffdf will cast integer to double where |
|
| 10 |
#' required for comparisons. Note that variables specified in the keys |
|
| 11 |
#' will never be casted. |
|
| 12 |
#' @param strict_factor Flag for strict factor to character comparisons |
|
| 13 |
#' (default = TRUE). If False diffdf will cast factors to characters where |
|
| 14 |
#' required for comparisons. Note that variables specified in the keys will |
|
| 15 |
#' never be casted. |
|
| 16 |
#' @param suppress_warnings Do you want to suppress warnings? (logical) |
|
| 17 |
#' @param file Location and name of a text file to output the results to. |
|
| 18 |
#' Setting to NULL will cause no file to be produced. |
|
| 19 |
#' @param tolerance Set tolerance for numeric comparisons. Note that |
|
| 20 |
#' comparisons fail if (x-y)/scale > tolerance. |
|
| 21 |
#' @param scale Set scale for numeric comparisons. Note that comparisons fail |
|
| 22 |
#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient |
|
| 23 |
#' version of scale = 1. |
|
| 24 |
#' @param check_column_order Should the column ordering be checked? (logical) |
|
| 25 |
#' @param check_df_class Do you want to check for differences in the class |
|
| 26 |
#' between `base` and `compare`? (logical) |
|
| 27 |
#' @examples |
|
| 28 |
#' x <- subset(iris, -Species) |
|
| 29 |
#' x[1, 2] <- 5 |
|
| 30 |
#' COMPARE <- diffdf(iris, x) |
|
| 31 |
#' print(COMPARE) |
|
| 32 |
#' |
|
| 33 |
#' #### Sample data frames |
|
| 34 |
#' |
|
| 35 |
#' DF1 <- data.frame( |
|
| 36 |
#' id = c(1, 2, 3, 4, 5, 6), |
|
| 37 |
#' v1 = letters[1:6], |
|
| 38 |
#' v2 = c(NA, NA, 1, 2, 3, NA) |
|
| 39 |
#' ) |
|
| 40 |
#' |
|
| 41 |
#' DF2 <- data.frame( |
|
| 42 |
#' id = c(1, 2, 3, 4, 5, 7), |
|
| 43 |
#' v1 = letters[1:6], |
|
| 44 |
#' v2 = c(NA, NA, 1, 2, NA, NA), |
|
| 45 |
#' v3 = c(NA, NA, 1, 2, NA, 4) |
|
| 46 |
#' ) |
|
| 47 |
#' |
|
| 48 |
#' diffdf(DF1, DF1, keys = "id") |
|
| 49 |
#' |
|
| 50 |
#' # We can control matching with scale/location for example: |
|
| 51 |
#' |
|
| 52 |
#' DF1 <- data.frame( |
|
| 53 |
#' id = c(1, 2, 3, 4, 5, 6), |
|
| 54 |
#' v1 = letters[1:6], |
|
| 55 |
#' v2 = c(1, 2, 3, 4, 5, 6) |
|
| 56 |
#' ) |
|
| 57 |
#' DF2 <- data.frame( |
|
| 58 |
#' id = c(1, 2, 3, 4, 5, 6), |
|
| 59 |
#' v1 = letters[1:6], |
|
| 60 |
#' v2 = c(1.1, 2, 3, 4, 5, 6) |
|
| 61 |
#' ) |
|
| 62 |
#' |
|
| 63 |
#' diffdf(DF1, DF2, keys = "id") |
|
| 64 |
#' diffdf(DF1, DF2, keys = "id", tolerance = 0.2) |
|
| 65 |
#' diffdf(DF1, DF2, keys = "id", scale = 10, tolerance = 0.2) |
|
| 66 |
#' |
|
| 67 |
#' # We can use strict_factor to compare factors with characters for example: |
|
| 68 |
#' |
|
| 69 |
#' DF1 <- data.frame( |
|
| 70 |
#' id = c(1, 2, 3, 4, 5, 6), |
|
| 71 |
#' v1 = letters[1:6], |
|
| 72 |
#' v2 = c(NA, NA, 1, 2, 3, NA), |
|
| 73 |
#' stringsAsFactors = FALSE |
|
| 74 |
#' ) |
|
| 75 |
#' |
|
| 76 |
#' DF2 <- data.frame( |
|
| 77 |
#' id = c(1, 2, 3, 4, 5, 6), |
|
| 78 |
#' v1 = letters[1:6], |
|
| 79 |
#' v2 = c(NA, NA, 1, 2, 3, NA) |
|
| 80 |
#' ) |
|
| 81 |
#' |
|
| 82 |
#' diffdf(DF1, DF2, keys = "id", strict_factor = TRUE) |
|
| 83 |
#' diffdf(DF1, DF2, keys = "id", strict_factor = FALSE) |
|
| 84 |
#' |
|
| 85 |
#' @export |
|
| 86 |
diffdf <- function( |
|
| 87 |
base, |
|
| 88 |
compare, |
|
| 89 |
keys = NULL, |
|
| 90 |
suppress_warnings = FALSE, |
|
| 91 |
strict_numeric = TRUE, |
|
| 92 |
strict_factor = TRUE, |
|
| 93 |
file = NULL, |
|
| 94 |
tolerance = sqrt(.Machine$double.eps), |
|
| 95 |
scale = NULL, |
|
| 96 |
check_column_order = FALSE, |
|
| 97 |
check_df_class = FALSE |
|
| 98 |
) {
|
|
| 99 | ||
| 100 | 131x |
assertthat::assert_that( |
| 101 | 131x |
assertthat::is.flag(check_df_class), |
| 102 | 131x |
!is.na(check_df_class), |
| 103 | 131x |
msg = "`check_df_class` must be a length 1 logical" |
| 104 |
) |
|
| 105 | ||
| 106 | 131x |
BASE <- base |
| 107 | 131x |
COMP <- compare |
| 108 | 131x |
KEYS <- keys |
| 109 | 131x |
SUPWARN <- suppress_warnings |
| 110 | ||
| 111 |
### Initatiate output object |
|
| 112 | 131x |
COMPARE <- list() |
| 113 | 131x |
class(COMPARE) <- c("diffdf", "list")
|
| 114 | ||
| 115 | ||
| 116 | 131x |
BASE_NAME <- deparse(substitute(base), width.cutoff = 30L) |
| 117 | 131x |
if (length(BASE_NAME) > 1) {
|
| 118 | 2x |
BASE_NAME <- "<BASE>" |
| 119 |
} |
|
| 120 | 131x |
COMP_NAME <- deparse(substitute(compare), width.cutoff = 30L) |
| 121 | 131x |
if (length(COMP_NAME) > 1) {
|
| 122 | ! |
COMP_NAME <- "<COMPARE>" |
| 123 |
} |
|
| 124 | ||
| 125 | 131x |
COMPARE[["DataSummary"]] <- construct_issue( |
| 126 | 131x |
value = describe_dataframe(BASE, COMP, BASE_NAME, COMP_NAME), |
| 127 | 131x |
message = "Summary of BASE and COMPARE" |
| 128 |
) |
|
| 129 | ||
| 130 | ||
| 131 | 131x |
is_derived <- FALSE |
| 132 | ||
| 133 |
### If no key is suplied match values based upon row number |
|
| 134 | 131x |
if (is.null(KEYS)) {
|
| 135 | 108x |
is_derived <- TRUE |
| 136 | 108x |
keyname <- generate_keyname(BASE, COMP) |
| 137 | 108x |
BASE[[keyname]] <- seq_len(nrow(BASE)) |
| 138 | 108x |
COMP[[keyname]] <- seq_len(nrow(COMP)) |
| 139 | 108x |
KEYS <- keyname |
| 140 |
} |
|
| 141 | 131x |
attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived) |
| 142 | ||
| 143 | 131x |
assertthat::assert_that( |
| 144 | 131x |
is.numeric(tolerance), |
| 145 | 131x |
is.numeric(scale) || is.null(scale) |
| 146 |
) |
|
| 147 | ||
| 148 | 127x |
missing_keys_base <- KEYS[!KEYS %in% names(BASE)] |
| 149 | 127x |
assertthat::assert_that( |
| 150 | 127x |
length(missing_keys_base) == 0, |
| 151 | 127x |
msg = sprintf( |
| 152 | 127x |
"The following KEYS are not available in BASE:\n %s", |
| 153 | 127x |
paste(missing_keys_base, collapse = "\n ") |
| 154 |
) |
|
| 155 |
) |
|
| 156 | ||
| 157 | 126x |
missing_keys_comp <- KEYS[!KEYS %in% names(COMP)] |
| 158 | 126x |
assertthat::assert_that( |
| 159 | 126x |
length(missing_keys_comp) == 0, |
| 160 | 126x |
msg = sprintf( |
| 161 | 126x |
"The following KEYS are not available in COMPARE:\n %s", |
| 162 | 126x |
paste(missing_keys_comp, collapse = "\n ") |
| 163 |
) |
|
| 164 |
) |
|
| 165 | ||
| 166 | 125x |
assertthat::assert_that( |
| 167 | 125x |
has_unique_rows(BASE, KEYS), |
| 168 | 125x |
msg = "BY variables in BASE do not result in unique observations" |
| 169 |
) |
|
| 170 | ||
| 171 | 124x |
assertthat::assert_that( |
| 172 | 124x |
has_unique_rows(COMP, KEYS), |
| 173 | 124x |
msg = "BY variables in COMPARE do not result in unique observations" |
| 174 |
) |
|
| 175 | ||
| 176 | ||
| 177 | ||
| 178 |
#### Check essential variable properties (class & mode) |
|
| 179 | ||
| 180 | 124x |
COMPARE[["UnsupportedColsBase"]] <- construct_issue( |
| 181 | 124x |
value = identify_unsupported_cols(BASE), |
| 182 | 124x |
message = "There are columns in BASE with unsupported modes !!" |
| 183 |
) |
|
| 184 | ||
| 185 | ||
| 186 | 124x |
COMPARE[["UnsupportedColsComp"]] <- construct_issue( |
| 187 | 124x |
value = identify_unsupported_cols(COMP), |
| 188 | 124x |
message = "There are columns in COMPARE with unsupported modes !!" |
| 189 |
) |
|
| 190 | ||
| 191 | ||
| 192 |
# cast variables if strict is off |
|
| 193 | 124x |
if (!strict_factor || !strict_numeric) {
|
| 194 | 9x |
casted_df <- cast_variables( |
| 195 | 9x |
BASE = BASE, |
| 196 | 9x |
COMPARE = COMP, |
| 197 | 9x |
ignore_vars = KEYS, |
| 198 | 9x |
cast_integers = !strict_numeric, |
| 199 | 9x |
cast_factors = !strict_factor |
| 200 |
) |
|
| 201 | ||
| 202 | 9x |
BASE <- casted_df$BASE |
| 203 | 9x |
COMP <- casted_df$COMP |
| 204 |
} |
|
| 205 | ||
| 206 | ||
| 207 | 124x |
COMPARE[["VarModeDiffs"]] <- construct_issue( |
| 208 | 124x |
value = identify_mode_differences(BASE, COMP), |
| 209 | 124x |
message = "There are columns in BASE and COMPARE with different modes !!" |
| 210 |
) |
|
| 211 | ||
| 212 | ||
| 213 | 124x |
COMPARE[["VarClassDiffs"]] <- construct_issue( |
| 214 | 124x |
value = identify_class_differences(BASE, COMP), |
| 215 | 124x |
message = "There are columns in BASE and COMPARE with different classes !!" |
| 216 |
) |
|
| 217 | ||
| 218 | ||
| 219 | ||
| 220 | ||
| 221 |
##### Check Validity of Keys |
|
| 222 | ||
| 223 | 124x |
BASE_keys <- names(BASE)[names(BASE) %in% KEYS] |
| 224 | 124x |
COMP_keys <- names(COMP)[names(COMP) %in% KEYS] |
| 225 | ||
| 226 | 124x |
assertthat::assert_that( |
| 227 | 124x |
length(BASE_keys) == length(KEYS), |
| 228 | 124x |
msg = "BASE is missing variables specified in KEYS" |
| 229 |
) |
|
| 230 | ||
| 231 | 124x |
assertthat::assert_that( |
| 232 | 124x |
length(COMP_keys) == length(KEYS), |
| 233 | 124x |
msg = "COMP is missing variables specified in KEYS" |
| 234 |
) |
|
| 235 | ||
| 236 | ||
| 237 | 124x |
assert_valid_keys( |
| 238 | 124x |
COMPARE, KEYS, "UnsupportedColsBase", |
| 239 | 124x |
"The following KEYS in BASE have an unsupported mode (see `?mode()`)" |
| 240 |
) |
|
| 241 | 123x |
assert_valid_keys( |
| 242 | 123x |
COMPARE, KEYS, "UnsupportedColsComp", |
| 243 | 123x |
"The following KEYS in COMPARE have an unsupported mode (see `?mode()`)" |
| 244 |
) |
|
| 245 | 122x |
assert_valid_keys( |
| 246 | 122x |
COMPARE, KEYS, "VarModeDiffs", |
| 247 | 122x |
"The following KEYS have different modes between BASE and COMPARE" |
| 248 |
) |
|
| 249 | 120x |
assert_valid_keys( |
| 250 | 120x |
COMPARE, KEYS, "VarClassDiffs", |
| 251 | 120x |
"The following KEYS have different classes between BASE and COMPARE" |
| 252 |
) |
|
| 253 | ||
| 254 | ||
| 255 | 119x |
exclude_cols <- c( |
| 256 | 119x |
COMPARE[["UnsupportedColsBase"]]$VARIABLE, |
| 257 | 119x |
COMPARE[["UnsupportedColsComp"]]$VARIABLE, |
| 258 | 119x |
COMPARE[["VarClassDiffs"]]$VARIABLE, |
| 259 | 119x |
COMPARE[["VarModeDiffs"]]$VARIABLE |
| 260 |
) |
|
| 261 | ||
| 262 | 119x |
if (check_column_order) {
|
| 263 | 5x |
if (attr(COMPARE, "keys")$is_derived) {
|
| 264 | 4x |
keep_vars_base <- !(names(BASE) %in% attr(COMPARE, "keys")$value) |
| 265 | 4x |
keep_vars_comp <- !(names(COMP) %in% attr(COMPARE, "keys")$value) |
| 266 |
} else {
|
|
| 267 | 1x |
keep_vars_base <- TRUE |
| 268 | 1x |
keep_vars_comp <- TRUE |
| 269 |
} |
|
| 270 | 5x |
COMPARE[["ColumnOrder"]] <- construct_issue( |
| 271 | 5x |
value = identify_column_order_differences( |
| 272 | 5x |
BASE[, keep_vars_base, drop = FALSE], |
| 273 | 5x |
COMP[, keep_vars_comp, drop = FALSE] |
| 274 |
), |
|
| 275 | 5x |
message = "There are differences in the column ordering between BASE and COMPARE !!" |
| 276 |
) |
|
| 277 |
} |
|
| 278 | ||
| 279 | ||
| 280 |
##### Check Attributes |
|
| 281 | 119x |
COMPARE[["AttribDiffs"]] <- construct_issue( |
| 282 | 119x |
value = identify_att_differences(BASE, COMP, exclude_cols), |
| 283 | 119x |
message = "There are columns in BASE and COMPARE with differing attributes !!" |
| 284 |
) |
|
| 285 | ||
| 286 | ||
| 287 |
##### Check data |
|
| 288 | ||
| 289 | 119x |
BASE <- factor_to_character(BASE, KEYS) |
| 290 | 119x |
COMP <- factor_to_character(COMP, KEYS) |
| 291 | ||
| 292 | ||
| 293 | 119x |
COMPARE[["ExtRowsBase"]] <- construct_issue( |
| 294 | 119x |
value = identify_extra_rows(BASE, COMP, KEYS), |
| 295 | 119x |
message = "There are rows in BASE that are not in COMPARE !!" |
| 296 |
) |
|
| 297 | ||
| 298 | ||
| 299 | 119x |
COMPARE[["ExtRowsComp"]] <- construct_issue( |
| 300 | 119x |
value = identify_extra_rows(COMP, BASE, KEYS), |
| 301 | 119x |
message = "There are rows in COMPARE that are not in BASE !!" |
| 302 |
) |
|
| 303 | ||
| 304 | ||
| 305 | 119x |
COMPARE[["ExtColsBase"]] <- construct_issue( |
| 306 | 119x |
value = identify_extra_cols(BASE, COMP), |
| 307 | 119x |
message = "There are columns in BASE that are not in COMPARE !!" |
| 308 |
) |
|
| 309 | ||
| 310 | ||
| 311 | 119x |
COMPARE[["ExtColsComp"]] <- construct_issue( |
| 312 | 119x |
value = identify_extra_cols(COMP, BASE), |
| 313 | 119x |
message = "There are columns in COMPARE that are not in BASE !!" |
| 314 |
) |
|
| 315 | ||
| 316 | ||
| 317 | 119x |
VALUE_DIFFERENCES <- identify_differences( |
| 318 | 119x |
BASE, COMP, KEYS, exclude_cols, |
| 319 | 119x |
tolerance = tolerance, |
| 320 | 119x |
scale = scale |
| 321 |
) |
|
| 322 | ||
| 323 | ||
| 324 | ||
| 325 |
## Summarise the number of mismatching rows per variable |
|
| 326 | ||
| 327 | 119x |
if (length(VALUE_DIFFERENCES)) {
|
| 328 | 113x |
NDIFF <- sapply(VALUE_DIFFERENCES, nrow) |
| 329 | 113x |
COMPARE[["NumDiff"]] <- construct_issue( |
| 330 | 113x |
value = convert_to_issue(NDIFF), |
| 331 | 113x |
message = "Not all Values Compared Equal" |
| 332 |
) |
|
| 333 |
} |
|
| 334 | ||
| 335 | ||
| 336 | 119x |
for (i in names(VALUE_DIFFERENCES)) {
|
| 337 | 931x |
COMPARE[[paste0("VarDiff_", i)]] <- construct_issue(
|
| 338 | 931x |
value = VALUE_DIFFERENCES[[i]], |
| 339 | 931x |
message = NULL |
| 340 |
) |
|
| 341 |
} |
|
| 342 | ||
| 343 | ||
| 344 |
# suppress warning message of data summary if user didn't request to check it |
|
| 345 |
# we leave the issue in the main compare object though for printing purposes |
|
| 346 | 119x |
COMPARE_WARNINGS <- COMPARE |
| 347 | 119x |
attr(COMPARE_WARNINGS[["DataSummary"]], "message") <- c( |
| 348 | 119x |
"There are differences between the class of BASE and COMPARE" |
| 349 |
) |
|
| 350 | 119x |
if (!check_df_class || identical(class(base), class(compare))) {
|
| 351 | 117x |
COMPARE_WARNINGS["DataSummary"] <- NULL |
| 352 |
} |
|
| 353 | ||
| 354 |
# Get all issue messages, remove blank message, and collapse into single string |
|
| 355 | 119x |
ISSUE_MSGS <- sapply(COMPARE_WARNINGS, function(x) get_issue_message(x)) |
| 356 | 119x |
ISSUE_MSGS <- Filter(function(x) !is.null(x), ISSUE_MSGS) |
| 357 | 119x |
ISSUE_MSGS <- Filter(function(x) x != "", ISSUE_MSGS) |
| 358 | ||
| 359 | 119x |
if (length(ISSUE_MSGS) != 0) {
|
| 360 | 80x |
if (!SUPWARN) {
|
| 361 | 45x |
ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n") |
| 362 | 45x |
warning(c("\n", ISSUE_MSGS))
|
| 363 |
} |
|
| 364 |
} |
|
| 365 | ||
| 366 |
# If the classes are the same and it is the only entry in the compare |
|
| 367 |
# object then remove it in order to trigger "no issues found" |
|
| 368 | 119x |
if (identical(class(base), class(compare)) && length(COMPARE) == 1) {
|
| 369 | 38x |
COMPARE["DataSummary"] <- NULL |
| 370 |
} |
|
| 371 | ||
| 372 |
# If the summary is the only item and the user didn't want to check classes |
|
| 373 |
# then remove the object to trigger the "no issues found" |
|
| 374 | 119x |
if (!check_df_class && length(COMPARE) == 1) {
|
| 375 | 1x |
COMPARE["DataSummary"] <- NULL |
| 376 |
} |
|
| 377 | ||
| 378 | ||
| 379 | 119x |
if (!is.null(file)) {
|
| 380 | 1x |
print(COMPARE, file = file) |
| 381 |
} |
|
| 382 | ||
| 383 | 119x |
return(COMPARE) |
| 384 |
} |
|
| 385 | ||
| 386 | ||
| 387 | ||
| 388 | ||
| 389 |
#' diffdf_has_issues |
|
| 390 |
#' |
|
| 391 |
#' Utility function which returns TRUE if an diffdf |
|
| 392 |
#' object has issues or FALSE if an diffdf object does not have issues |
|
| 393 |
#' @param x diffdf object |
|
| 394 |
#' @examples |
|
| 395 |
#' |
|
| 396 |
#' # Example with no issues |
|
| 397 |
#' x <- diffdf(iris, iris) |
|
| 398 |
#' diffdf_has_issues(x) |
|
| 399 |
#' |
|
| 400 |
#' # Example with issues |
|
| 401 |
#' iris2 <- iris |
|
| 402 |
#' iris2[2, 2] <- NA |
|
| 403 |
#' x <- diffdf(iris, iris2, suppress_warnings = TRUE) |
|
| 404 |
#' diffdf_has_issues(x) |
|
| 405 |
#' @export |
|
| 406 |
diffdf_has_issues <- function(x) {
|
|
| 407 | ! |
if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object")
|
| 408 | 3x |
length(x) != 0 |
| 409 |
} |
|
| 410 | ||
| 411 | ||
| 412 |
#' Assert that keys are valid |
|
| 413 |
#' |
|
| 414 |
#' Utility function to check that user provided "keys" aren't listed as a problem |
|
| 415 |
#' variable of the current list of issues. |
|
| 416 |
#' @param COMPARE (`list`)\cr A named list of which each element is a `data.frame` with the |
|
| 417 |
#' column `VARIABLE` |
|
| 418 |
#' @param KEYS (`character`)\cr name of key variables to check to make sure they don't contain |
|
| 419 |
#' any issues |
|
| 420 |
#' @param component (`character`)\cr name of the component within `COMPARE` to check against |
|
| 421 |
#' @param msg (`character`)\cr error message to print if any of `KEYS` are found within |
|
| 422 |
#' `COMPARE[component]$VARIABLE` |
|
| 423 |
#' @keywords internal |
|
| 424 |
assert_valid_keys <- function(COMPARE, KEYS, component, msg) {
|
|
| 425 | 489x |
keys_reduced <- KEYS[KEYS %in% COMPARE[[component]]$VARIABLE] |
| 426 | 489x |
assertthat::assert_that( |
| 427 | 489x |
length(keys_reduced) == 0, |
| 428 | 489x |
msg = sprintf( |
| 429 | 489x |
"%s:\n%s", |
| 430 | 489x |
msg, |
| 431 | 489x |
paste0("`", paste0(keys_reduced, collapse = "`, `"), "`")
|
| 432 |
) |
|
| 433 |
) |
|
| 434 |
} |
| 1 |
#' Identify Issue Rows |
|
| 2 |
#' |
|
| 3 |
#' This function takes a `diffdf` object and a dataframe and subsets |
|
| 4 |
#' the `data.frame` for problem rows as identified in the comparison object. |
|
| 5 |
#' If \code{vars} has been specified only issue rows associated with those
|
|
| 6 |
#' variable(s) will be returned. |
|
| 7 |
#' @param df dataframe to be subsetted |
|
| 8 |
#' @param diff diffdf object |
|
| 9 |
#' @param vars (optional) character vector containing names of issue variables to subset dataframe |
|
| 10 |
#' on. A value of NULL (default) will be taken to mean available issue variables. |
|
| 11 |
#' @examples |
|
| 12 |
#' iris2 <- iris |
|
| 13 |
#' for (i in 1:3) iris2[i, i] <- 99 |
|
| 14 |
#' x <- diffdf(iris, iris2, suppress_warnings = TRUE) |
|
| 15 |
#' diffdf_issuerows(iris, x) |
|
| 16 |
#' diffdf_issuerows(iris2, x) |
|
| 17 |
#' diffdf_issuerows(iris2, x, vars = "Sepal.Length") |
|
| 18 |
#' diffdf_issuerows(iris2, x, vars = c("Sepal.Length", "Sepal.Width"))
|
|
| 19 |
#' @details |
|
| 20 |
#' Note that `diffdf_issuerows` can be used to subset against any dataframe. The only |
|
| 21 |
#' requirement is that the original variables specified in the keys argument to diffdf |
|
| 22 |
#' are present on the dataframe you are subsetting against. However please note that if |
|
| 23 |
#' no keys were specified in diffdf then the row number is used. This means using |
|
| 24 |
#' `diffdf_issuerows` without a keys against an arbitrary dataset can easily result in |
|
| 25 |
#' nonsense rows being returned. It is always recommended to supply keys to diffdf. |
|
| 26 |
#' @export |
|
| 27 |
diffdf_issuerows <- function(df, diff, vars = NULL) {
|
|
| 28 | ||
| 29 | 19x |
assertthat::assert_that( |
| 30 | 19x |
class(diff)[[1]] == "diffdf" |
| 31 |
) |
|
| 32 | ||
| 33 | ||
| 34 | 19x |
KEYS_ATT <- attr(diff, "keys") |
| 35 | ||
| 36 | 19x |
assertthat::assert_that( |
| 37 | 19x |
!is.null(KEYS_ATT), |
| 38 | 19x |
msg = "diff is missing the keys attribute" |
| 39 |
) |
|
| 40 | ||
| 41 | ||
| 42 | 19x |
issue_vars <- names(diff)[grep("^VarDiff_", names(diff))]
|
| 43 | ||
| 44 | 19x |
if (is.null(vars)) {
|
| 45 | 15x |
vars <- issue_vars |
| 46 |
} else {
|
|
| 47 | 4x |
vars <- paste0("VarDiff_", vars)
|
| 48 |
} |
|
| 49 | ||
| 50 | 19x |
if (length(issue_vars) == 0 || sum(vars %in% issue_vars) == 0) {
|
| 51 | 7x |
return(df[FALSE, ]) |
| 52 |
} |
|
| 53 | ||
| 54 | 12x |
KEEP <- mapply( |
| 55 | 12x |
FUN = get_issue_dataset, |
| 56 | 12x |
issue = vars, |
| 57 | 12x |
diff = list(diff), |
| 58 | 12x |
SIMPLIFY = FALSE |
| 59 |
) |
|
| 60 | ||
| 61 | 12x |
KEEP <- recursive_reduce(KEEP, rbind) |
| 62 | 12x |
KEEP <- KEEP[!duplicated(KEEP), ] |
| 63 | ||
| 64 | 12x |
if (KEYS_ATT$is_derived) {
|
| 65 | 2x |
df[[KEYS_ATT$value]] <- seq_len(nrow(df)) |
| 66 |
} |
|
| 67 | ||
| 68 | 12x |
keys <- KEYS_ATT$value |
| 69 | ||
| 70 | 12x |
if (any(!keys %in% names(df))) {
|
| 71 | 1x |
stop("df does not contain all variables specified as keys in diff")
|
| 72 |
} |
|
| 73 | ||
| 74 | 11x |
RET <- merge( |
| 75 | 11x |
x = df, |
| 76 | 11x |
y = KEEP, |
| 77 | 11x |
sort = TRUE |
| 78 |
) |
|
| 79 | ||
| 80 | 11x |
RET <- RET[do.call("order", RET[keys]), ]
|
| 81 | ||
| 82 | 11x |
if (KEYS_ATT$is_derived) {
|
| 83 | 2x |
keep_vars <- !names(RET) %in% KEYS_ATT$value |
| 84 | 2x |
RET <- RET[, keep_vars, drop = FALSE] |
| 85 |
} |
|
| 86 | ||
| 87 | 11x |
return(RET) |
| 88 |
} |
|
| 89 | ||
| 90 | ||
| 91 | ||
| 92 | ||
| 93 |
#' get_issue_dataset |
|
| 94 |
#' |
|
| 95 |
#' Internal function used by `diffdf_issuerows` to extract the dataframe |
|
| 96 |
#' from each a target issue. In particular it also strips off any |
|
| 97 |
#' non-key variables |
|
| 98 |
#' @param issue name of issue to extract the dataset from diff |
|
| 99 |
#' @param diff diffdf object which contains issues |
|
| 100 |
#' @keywords internal |
|
| 101 |
get_issue_dataset <- function(issue, diff) {
|
|
| 102 | 20x |
issue_df <- diff[[issue]] |
| 103 | 20x |
keep <- names(issue_df)[!(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))]
|
| 104 | 20x |
issue_df[, keep, drop = FALSE] |
| 105 |
} |
| 1 |
#' Pad String |
|
| 2 |
#' |
|
| 3 |
#' Utility function used to replicate `str_pad`. Adds white space to either end |
|
| 4 |
#' of a string to get it to equal the desired length |
|
| 5 |
#' @param x string |
|
| 6 |
#' @param width desired length |
|
| 7 |
#' @keywords internal |
|
| 8 |
string_pad <- function(x, width) {
|
|
| 9 | 1295x |
if (nchar(x) >= width) {
|
| 10 | ! |
return(x) |
| 11 |
} |
|
| 12 | 1295x |
width <- width - nchar(x) |
| 13 | 1295x |
left <- paste0(rep(" ", floor(width / 2)), collapse = "")
|
| 14 | 1295x |
right <- paste0(rep(" ", ceiling(width / 2)), collapse = "")
|
| 15 | 1295x |
paste0(left, x, right, collapse = "") |
| 16 |
} |
|
| 17 | ||
| 18 | ||
| 19 |
#' recursive_reduce |
|
| 20 |
#' |
|
| 21 |
#' Utility function used to replicated `purrr::reduce`. Recursively applies a |
|
| 22 |
#' function to a list of elements until only 1 element remains |
|
| 23 |
#' @param .l list of values to apply a function to |
|
| 24 |
#' @param .f function to apply to each each element of the list in turn. See details. |
|
| 25 |
#' @details |
|
| 26 |
#' This function is essentially performing the following operation: |
|
| 27 |
#' ``` |
|
| 28 |
#' .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]]) |
|
| 29 |
#' ``` |
|
| 30 |
#' @keywords internal |
|
| 31 |
recursive_reduce <- function(.l, .f) {
|
|
| 32 | 587x |
if (length(.l) != 1) {
|
| 33 | 386x |
.l[[2]] <- .f(.l[[1]], .l[[2]]) |
| 34 | 386x |
return(recursive_reduce(.l[-1], .f)) |
| 35 |
} |
|
| 36 | 201x |
.l[[1]] |
| 37 |
} |
|
| 38 | ||
| 39 |
#' invert |
|
| 40 |
#' |
|
| 41 |
#' Utility function used to replicated `purrr::transpose`. Turns a list inside |
|
| 42 |
#' out. |
|
| 43 |
#' @param x list |
|
| 44 |
#' @keywords internal |
|
| 45 |
invert <- function(x) {
|
|
| 46 | 63x |
x2 <- list() |
| 47 | 63x |
cnames <- names(x) |
| 48 | 63x |
tnames <- names(x[[1]]) |
| 49 | 63x |
for (i in tnames) {
|
| 50 | 189x |
x2[[i]] <- list() |
| 51 | 189x |
for (j in cnames) {
|
| 52 | 567x |
x2[[i]][[j]] <- x[[j]][[i]] |
| 53 |
} |
|
| 54 |
} |
|
| 55 | 63x |
x2 |
| 56 |
} |
|
| 57 | ||
| 58 | ||
| 59 | ||
| 60 | ||
| 61 |
#' as_ascii_table |
|
| 62 |
#' |
|
| 63 |
#' This function takes a `data.frame` and attempts to convert it into |
|
| 64 |
#' a simple ascii format suitable for printing to the screen |
|
| 65 |
#' It is assumed all variable values have a `as.character()` method |
|
| 66 |
#' in order to cast them to character. |
|
| 67 |
#' @param dat Input dataset to convert into a ascii table |
|
| 68 |
#' @param line_prefix Symbols to prefix in front of every line of the table |
|
| 69 |
#' @keywords internal |
|
| 70 |
as_ascii_table <- function(dat, line_prefix = " ") {
|
|
| 71 | 63x |
n_col <- ncol(dat) |
| 72 | 63x |
n_row <- nrow(dat) |
| 73 | ||
| 74 |
## Convert every value to character and crop to a suitable length |
|
| 75 | 63x |
dat_char <- lapply(dat, as_fmt_char) |
| 76 | ||
| 77 | ||
| 78 | 63x |
hold <- list() |
| 79 | 63x |
COLS <- colnames(dat) |
| 80 | ||
| 81 |
### For each column extract core elements (width, values , title) and pad out |
|
| 82 |
### each string to be a suitable length |
|
| 83 | 63x |
for (i in seq_len(n_col)) {
|
| 84 | 189x |
COL <- COLS[i] |
| 85 | 189x |
VALUES <- dat_char[[i]] |
| 86 | ||
| 87 | 189x |
JOINT <- c(COL, VALUES) |
| 88 | 189x |
WIDTH <- max(sapply(JOINT, nchar)) + 2 |
| 89 | ||
| 90 | 189x |
hold[[COL]] <- list() |
| 91 | 189x |
hold[[COL]]$WIDTH <- WIDTH |
| 92 | 189x |
hold[[COL]]$VALUES <- sapply(VALUES, string_pad, width = WIDTH) |
| 93 | 189x |
hold[[COL]]$HEADER <- sapply(COL, string_pad, width = WIDTH) |
| 94 |
} |
|
| 95 | ||
| 96 |
### Collapse into a single value per component ( title , values, width ) |
|
| 97 | 63x |
thold <- invert(hold) |
| 98 | 63x |
tvals <- recursive_reduce(thold$VALUES, paste0) |
| 99 | 63x |
thead <- recursive_reduce(thold$HEADER, paste0) |
| 100 | 63x |
twidth <- recursive_reduce(thold$WIDTH, sum) |
| 101 | ||
| 102 |
### Create header and footer lines |
|
| 103 | 63x |
TLINE <- paste0(rep("=", twidth), collapse = "")
|
| 104 | 63x |
LINE <- paste0(rep("-", twidth), collapse = "")
|
| 105 | 63x |
FVALS <- paste0(line_prefix, tvals, collapse = "\n") |
| 106 | ||
| 107 |
### Output table |
|
| 108 | 63x |
paste0( |
| 109 | 63x |
line_prefix, TLINE, "\n", |
| 110 | 63x |
line_prefix, thead, "\n", |
| 111 | 63x |
line_prefix, LINE, "\n", |
| 112 | 63x |
FVALS, "\n", |
| 113 | 63x |
line_prefix, LINE |
| 114 |
) |
|
| 115 |
} |
|
| 116 | ||
| 117 | ||
| 118 |
#' as_character |
|
| 119 |
#' |
|
| 120 |
#' Stub function to enable mocking in unit tests |
|
| 121 |
as_character <- as.character |
|
| 122 | ||
| 123 |
#' Format vector to printable string |
|
| 124 |
#' |
|
| 125 |
#' Coerces a vector of any type into a printable string. The most |
|
| 126 |
#' significant transformation is performed on existing character |
|
| 127 |
#' vectors which will be truncated, have newlines converted |
|
| 128 |
#' to explicit symbols and will be wrapped in quotes if they |
|
| 129 |
#' contain white space. |
|
| 130 |
#' |
|
| 131 |
#' @param x (`vector`) \cr vector to be converted to character |
|
| 132 |
#' @param add_quotes (`logical`) \cr if true will wrap strings that contain |
|
| 133 |
#' whitespace with quotes |
|
| 134 |
#' @param crop_at (`numeric`) \cr specifies the limit at which strings should |
|
| 135 |
#' be truncated to |
|
| 136 |
#' @param ... additional arguments (not currently used) |
|
| 137 |
#' |
|
| 138 |
#' @name as_fmt_char |
|
| 139 |
#' @keywords internal |
|
| 140 |
as_fmt_char <- function(x, ...) {
|
|
| 141 | 292x |
UseMethod("as_fmt_char")
|
| 142 |
} |
|
| 143 | ||
| 144 |
#' @rdname as_fmt_char |
|
| 145 |
#' @export |
|
| 146 |
as_fmt_char.numeric <- function(x, ...) {
|
|
| 147 | 56x |
format(x, digits = 7, justify = "right") |
| 148 |
} |
|
| 149 | ||
| 150 |
#' @rdname as_fmt_char |
|
| 151 |
#' @export |
|
| 152 |
as_fmt_char.NULL <- function(x, ...) {
|
|
| 153 | ! |
"<NULL>" |
| 154 |
} |
|
| 155 | ||
| 156 |
#' @importFrom utils capture.output |
|
| 157 |
#' @rdname as_fmt_char |
|
| 158 |
#' @export |
|
| 159 |
as_fmt_char.list <- function(x, ...) {
|
|
| 160 | 29x |
vapply( |
| 161 | 29x |
x, |
| 162 | 29x |
function(x) {
|
| 163 | 92x |
if (is.numeric(x)) {
|
| 164 | ! |
return(as_fmt_char(x)) |
| 165 |
} |
|
| 166 | 92x |
if (is.character(x) & length(x) == 1) {
|
| 167 | 86x |
return(as_fmt_char(x)) |
| 168 |
} |
|
| 169 | 6x |
as_fmt_char( |
| 170 | 6x |
paste(capture.output(dput(x)), collapse = " "), |
| 171 | 6x |
add_quotes = FALSE |
| 172 |
) |
|
| 173 |
}, |
|
| 174 | 29x |
character(1) |
| 175 |
) |
|
| 176 |
} |
|
| 177 | ||
| 178 |
#' @rdname as_fmt_char |
|
| 179 |
#' @export |
|
| 180 |
as_fmt_char.factor <- function(x, ...) {
|
|
| 181 | 3x |
as_fmt_char(as.character(x)) |
| 182 |
} |
|
| 183 | ||
| 184 |
#' @rdname as_fmt_char |
|
| 185 |
#' @export |
|
| 186 |
as_fmt_char.character <- function(x, add_quotes = TRUE, crop_at = 30, ...) {
|
|
| 187 | 197x |
needs_quotes <- grepl("\\s", x) & add_quotes
|
| 188 | ||
| 189 | 197x |
x[is.na(x)] <- "<NA>" |
| 190 | ||
| 191 |
# Replace \nl \cr with tags to stop print message splitting over |
|
| 192 |
# multiple lines |
|
| 193 | 197x |
x <- gsub("\x0D", "<cr>", x)
|
| 194 | 197x |
x <- gsub("\x0A", "<nl>", x)
|
| 195 | ||
| 196 | 197x |
charlength <- vapply(x, nchar, numeric(1)) |
| 197 | 197x |
x <- substr(x, 1, crop_at) |
| 198 | 197x |
x[charlength > crop_at] <- paste0(x[charlength > crop_at], "...") |
| 199 | ||
| 200 |
# Add enclosing " " around strings with white space so that it can be |
|
| 201 |
# clearly identified in the printed output |
|
| 202 | 197x |
x[needs_quotes] <- paste0('"', x[needs_quotes], '"')
|
| 203 | ||
| 204 | 197x |
x |
| 205 |
} |
|
| 206 | ||
| 207 | ||
| 208 |
#' @rdname as_fmt_char |
|
| 209 |
#' @export |
|
| 210 |
as_fmt_char.default <- function(x, ...) {
|
|
| 211 | 8x |
x_char <- as_character(x) |
| 212 | 8x |
assertthat::assert_that( |
| 213 | 8x |
is.character(x_char), |
| 214 | 8x |
msg = sprintf( |
| 215 | 8x |
"Unable to convert class `'%s'` to character for printing purposes", |
| 216 | 8x |
paste(class(x), collapse = "', '") |
| 217 |
) |
|
| 218 |
) |
|
| 219 | 7x |
as_fmt_char.character(x_char, add_quotes = FALSE) |
| 220 |
} |
|
| 221 | ||
| 222 | ||
| 223 |
#' @rdname as_fmt_char |
|
| 224 |
#' @export |
|
| 225 |
as_fmt_char.POSIXt <- function(x, ...) {
|
|
| 226 | 6x |
x <- format(x, "%Y-%m-%d %H:%M:%S %Z") |
| 227 | 6x |
x[is.na(x)] <- "<NA>" |
| 228 | 6x |
x |
| 229 |
} |
|
| 230 | ||
| 231 | ||
| 232 |
#' get_table |
|
| 233 |
#' |
|
| 234 |
#' Generate nice looking table from a data frame |
|
| 235 |
#' @param dsin dataset |
|
| 236 |
#' @inheritParams print.diffdf |
|
| 237 |
#' @keywords internal |
|
| 238 |
get_table <- function(dsin, row_limit = 10) {
|
|
| 239 | 60x |
if (nrow(dsin) == 0) {
|
| 240 | ! |
return("")
|
| 241 |
} |
|
| 242 | 60x |
if (!is.null(row_limit)) {
|
| 243 | 57x |
assertthat::assert_that( |
| 244 | 57x |
assertthat::is.number(row_limit), |
| 245 | 57x |
row_limit > 0, |
| 246 | 57x |
msg = "row_limit must be a positive integer" |
| 247 |
) |
|
| 248 |
} |
|
| 249 | 60x |
if (is.null(row_limit)) {
|
| 250 | 3x |
display_table <- dsin |
| 251 |
} else {
|
|
| 252 | 57x |
display_table <- subset(dsin, seq_len(nrow(dsin)) < (row_limit + 1)) |
| 253 |
} |
|
| 254 | ||
| 255 | 60x |
add_message <- if (!is.null(row_limit) && nrow(dsin) > row_limit) {
|
| 256 | 7x |
paste0( |
| 257 | 7x |
"First ", |
| 258 | 7x |
row_limit, |
| 259 | 7x |
" of ", |
| 260 | 7x |
nrow(dsin), |
| 261 | 7x |
" rows are shown in table below" |
| 262 |
) |
|
| 263 |
} else {
|
|
| 264 | 53x |
NULL |
| 265 |
} |
|
| 266 | ||
| 267 | 60x |
msg <- paste( |
| 268 | 60x |
c( |
| 269 | 60x |
add_message, |
| 270 | 60x |
as_ascii_table(display_table) |
| 271 |
), |
|
| 272 | 60x |
collapse = "\n" |
| 273 |
) |
|
| 274 | 60x |
msg |
| 275 |
} |
| 1 |
#' Print diffdf objects |
|
| 2 |
#' |
|
| 3 |
#' Print a nicely formatted version of a diffdf object. |
|
| 4 |
#' |
|
| 5 |
#' @param x A comparison object created by \code{diffdf()}.
|
|
| 6 |
#' @param ... Additional arguments (not used). |
|
| 7 |
#' @param row_limit Maximum number of rows to display in difference tables. |
|
| 8 |
#' Use \code{NULL} to show all rows. Default is 10.
|
|
| 9 |
#' @param as_string Logical. If \code{TRUE}, returns the printed message as an R
|
|
| 10 |
#' character vector instead of printing to the console. Default is \code{FALSE}.
|
|
| 11 |
#' @param file A connection or a character string naming the file to print to. If |
|
| 12 |
#' \code{NULL} (the default), output is printed to the console.
|
|
| 13 |
#' |
|
| 14 |
#' @examples |
|
| 15 |
#' x <- subset(iris, -Species) |
|
| 16 |
#' x[1, 2] <- 5 |
|
| 17 |
#' COMPARE <- diffdf(iris, x) |
|
| 18 |
#' print(COMPARE) |
|
| 19 |
#' print(COMPARE, row_limit = 5) |
|
| 20 |
#' \dontrun{
|
|
| 21 |
#' print(COMPARE, file = "output.txt") |
|
| 22 |
#' } |
|
| 23 |
#' |
|
| 24 |
#' @export |
|
| 25 |
print.diffdf <- function(x, row_limit = 10, as_string = FALSE, file = NULL, ...) {
|
|
| 26 | 33x |
if (!is.null(row_limit)) {
|
| 27 | 32x |
assertthat::assert_that( |
| 28 | 32x |
assertthat::is.number(row_limit), |
| 29 | 32x |
row_limit > 0, |
| 30 | 32x |
msg = "row_limit must be a positive integer" |
| 31 |
) |
|
| 32 |
} |
|
| 33 | 29x |
assertthat::assert_that( |
| 34 | 29x |
assertthat::is.flag(as_string) |
| 35 |
) |
|
| 36 | 27x |
COMPARE <- x |
| 37 | ||
| 38 | 27x |
if (length(COMPARE) == 0) {
|
| 39 | 6x |
outtext <- "No issues were found!\n" |
| 40 |
} else {
|
|
| 41 | 21x |
start_text <- paste0("Differences found between the objects!\n\n")
|
| 42 | 21x |
end_text <- lapply(COMPARE, function(x) get_print_message(x, row_limit)) |
| 43 | 21x |
end_text <- paste0(unlist(end_text), collapse = "") |
| 44 | 21x |
outtext <- paste0(start_text, end_text) |
| 45 |
} |
|
| 46 | ||
| 47 | 27x |
string_content <- strsplit(outtext, "\n")[[1]] |
| 48 | 27x |
if (!is.null(file)) {
|
| 49 | 3x |
tryCatch( |
| 50 |
{
|
|
| 51 | 3x |
sink(file) |
| 52 | 3x |
cat(string_content, sep = "\n") |
| 53 | 3x |
sink() |
| 54 |
}, |
|
| 55 | 3x |
warning = function(w) {
|
| 56 | ! |
sink() |
| 57 | ! |
warning(w) |
| 58 |
}, |
|
| 59 | 3x |
error = function(e) {
|
| 60 | ! |
sink() |
| 61 | ! |
stop(e) |
| 62 |
} |
|
| 63 |
) |
|
| 64 | 3x |
return(invisible(COMPARE)) |
| 65 |
} |
|
| 66 | ||
| 67 | 24x |
if (as_string) {
|
| 68 | 9x |
return(string_content) |
| 69 |
} else {
|
|
| 70 | 15x |
cat(outtext) |
| 71 | 15x |
return(invisible(COMPARE)) |
| 72 |
} |
|
| 73 |
} |
| 1 |
#' sort_then_join |
|
| 2 |
#' |
|
| 3 |
#' Convenience function to sort two strings and paste them together |
|
| 4 |
#' @param string1 first string |
|
| 5 |
#' @param string2 second string |
|
| 6 |
#' @keywords internal |
|
| 7 |
sort_then_join <- function(string1, string2) {
|
|
| 8 | 14x |
paste0(sort(c(string1, string2)), collapse = "") |
| 9 |
} |
|
| 10 | ||
| 11 | ||
| 12 |
#' class_merge |
|
| 13 |
#' |
|
| 14 |
#' Convenience function to put all classes an object has into one string |
|
| 15 |
#' @param x an object |
|
| 16 |
#' @keywords internal |
|
| 17 |
class_merge <- function(x) {
|
|
| 18 | 304x |
paste(class(x), collapse = "_") |
| 19 |
} |
|
| 20 | ||
| 21 | ||
| 22 |
get_message <- function(colname, whichdat, totype) {
|
|
| 23 | 12x |
message(paste0( |
| 24 | 12x |
"NOTE: Variable ", colname, " in ", tolower(whichdat), " was casted to ", totype |
| 25 |
)) |
|
| 26 |
} |
|
| 27 | ||
| 28 | ||
| 29 |
#' get_casted_vector |
|
| 30 |
#' |
|
| 31 |
#' casts a vector depending on its type and input |
|
| 32 |
#' @param colin column to cast |
|
| 33 |
#' @param colname name of vector |
|
| 34 |
#' @param whichdat whether base or compare is being casted (used for messages) |
|
| 35 |
#' @importFrom methods is |
|
| 36 |
#' @keywords internal |
|
| 37 |
get_casted_vector <- function(colin, colname, whichdat) {
|
|
| 38 | 24x |
if (is(colin, "factor")) {
|
| 39 | 6x |
get_message(colname, whichdat, "character") |
| 40 | 6x |
return(as.character(colin)) |
| 41 |
} |
|
| 42 | ||
| 43 | 18x |
if (is(colin, "integer")) {
|
| 44 | 6x |
get_message(colname, whichdat, "numeric") |
| 45 | 6x |
return(as.numeric(colin)) |
| 46 |
} |
|
| 47 | ||
| 48 | 12x |
colin |
| 49 |
} |
|
| 50 | ||
| 51 | ||
| 52 | ||
| 53 |
#' get_casted_dataset |
|
| 54 |
#' |
|
| 55 |
#' Internal utility function to loop across a dataset casting all target |
|
| 56 |
#' variables |
|
| 57 |
#' @param df dataset to be casted |
|
| 58 |
#' @param columns columns to be casted |
|
| 59 |
#' @param whichdat whether base or compare is being casted (used for messages) |
|
| 60 |
#' @keywords internal |
|
| 61 |
get_casted_dataset <- function(df, columns, whichdat) {
|
|
| 62 | 22x |
for (col in columns) {
|
| 63 | 24x |
df[[col]] <- get_casted_vector(df[[col]], col, whichdat) |
| 64 |
} |
|
| 65 | 22x |
return(df) |
| 66 |
} |
|
| 67 | ||
| 68 | ||
| 69 | ||
| 70 | ||
| 71 | ||
| 72 | ||
| 73 |
#' cast_variables |
|
| 74 |
#' |
|
| 75 |
#' Function to cast datasets columns if they have differing types |
|
| 76 |
#' Restricted to specific cases, currently integer and double, and character and factor |
|
| 77 |
#' @param BASE base dataset |
|
| 78 |
#' @param COMPARE comparison dataset |
|
| 79 |
#' @param ignore_vars Variables not to be considered for casting |
|
| 80 |
#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles |
|
| 81 |
#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters |
|
| 82 |
#' @keywords internal |
|
| 83 |
cast_variables <- function( |
|
| 84 |
BASE, |
|
| 85 |
COMPARE, |
|
| 86 |
ignore_vars = NULL, |
|
| 87 |
cast_integers = FALSE, |
|
| 88 |
cast_factors = FALSE |
|
| 89 |
) {
|
|
| 90 | ||
| 91 | 13x |
allowed_class_casts <- c("integernumeric", "characterfactor")[c(cast_integers, cast_factors)]
|
| 92 | ||
| 93 | 13x |
BASE_class <- data.frame( |
| 94 | 13x |
class_BASE = sapply(BASE, class_merge), |
| 95 | 13x |
colname = names(BASE), |
| 96 | 13x |
stringsAsFactors = FALSE |
| 97 |
) |
|
| 98 | 13x |
BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars, , drop = FALSE] |
| 99 | ||
| 100 | ||
| 101 | 13x |
COMPARE_class <- data.frame( |
| 102 | 13x |
class_COMPARE = sapply(COMPARE, class_merge), |
| 103 | 13x |
colname = names(COMPARE), |
| 104 | 13x |
stringsAsFactors = FALSE |
| 105 |
) |
|
| 106 | 13x |
COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars, , drop = FALSE] |
| 107 | ||
| 108 | 13x |
common_class <- merge( |
| 109 | 13x |
x = BASE_class, |
| 110 | 13x |
y = COMPARE_class, |
| 111 | 13x |
by = "colname" |
| 112 |
) |
|
| 113 | ||
| 114 | ||
| 115 | 13x |
diff_class <- common_class[common_class[["class_BASE"]] != common_class[["class_COMPARE"]], , drop = FALSE] |
| 116 | ||
| 117 | ||
| 118 | 13x |
diff_class$classmerge <- mapply( |
| 119 | 13x |
sort_then_join, |
| 120 | 13x |
diff_class$class_COMPARE, |
| 121 | 13x |
diff_class$class_BASE |
| 122 |
) |
|
| 123 | ||
| 124 | ||
| 125 | 13x |
cast_columns <- diff_class[diff_class[["classmerge"]] %in% allowed_class_casts, , drop = FALSE] |
| 126 | ||
| 127 | ||
| 128 | 13x |
DATASETS <- list( |
| 129 | 13x |
"BASE" = BASE, |
| 130 | 13x |
"COMPARE" = COMPARE |
| 131 |
) |
|
| 132 | ||
| 133 | 13x |
if (nrow(cast_columns) == 0) {
|
| 134 | 2x |
return(DATASETS) |
| 135 |
} |
|
| 136 | ||
| 137 | ||
| 138 | 11x |
for (i in names(DATASETS)) {
|
| 139 | 22x |
DATASETS[[i]] <- get_casted_dataset(DATASETS[[i]], cast_columns$colname, i) |
| 140 |
} |
|
| 141 | ||
| 142 | 11x |
return(DATASETS) |
| 143 |
} |
| 1 |
#' is_variable_different |
|
| 2 |
#' |
|
| 3 |
#' This subsets the data set on the variable name, picks out differences and returns a `tibble` |
|
| 4 |
#' of differences for the given variable |
|
| 5 |
#' @importFrom tibble as_tibble |
|
| 6 |
#' @param variablename name of variable being compared |
|
| 7 |
#' @param keynames name of keys |
|
| 8 |
#' @param datain Inputted dataset with base and compare vectors |
|
| 9 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 10 |
#' @return A boolean vector which is T if target and current are different |
|
| 11 |
#' @keywords internal |
|
| 12 |
is_variable_different <- function(variablename, keynames, datain, ...) {
|
|
| 13 | 931x |
xvar <- paste0(variablename, ".x") |
| 14 | 931x |
yvar <- paste0(variablename, ".y") |
| 15 | ||
| 16 | 931x |
assertthat::assert_that( |
| 17 | 931x |
xvar %in% names(datain) && yvar %in% names(datain), |
| 18 | 931x |
msg = "Variable does not exist within input dataset" |
| 19 |
) |
|
| 20 | ||
| 21 | 931x |
target <- datain[[xvar]] |
| 22 | 931x |
current <- datain[[yvar]] |
| 23 | 931x |
outvect <- find_difference(target, current, ...) |
| 24 | ||
| 25 | 931x |
datain[["VARIABLE"]] <- variablename |
| 26 | ||
| 27 | 931x |
names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE")
|
| 28 | ||
| 29 | 931x |
x <- as_tibble( |
| 30 | 931x |
subset( |
| 31 | 931x |
datain, |
| 32 | 931x |
outvect, |
| 33 | 931x |
select = c("VARIABLE", keynames, "BASE", "COMPARE")
|
| 34 |
) |
|
| 35 |
) |
|
| 36 | ||
| 37 | 931x |
return(x) |
| 38 |
} |
|
| 39 | ||
| 40 |
#' compare_vectors |
|
| 41 |
#' |
|
| 42 |
#' Compare two vectors looking for differences |
|
| 43 |
#' |
|
| 44 |
#' @param target the base vector |
|
| 45 |
#' @param current a vector to compare target to |
|
| 46 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 47 |
#' @keywords internal |
|
| 48 |
compare_vectors <- function(target, current, ...) {
|
|
| 49 | 964x |
UseMethod("compare_vectors")
|
| 50 |
} |
|
| 51 | ||
| 52 | ||
| 53 |
#' find_difference |
|
| 54 |
#' |
|
| 55 |
#' This determines if two vectors are different. It expects vectors of the same |
|
| 56 |
#' length and type, and is intended to be used after checks have already been done |
|
| 57 |
#' Initially picks out any `NA`'s (matching `NA`'s count as a match) |
|
| 58 |
#' Then compares remaining vector |
|
| 59 |
#' |
|
| 60 |
#' @param target the base vector |
|
| 61 |
#' @param current a vector to compare target to |
|
| 62 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 63 |
#' @keywords internal |
|
| 64 |
find_difference <- function(target, current, ...) {
|
|
| 65 | 967x |
if (length(target) != length(current)) {
|
| 66 | 2x |
warning("Inputs are not of the same length")
|
| 67 | 2x |
return(NULL) |
| 68 |
} |
|
| 69 | ||
| 70 | 965x |
if (is.null(target) || is.null(current)) {
|
| 71 | 1x |
return(is.null(target) != is.null(current)) |
| 72 |
} |
|
| 73 | ||
| 74 |
### Initalise output, assume problem unless evidence otherwise |
|
| 75 | 964x |
return_vector <- rep(TRUE, length(target)) |
| 76 | ||
| 77 | 964x |
nas_t <- is.na(target) |
| 78 | 964x |
nas_c <- is.na(current) |
| 79 | ||
| 80 |
## compare missing values |
|
| 81 | 964x |
nacompare <- nas_t != nas_c |
| 82 | 964x |
naselect <- nas_t | nas_c |
| 83 | 964x |
return_vector[naselect] <- nacompare[naselect] |
| 84 | ||
| 85 |
## compare non-missing values |
|
| 86 | 964x |
selectvector <- as.logical((!nas_t) * (!nas_c)) |
| 87 | ||
| 88 | 964x |
comparevect <- compare_vectors( |
| 89 | 964x |
target[selectvector], |
| 90 | 964x |
current[selectvector], |
| 91 |
... |
|
| 92 |
) |
|
| 93 | ||
| 94 | 964x |
return_vector[selectvector] <- comparevect |
| 95 | ||
| 96 | 964x |
return(return_vector) |
| 97 |
} |
|
| 98 | ||
| 99 | ||
| 100 | ||
| 101 | ||
| 102 | ||
| 103 | ||
| 104 | ||
| 105 |
#' compare_vectors.default |
|
| 106 |
#' |
|
| 107 |
#' Default method, if the vector is not numeric or factor. Basic comparison |
|
| 108 |
#' @param target the base vector |
|
| 109 |
#' @param current a vector to compare target to |
|
| 110 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 111 |
#' @keywords internal |
|
| 112 |
compare_vectors.default <- function(target, current, ...) {
|
|
| 113 | 411x |
target != current |
| 114 |
} |
|
| 115 | ||
| 116 | ||
| 117 | ||
| 118 | ||
| 119 |
#' compare_vectors.factor |
|
| 120 |
#' |
|
| 121 |
#' Compares factors. Sets them as character and then compares |
|
| 122 |
#' @param target the base vector |
|
| 123 |
#' @param current a vector to compare target to |
|
| 124 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
| 125 |
#' @keywords internal |
|
| 126 |
compare_vectors.factor <- function(target, current, ...) {
|
|
| 127 | 83x |
as.character(target) != as.character(current) |
| 128 |
} |
|
| 129 | ||
| 130 | ||
| 131 | ||
| 132 | ||
| 133 | ||
| 134 |
#' compare_vectors.numeric |
|
| 135 |
#' |
|
| 136 |
#' This is a modified version of the all.equal function |
|
| 137 |
#' which returns a vector rather than a message |
|
| 138 |
#' @param target the base vector |
|
| 139 |
#' @param current a vector to compare target to |
|
| 140 |
#' @param tolerance Level of tolerance for differences between two variables |
|
| 141 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
| 142 |
#' @param ... Not used |
|
| 143 |
#' @keywords internal |
|
| 144 |
compare_vectors.numeric <- function( |
|
| 145 |
target, |
|
| 146 |
current, |
|
| 147 |
tolerance = sqrt(.Machine$double.eps), |
|
| 148 |
scale = NULL, |
|
| 149 |
... |
|
| 150 |
) {
|
|
| 151 | ||
| 152 | 470x |
out <- target == current |
| 153 | ||
| 154 | 470x |
if (all(out)) {
|
| 155 | 439x |
return(!out) |
| 156 |
} |
|
| 157 | ||
| 158 | 31x |
if (is.integer(target) || is.integer(current)) {
|
| 159 | 7x |
target <- as.double(target) |
| 160 | 7x |
current <- as.double(current) |
| 161 |
} |
|
| 162 | ||
| 163 | 31x |
xy <- abs(target - current) |
| 164 | ||
| 165 | 31x |
if (!is.null(scale)) {
|
| 166 | 4x |
xy <- xy / scale |
| 167 |
} |
|
| 168 | ||
| 169 | 31x |
return(xy > tolerance) |
| 170 |
} |
|
| 171 | ||
| 172 |
#' compare_vectors.int64 |
|
| 173 |
#' |
|
| 174 |
#' Handle int64 vectors. Uses numeric comparison |
|
| 175 |
#' @param target the base vector |
|
| 176 |
#' @param current a vector to compare target to |
|
| 177 |
#' @param tolerance Level of tolerance for differences between two variables |
|
| 178 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
| 179 |
#' @param ... Not used |
|
| 180 |
#' @keywords internal |
|
| 181 |
compare_vectors.integer64 <- function( |
|
| 182 |
target, |
|
| 183 |
current, |
|
| 184 |
tolerance = sqrt(.Machine$double.eps), |
|
| 185 |
scale = NULL, |
|
| 186 |
... |
|
| 187 |
) {
|
|
| 188 | 4x |
compare_vectors.numeric(target, current, tolerance, scale) |
| 189 |
} |
| 1 |
#' identify_extra_rows |
|
| 2 |
#' |
|
| 3 |
#' Identifies rows that are in a baseline dataset but not in a comparator dataset |
|
| 4 |
#' @param DS1 Baseline dataset (data frame) |
|
| 5 |
#' @param DS2 Comparator dataset (data frame) |
|
| 6 |
#' @param KEYS List of variables that define a unique row within the datasets (strings) |
|
| 7 |
#' @keywords internal |
|
| 8 |
identify_extra_rows <- function(DS1, DS2, KEYS) {
|
|
| 9 | 238x |
if (nrow(DS2) == 0 || nrow(DS1) == 0) {
|
| 10 | 10x |
return(DS1[, KEYS, drop = FALSE]) |
| 11 |
} |
|
| 12 | 228x |
DS2[["..FLAG.."]] <- "Y" |
| 13 | 228x |
dat <- merge( |
| 14 | 228x |
subset(DS1, select = KEYS), |
| 15 | 228x |
subset(DS2, select = c(KEYS, "..FLAG..")), |
| 16 | 228x |
by = KEYS, all.x = TRUE, |
| 17 | 228x |
sort = TRUE |
| 18 |
) |
|
| 19 | 228x |
dat <- dat[do.call("order", dat[KEYS]), ]
|
| 20 | ||
| 21 | 228x |
dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE] |
| 22 |
} |
|
| 23 | ||
| 24 | ||
| 25 | ||
| 26 |
#' identify_extra_cols |
|
| 27 |
#' |
|
| 28 |
#' Identifies columns that are in a baseline dataset but not in a comparator dataset |
|
| 29 |
#' @param DS1 Baseline dataset (data frame) |
|
| 30 |
#' @param DS2 Comparator dataset (data frame) |
|
| 31 |
#' @importFrom tibble tibble |
|
| 32 |
#' @keywords internal |
|
| 33 |
identify_extra_cols <- function(DS1, DS2) {
|
|
| 34 | 238x |
match.cols <- sapply(names(DS1), "%in%", names(DS2)) |
| 35 | 238x |
assertthat::assert_that( |
| 36 | 238x |
all(is.logical(match.cols)), |
| 37 | 238x |
msg = "Assumption of logical return type is not true" |
| 38 |
) |
|
| 39 | 238x |
tibble( |
| 40 | 238x |
COLUMNS = names(DS1)[!match.cols] |
| 41 |
) |
|
| 42 |
} |
|
| 43 | ||
| 44 | ||
| 45 | ||
| 46 | ||
| 47 | ||
| 48 | ||
| 49 | ||
| 50 |
#' identify_matching_cols |
|
| 51 |
#' |
|
| 52 |
#' Identifies columns with the same name in two data frames |
|
| 53 |
#' @param DS1 Input dataset 1 (data frame) |
|
| 54 |
#' @param DS2 Input dataset 2 (data frame) |
|
| 55 |
#' @param EXCLUDE Columns to ignore |
|
| 56 |
#' @keywords internal |
|
| 57 |
identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") {
|
|
| 58 | 796x |
match_cols <- sapply(names(DS1), "%in%", names(DS2)) |
| 59 | 796x |
exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE) |
| 60 | 796x |
names(DS1)[match_cols & !exclude_cols] |
| 61 |
} |
|
| 62 | ||
| 63 | ||
| 64 | ||
| 65 | ||
| 66 | ||
| 67 |
#' identify_unsupported_cols |
|
| 68 |
#' |
|
| 69 |
#' Identifies any columns for which the package is not setup to handle |
|
| 70 |
#' @param dsin input dataset |
|
| 71 |
#' @keywords internal |
|
| 72 |
identify_unsupported_cols <- function(dsin) {
|
|
| 73 | 248x |
dat <- subset( |
| 74 | 248x |
identify_properties(dsin), |
| 75 | 248x |
select = c("VARIABLE", "MODE")
|
| 76 |
) |
|
| 77 | ||
| 78 | 248x |
dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE]
|
| 79 |
} |
|
| 80 | ||
| 81 | ||
| 82 | ||
| 83 |
#' identify_mode_differences |
|
| 84 |
#' |
|
| 85 |
#' Identifies any mode differences between two data frames |
|
| 86 |
#' @param BASE Base dataset for comparison (data.frame) |
|
| 87 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
| 88 |
#' @keywords internal |
|
| 89 |
identify_mode_differences <- function(BASE, COMP) {
|
|
| 90 | 424x |
matching_cols <- identify_matching_cols(BASE, COMP) |
| 91 | ||
| 92 | 424x |
dat <- merge( |
| 93 | 424x |
x = identify_properties(BASE), |
| 94 | 424x |
y = identify_properties(COMP), |
| 95 | 424x |
by = "VARIABLE", |
| 96 | 424x |
all = TRUE, |
| 97 | 424x |
suffixes = c(".BASE", ".COMP"),
|
| 98 | 424x |
sort = TRUE |
| 99 |
) |
|
| 100 | 424x |
dat <- subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP"))
|
| 101 | ||
| 102 | 424x |
KEEP1 <- dat[["VARIABLE"]] %in% matching_cols |
| 103 | 424x |
KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]] |
| 104 | ||
| 105 | 424x |
dat[KEEP1 & KEEP2, , drop = FALSE] |
| 106 |
} |
|
| 107 | ||
| 108 | ||
| 109 | ||
| 110 |
#' identify_class_differences |
|
| 111 |
#' |
|
| 112 |
#' Identifies any class differences between two data frames |
|
| 113 |
#' @param BASE Base dataset for comparison (data.frame) |
|
| 114 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
| 115 |
#' @keywords internal |
|
| 116 |
identify_class_differences <- function(BASE, COMP) {
|
|
| 117 | 124x |
matching_cols <- identify_matching_cols(BASE, COMP) |
| 118 | ||
| 119 | 124x |
dat <- merge( |
| 120 | 124x |
x = identify_properties(BASE), |
| 121 | 124x |
y = identify_properties(COMP), |
| 122 | 124x |
by = "VARIABLE", |
| 123 | 124x |
all = TRUE, |
| 124 | 124x |
sort = TRUE, |
| 125 | 124x |
suffixes = c(".BASE", ".COMP")
|
| 126 |
) |
|
| 127 | ||
| 128 | 124x |
dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP"))
|
| 129 | ||
| 130 | 124x |
KEEP1 <- dat[["VARIABLE"]] %in% matching_cols |
| 131 | 124x |
KEEP2 <- !mapply( |
| 132 | 124x |
identical, |
| 133 | 124x |
dat[["CLASS.BASE"]], |
| 134 | 124x |
dat[["CLASS.COMP"]] |
| 135 |
) |
|
| 136 | ||
| 137 | 124x |
dat[KEEP1 & KEEP2, , drop = FALSE] |
| 138 |
} |
|
| 139 | ||
| 140 | ||
| 141 | ||
| 142 |
#' Identify differences in attributes |
|
| 143 |
#' |
|
| 144 |
#' Identifies any attribute differences between two data frames |
|
| 145 |
#' @param BASE Base dataset for comparison (data.frame) |
|
| 146 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
| 147 |
#' @param exclude_cols Columns to exclude from comparison |
|
| 148 |
#' @importFrom tibble tibble |
|
| 149 |
#' @keywords internal |
|
| 150 |
identify_att_differences <- function(BASE, COMP, exclude_cols = "") {
|
|
| 151 | 129x |
matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols) |
| 152 | ||
| 153 | 129x |
PROPS <- merge( |
| 154 | 129x |
x = identify_properties(BASE), |
| 155 | 129x |
y = identify_properties(COMP), |
| 156 | 129x |
by = "VARIABLE", |
| 157 | 129x |
all = TRUE, |
| 158 | 129x |
sort = TRUE, |
| 159 | 129x |
suffixes = c(".BASE", ".COMP")
|
| 160 |
) |
|
| 161 | ||
| 162 | 129x |
PROPS <- subset(PROPS, select = c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP"))
|
| 163 | ||
| 164 | 129x |
PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE] |
| 165 | ||
| 166 | ||
| 167 |
### Setup dummy return value |
|
| 168 | 129x |
RETURN <- tibble( |
| 169 | 129x |
VARIABLE = character(), |
| 170 | 129x |
ATTR_NAME = character(), |
| 171 | 129x |
VALUES.BASE = list(), |
| 172 | 129x |
VALUES.COMP = list() |
| 173 |
) |
|
| 174 | ||
| 175 | 129x |
for (i in PROPS[["VARIABLE"]]) {
|
| 176 | 1179x |
PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE] |
| 177 | ||
| 178 |
### Get a vector of all available attributes across both variables |
|
| 179 | 1179x |
ATTRIB_NAMES <- unique(c( |
| 180 | 1179x |
names(PROPS_filt[["ATTRIBS.BASE"]][[1]]), |
| 181 | 1179x |
names(PROPS_filt[["ATTRIBS.COMP"]][[1]]) |
| 182 |
)) |
|
| 183 | ||
| 184 |
### If variable has no attributes move onto the next variable |
|
| 185 | 869x |
if (is.null(ATTRIB_NAMES)) next() |
| 186 | ||
| 187 |
### Loop over each attribute checking if they are identical and outputing |
|
| 188 |
### anyones that arn't |
|
| 189 | 310x |
for (j in ATTRIB_NAMES) {
|
| 190 | 532x |
ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j] |
| 191 | 532x |
ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j] |
| 192 | ||
| 193 | 532x |
if (!identical(ATTRIB_BASE, ATTRIB_COMP)) {
|
| 194 | 65x |
ATT_DIFFS <- tibble( |
| 195 | 65x |
VARIABLE = i, |
| 196 | 65x |
ATTR_NAME = j, |
| 197 | 65x |
VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE), |
| 198 | 65x |
VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP) |
| 199 |
) |
|
| 200 | ||
| 201 | 65x |
RETURN <- rbind(RETURN, ATT_DIFFS) |
| 202 |
} |
|
| 203 |
} |
|
| 204 |
} |
|
| 205 | 129x |
return(RETURN) |
| 206 |
} |
|
| 207 | ||
| 208 | ||
| 209 | ||
| 210 | ||
| 211 | ||
| 212 |
#' identify_differences |
|
| 213 |
#' |
|
| 214 |
#' Compares each column within 2 datasets to identify any values which they |
|
| 215 |
#' mismatch on. |
|
| 216 |
#' @param BASE Base dataset for comparison (data.frame) |
|
| 217 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
| 218 |
#' @param KEYS List of variables that define a unique row within the datasets (strings) |
|
| 219 |
#' @param exclude_cols Columns to exclude from comparison |
|
| 220 |
#' @param tolerance Level of tolerance for numeric differences between two variables |
|
| 221 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
| 222 |
#' @keywords internal |
|
| 223 |
identify_differences <- function( |
|
| 224 |
BASE, |
|
| 225 |
COMP, |
|
| 226 |
KEYS, |
|
| 227 |
exclude_cols, |
|
| 228 |
tolerance = sqrt(.Machine$double.eps), |
|
| 229 |
scale = NULL |
|
| 230 |
) {
|
|
| 231 | ||
| 232 | 119x |
matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols)) |
| 233 | ||
| 234 | 119x |
if (length(matching_cols) == 0) {
|
| 235 | 2x |
return(tibble()) |
| 236 |
} |
|
| 237 | ||
| 238 | 117x |
DAT <- merge( |
| 239 | 117x |
x = BASE, |
| 240 | 117x |
y = COMP, |
| 241 | 117x |
by = KEYS, |
| 242 | 117x |
suffixes = c(".x", ".y"),
|
| 243 | 117x |
sort = TRUE |
| 244 |
) |
|
| 245 | 117x |
if (nrow(DAT) == 0) {
|
| 246 | 4x |
return(tibble()) |
| 247 |
} |
|
| 248 | 113x |
DAT <- DAT[do.call("order", DAT[KEYS]), ]
|
| 249 | ||
| 250 | 113x |
matching_list <- mapply( |
| 251 | 113x |
is_variable_different, |
| 252 | 113x |
matching_cols, |
| 253 | 113x |
MoreArgs = list( |
| 254 | 113x |
keynames = KEYS, |
| 255 | 113x |
datain = DAT, |
| 256 | 113x |
tolerance = tolerance, |
| 257 | 113x |
scale = scale |
| 258 |
), |
|
| 259 | 113x |
SIMPLIFY = FALSE |
| 260 |
) |
|
| 261 | ||
| 262 | 113x |
matching_list |
| 263 |
} |
|
| 264 | ||
| 265 | ||
| 266 | ||
| 267 | ||
| 268 | ||
| 269 | ||
| 270 | ||
| 271 | ||
| 272 | ||
| 273 |
#' identify_properties |
|
| 274 |
#' |
|
| 275 |
#' Returns a dataframe of metadata for a given dataset. |
|
| 276 |
#' Returned values include variable names , class , mode , type & attributes |
|
| 277 |
#' @param dsin input dataframe that you want to get the metadata from |
|
| 278 |
#' @importFrom tibble tibble |
|
| 279 |
#' @keywords internal |
|
| 280 |
identify_properties <- function(dsin) {
|
|
| 281 |
### If missing or null return empty dataset |
|
| 282 | 1602x |
if (is.null(dsin)) {
|
| 283 | ! |
x <- tibble( |
| 284 | ! |
VARIABLE = character(), |
| 285 | ! |
CLASS = list(), |
| 286 | ! |
MODE = character(), |
| 287 | ! |
TYPE = character(), |
| 288 | ! |
ATTRIBS = list() |
| 289 |
) |
|
| 290 | ! |
return(x) |
| 291 |
} |
|
| 292 | ||
| 293 | 1602x |
tibble( |
| 294 | 1602x |
VARIABLE = names(dsin), |
| 295 | 1602x |
CLASS = lapply(dsin, class), |
| 296 | 1602x |
MODE = sapply(dsin, mode), |
| 297 | 1602x |
TYPE = sapply(dsin, typeof), |
| 298 | 1602x |
ATTRIBS = lapply(dsin, attributes) |
| 299 |
) |
|
| 300 |
} |
|
| 301 | ||
| 302 | ||
| 303 |
#' Find column ordering differences |
|
| 304 |
#' |
|
| 305 |
#' Compares two datasets and outputs a table listing any differences in the column |
|
| 306 |
#' orders between the two datasets. Columns that are not contained within both |
|
| 307 |
#' are ignored however column ordering is derived prior to removing these columns. |
|
| 308 |
#' |
|
| 309 |
#' @param BASE (`data.frame`)\cr Base dataset for comparison |
|
| 310 |
#' @param COMP (`data.frame`)\cr Comparator dataset to compare base against |
|
| 311 |
#' @keywords internal |
|
| 312 |
identify_column_order_differences <- function(BASE, COMP) {
|
|
| 313 | 8x |
base_cols <- tibble( |
| 314 | 8x |
COLUMN = names(BASE), |
| 315 | 8x |
"BASE-INDEX" = seq_along(names(BASE)) |
| 316 |
) |
|
| 317 | 8x |
comp_cols <- tibble( |
| 318 | 8x |
COLUMN = names(COMP), |
| 319 | 8x |
"COMPARE-INDEX" = seq_along(names(COMP)) |
| 320 |
) |
|
| 321 | 8x |
col_index <- merge( |
| 322 | 8x |
base_cols, |
| 323 | 8x |
comp_cols, |
| 324 | 8x |
by = c("COLUMN"),
|
| 325 | 8x |
all = TRUE, |
| 326 | 8x |
sort = FALSE |
| 327 |
) |
|
| 328 | 8x |
keep_rows <- col_index[["BASE-INDEX"]] != col_index[["COMPARE-INDEX"]] |
| 329 | 8x |
keep_rows[is.na(keep_rows)] <- FALSE |
| 330 | 8x |
col_index[keep_rows, , drop = FALSE] |
| 331 |
} |
| 1 |
#' factor_to_character |
|
| 2 |
#' |
|
| 3 |
#' Takes a dataframe and converts any factor variables to character |
|
| 4 |
#' @param dsin input dataframe |
|
| 5 |
#' @param vars variables to consider for conversion. Default NULL will consider |
|
| 6 |
#' every variable within the dataset |
|
| 7 |
#' @keywords internal |
|
| 8 |
factor_to_character <- function(dsin, vars = NULL) {
|
|
| 9 | ! |
if (is.null(vars)) vars <- names(dsin) |
| 10 | ||
| 11 | 238x |
for (var in vars) {
|
| 12 | 252x |
if (is.factor(dsin[[var]])) {
|
| 13 | ! |
dsin[[var]] <- as.character(dsin[[var]]) |
| 14 |
} |
|
| 15 |
} |
|
| 16 | 238x |
return(dsin) |
| 17 |
} |
|
| 18 | ||
| 19 | ||
| 20 | ||
| 21 | ||
| 22 |
#' has_unique_rows |
|
| 23 |
#' |
|
| 24 |
#' Check if a data sets rows are unique |
|
| 25 |
#' @param DAT input data set (data frame) |
|
| 26 |
#' @param KEYS Set of keys which should be unique |
|
| 27 |
#' @keywords internal |
|
| 28 |
has_unique_rows <- function(DAT, KEYS) {
|
|
| 29 | 249x |
DUPS <- duplicated(subset(DAT, select = KEYS)) |
| 30 | 249x |
NDUPS <- sum(DUPS) |
| 31 | 249x |
return(NDUPS == 0) |
| 32 |
} |
|
| 33 | ||
| 34 |
#' convert_to_issue |
|
| 35 |
#' |
|
| 36 |
#' converts the count value into the correct issue format |
|
| 37 |
#' @param datin data inputted |
|
| 38 |
#' @importFrom tibble rownames_to_column |
|
| 39 |
#' @keywords internal |
|
| 40 |
convert_to_issue <- function(datin) {
|
|
| 41 | 113x |
datin_tibble <- tibble( |
| 42 | 113x |
`Variable` = names(datin), |
| 43 | 113x |
`No of Differences` = datin |
| 44 |
) |
|
| 45 | ||
| 46 | 113x |
datin_tibble_reduced <- datin_tibble[datin_tibble[["No of Differences"]] > 0, , drop = FALSE] |
| 47 | 113x |
return(datin_tibble_reduced) |
| 48 |
} |
|
| 49 | ||
| 50 |
#' Describe the datasets being compared |
|
| 51 |
#' |
|
| 52 |
#' This function is used to produce a basic summary table of the core |
|
| 53 |
#' features of the two `data.frame`'s being compared. |
|
| 54 |
#' @param base (`data.frame`)\cr base dataset to be described |
|
| 55 |
#' @param comp (`data.frame`)\cr comparison dataset to be described |
|
| 56 |
#' @param base_name (`character`)\cr name of the base dataset |
|
| 57 |
#' @param comp_name (`character`)\cr name of the comparison dataset |
|
| 58 |
#' @keywords internal |
|
| 59 |
describe_dataframe <- function(base, comp, base_name, comp_name) {
|
|
| 60 | 131x |
tibble( |
| 61 | 131x |
PROPERTY = list( |
| 62 | 131x |
"Name", |
| 63 | 131x |
"Class", |
| 64 | 131x |
"Rows(#)", |
| 65 | 131x |
"Columns(#)" |
| 66 |
), |
|
| 67 | 131x |
BASE = c( |
| 68 | 131x |
base_name, |
| 69 | 131x |
paste(class(base), collapse = ", "), |
| 70 | 131x |
as.character(nrow(base)), |
| 71 | 131x |
as.character(ncol(base)) |
| 72 |
), |
|
| 73 | 131x |
COMP = c( |
| 74 | 131x |
comp_name, |
| 75 | 131x |
paste(class(comp), collapse = ", "), |
| 76 | 131x |
as.character(nrow(comp)), |
| 77 | 131x |
as.character(ncol(comp)) |
| 78 |
) |
|
| 79 |
) |
|
| 80 |
} |
| 1 |
#' construct_issue |
|
| 2 |
#' |
|
| 3 |
#' Make an s3 object with class issue and possible additional class, |
|
| 4 |
#' and assign other arguments to attributes |
|
| 5 |
#' @param value the value of the object |
|
| 6 |
#' @param message the value of the message attribute |
|
| 7 |
#' @param add_class additional class to add |
|
| 8 |
#' @keywords internal |
|
| 9 |
construct_issue <- function(value, message, add_class = NULL) {
|
|
| 10 | 2271x |
x <- value |
| 11 | ||
| 12 |
### If nothing has been provided return nothing ! |
|
| 13 | 2271x |
if (nrow(x) == 0) {
|
| 14 | 1979x |
return(NULL) |
| 15 |
} |
|
| 16 | ||
| 17 | 292x |
class(x) <- c(add_class, "issue", class(x)) |
| 18 | 292x |
attributes(x)[["message"]] <- message |
| 19 | 292x |
return(x) |
| 20 |
} |
|
| 21 | ||
| 22 | ||
| 23 |
#' get_issue_message |
|
| 24 |
#' |
|
| 25 |
#' Simple function to grab the issue message |
|
| 26 |
#' @param object inputted object of class issue |
|
| 27 |
#' @param ... other arguments |
|
| 28 |
#' @keywords internal |
|
| 29 |
get_issue_message <- function(object, ...) {
|
|
| 30 | 152x |
return(attr(object, "message")) |
| 31 |
} |
|
| 32 | ||
| 33 | ||
| 34 |
#' get_print_message |
|
| 35 |
#' |
|
| 36 |
#' Get the required text depending on type of issue |
|
| 37 |
#' @param object inputted object of class issue |
|
| 38 |
#' @param ... other arguments |
|
| 39 |
#' @keywords internal |
|
| 40 |
get_print_message <- function(object, ...) {
|
|
| 41 | 60x |
UseMethod("get_print_message", object)
|
| 42 |
} |
|
| 43 | ||
| 44 | ||
| 45 |
#' get_print_message.default |
|
| 46 |
#' |
|
| 47 |
#' Errors, as this should only ever be given an issue |
|
| 48 |
#' @param object issue |
|
| 49 |
#' @param ... Not used |
|
| 50 |
#' @keywords internal |
|
| 51 |
get_print_message.default <- function(object, ...) {
|
|
| 52 | ! |
stop("Error: An issue has not been provided to this function!")
|
| 53 |
} |
|
| 54 | ||
| 55 | ||
| 56 |
#' get_print_message.issue |
|
| 57 |
#' |
|
| 58 |
#' Get text from a basic issue, based on the class of the value of the issue |
|
| 59 |
#' |
|
| 60 |
#' @param object an object of class issue_basic |
|
| 61 |
#' @inheritParams print.diffdf |
|
| 62 |
#' @keywords internal |
|
| 63 |
get_print_message.issue <- function(object, row_limit, ...) {
|
|
| 64 | 60x |
paste( |
| 65 | 60x |
c( |
| 66 | 60x |
attr(object, "message"), |
| 67 | 60x |
get_table(object, row_limit = row_limit), |
| 68 | 60x |
"\n\n" |
| 69 |
), |
|
| 70 | 60x |
collapse = "\n" |
| 71 |
) |
|
| 72 |
} |