1 |
#' diffdf |
|
2 |
#' @description |
|
3 |
#' Compares 2 dataframes and outputs any differences. |
|
4 |
#' @param base input dataframe |
|
5 |
#' @param compare comparison dataframe |
|
6 |
#' @param keys vector of variables (as strings) that defines a unique row in |
|
7 |
#' the base and compare dataframes |
|
8 |
#' @param strict_numeric Flag for strict numeric to numeric comparisons |
|
9 |
#' (default = TRUE). If False diffdf will cast integer to double where |
|
10 |
#' required for comparisons. Note that variables specified in the keys |
|
11 |
#' will never be casted. |
|
12 |
#' @param strict_factor Flag for strict factor to character comparisons |
|
13 |
#' (default = TRUE). If False diffdf will cast factors to characters where |
|
14 |
#' required for comparisons. Note that variables specified in the keys will |
|
15 |
#' never be casted. |
|
16 |
#' @param suppress_warnings Do you want to suppress warnings? (logical) |
|
17 |
#' @param file Location and name of a text file to output the results to. |
|
18 |
#' Setting to NULL will cause no file to be produced. |
|
19 |
#' @param tolerance Set tolerance for numeric comparisons. Note that |
|
20 |
#' comparisons fail if (x-y)/scale > tolerance. |
|
21 |
#' @param scale Set scale for numeric comparisons. Note that comparisons fail |
|
22 |
#' if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient |
|
23 |
#' version of scale = 1. |
|
24 |
#' @examples |
|
25 |
#' x <- subset( iris, -Species) |
|
26 |
#' x[1,2] <- 5 |
|
27 |
#' COMPARE <- diffdf( iris, x) |
|
28 |
#' print( COMPARE ) |
|
29 |
#' |
|
30 |
#' #### Sample data frames |
|
31 |
#' |
|
32 |
#' DF1 <- data.frame( |
|
33 |
#' id = c(1,2,3,4,5,6), |
|
34 |
#' v1 = letters[1:6], |
|
35 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA) |
|
36 |
#' ) |
|
37 |
#' |
|
38 |
#' DF2 <- data.frame( |
|
39 |
#' id = c(1,2,3,4,5,7), |
|
40 |
#' v1 = letters[1:6], |
|
41 |
#' v2 = c(NA , NA , 1 , 2 , NA , NA), |
|
42 |
#' v3 = c(NA , NA , 1 , 2 , NA , 4) |
|
43 |
#' ) |
|
44 |
#' |
|
45 |
#' diffdf(DF1 , DF1 , keys = "id") |
|
46 |
#' |
|
47 |
#' # We can control matching with scale/location for example: |
|
48 |
#' |
|
49 |
#' DF1 <- data.frame( |
|
50 |
#' id = c(1,2,3,4,5,6), |
|
51 |
#' v1 = letters[1:6], |
|
52 |
#' v2 = c(1,2,3,4,5,6) |
|
53 |
#' ) |
|
54 |
#' DF2 <- data.frame( |
|
55 |
#' id = c(1,2,3,4,5,6), |
|
56 |
#' v1 = letters[1:6], |
|
57 |
#' v2 = c(1.1,2,3,4,5,6) |
|
58 |
#' ) |
|
59 |
#' |
|
60 |
#' diffdf(DF1 , DF2 , keys = "id") |
|
61 |
#' diffdf(DF1 , DF2 , keys = "id", tolerance = 0.2) |
|
62 |
#' diffdf(DF1 , DF2 , keys = "id", scale = 10, tolerance = 0.2) |
|
63 |
#' |
|
64 |
#' # We can use strict_factor to compare factors with characters for example: |
|
65 |
#' |
|
66 |
#' DF1 <- data.frame( |
|
67 |
#' id = c(1,2,3,4,5,6), |
|
68 |
#' v1 = letters[1:6], |
|
69 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA), |
|
70 |
#' stringsAsFactors = FALSE |
|
71 |
#' ) |
|
72 |
#' |
|
73 |
#' DF2 <- data.frame( |
|
74 |
#' id = c(1,2,3,4,5,6), |
|
75 |
#' v1 = letters[1:6], |
|
76 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA) |
|
77 |
#' ) |
|
78 |
#' |
|
79 |
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = TRUE) |
|
80 |
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = FALSE) |
|
81 |
#' |
|
82 |
#' @export |
|
83 |
diffdf <- function( |
|
84 |
base, |
|
85 |
compare, |
|
86 |
keys = NULL, |
|
87 |
suppress_warnings = FALSE, |
|
88 |
strict_numeric = TRUE, |
|
89 |
strict_factor = TRUE, |
|
90 |
file = NULL, |
|
91 |
tolerance = sqrt(.Machine$double.eps), |
|
92 |
scale = NULL |
|
93 |
) { |
|
94 | ||
95 | 136x |
BASE <- base |
96 | 136x |
COMP <- compare |
97 | 136x |
KEYS <- keys |
98 | 136x |
SUPWARN <- suppress_warnings |
99 | ||
100 | ||
101 |
### Initatiate output object |
|
102 | 136x |
COMPARE <- list() |
103 | 136x |
class(COMPARE) <- c("diffdf", "list") |
104 | ||
105 | 136x |
is_derived <- FALSE |
106 | ||
107 |
### If no key is suplied match values based upon row number |
|
108 | 136x |
if (is.null(KEYS)) { |
109 | 122x |
is_derived <- TRUE |
110 | 122x |
keyname <- generate_keyname(BASE, COMP) |
111 | 122x |
BASE[[keyname]] <- seq_len(nrow(BASE)) |
112 | 122x |
COMP[[keyname]] <- seq_len(nrow(COMP)) |
113 | 122x |
KEYS <- keyname |
114 |
} |
|
115 | 136x |
attr(COMPARE, "keys") <- list(value = KEYS, is_derived = is_derived) |
116 | ||
117 | ||
118 | ||
119 | 136x |
if (!is.numeric(tolerance)) { |
120 | 2x |
stop("'tolerance' should be numeric") |
121 |
} |
|
122 | ||
123 | 134x |
if (!is.numeric(scale) && !is.null(scale)) { |
124 | 2x |
stop("'scale' should be numeric or NULL") |
125 |
} |
|
126 | ||
127 | ||
128 | ||
129 | 132x |
if (!has_unique_rows(BASE, KEYS)) { |
130 | 1x |
stop("BY variables in BASE do not result in unique observations") |
131 |
} |
|
132 | ||
133 | 131x |
if (!has_unique_rows(COMP, KEYS)) { |
134 | ! |
stop("BY variables in COMPARE do not result in unique observations") |
135 |
} |
|
136 | ||
137 | ||
138 | ||
139 |
#### Check essential variable properties (class & mode) |
|
140 | ||
141 | 131x |
COMPARE[["UnsupportedColsBase"]] <- construct_issue( |
142 | 131x |
value = identify_unsupported_cols(BASE), |
143 | 131x |
message = "There are columns in BASE with unsupported modes !!" |
144 |
) |
|
145 | ||
146 | ||
147 | 131x |
COMPARE[["UnsupportedColsComp"]] <- construct_issue( |
148 | 131x |
value = identify_unsupported_cols(COMP), |
149 | 131x |
message = "There are columns in COMPARE with unsupported modes !!" |
150 |
) |
|
151 | ||
152 | ||
153 |
# cast variables if strict is off |
|
154 | 131x |
if (!strict_factor || !strict_numeric) { |
155 | ||
156 | 9x |
casted_df <- cast_variables( |
157 | 9x |
BASE = BASE, |
158 | 9x |
COMPARE = COMP, |
159 | 9x |
ignore_vars = KEYS, |
160 | 9x |
cast_integers = !strict_numeric, |
161 | 9x |
cast_factors = !strict_factor |
162 |
) |
|
163 | ||
164 | 9x |
BASE <- casted_df$BASE |
165 | 9x |
COMP <- casted_df$COMP |
166 | ||
167 |
} |
|
168 | ||
169 | ||
170 | 131x |
COMPARE[["VarModeDiffs"]] <- construct_issue( |
171 | 131x |
value = identify_mode_differences(BASE, COMP), |
172 | 131x |
message = "There are columns in BASE and COMPARE with different modes !!" |
173 |
) |
|
174 | ||
175 | ||
176 | 131x |
COMPARE[["VarClassDiffs"]] <- construct_issue( |
177 | 131x |
value = identify_class_differences(BASE, COMP), |
178 | 131x |
message = "There are columns in BASE and COMPARE with different classes !!" |
179 |
) |
|
180 | ||
181 | ||
182 | 131x |
exclude_cols <- c( |
183 | 131x |
COMPARE[["UnsupportedColsBase"]]$VARIABLE, |
184 | 131x |
COMPARE[["UnsupportedColsComp"]]$VARIABLE, |
185 | 131x |
COMPARE[["VarClassDiffs"]]$VARIABLE, |
186 | 131x |
COMPARE[["VarModeDiffs"]]$VARIABLE |
187 |
) |
|
188 | ||
189 | ||
190 |
##### Check Validity of Keys |
|
191 | ||
192 | 131x |
BASE_keys <- names(BASE)[names(BASE) %in% KEYS] |
193 | 131x |
COMP_keys <- names(COMP)[names(COMP) %in% KEYS] |
194 | ||
195 | ||
196 | 131x |
if (length(BASE_keys) != length(KEYS)) { |
197 | ! |
stop("BASE is missing variables specified in KEYS") |
198 |
} |
|
199 | ||
200 | 131x |
if (length(COMP_keys) != length(KEYS)) { |
201 | ! |
stop("COMP is missing variables specified in KEYS") |
202 |
} |
|
203 | ||
204 | 131x |
if (any(KEYS %in% exclude_cols)) { |
205 | ! |
stop("KEYS are either an invalid or contain different modes between BASE and COMP") |
206 |
} |
|
207 | ||
208 | ||
209 |
##### Check Attributes |
|
210 | ||
211 | ||
212 | 131x |
COMPARE[["AttribDiffs"]] <- construct_issue( |
213 | 131x |
value = identify_att_differences(BASE, COMP, exclude_cols), |
214 | 131x |
message = "There are columns in BASE and COMPARE with differing attributes !!" |
215 |
) |
|
216 | ||
217 | ||
218 |
##### Check data |
|
219 | ||
220 | 131x |
BASE <- factor_to_character(BASE, KEYS) |
221 | 131x |
COMP <- factor_to_character(COMP, KEYS) |
222 | ||
223 | ||
224 | 131x |
COMPARE[["ExtRowsBase"]] <- construct_issue( |
225 | 131x |
value = identify_extra_rows(BASE, COMP, KEYS), |
226 | 131x |
message = "There are rows in BASE that are not in COMPARE !!" |
227 |
) |
|
228 | ||
229 | ||
230 | 131x |
COMPARE[["ExtRowsComp"]] <- construct_issue( |
231 | 131x |
value = identify_extra_rows(COMP, BASE, KEYS), |
232 | 131x |
message = "There are rows in COMPARE that are not in BASE !!" |
233 |
) |
|
234 | ||
235 | ||
236 | ||
237 | 131x |
COMPARE[["ExtColsBase"]] <- construct_issue( |
238 | 131x |
value = identify_extra_cols(BASE, COMP), |
239 | 131x |
message = "There are columns in BASE that are not in COMPARE !!" |
240 |
) |
|
241 | ||
242 | ||
243 | 131x |
COMPARE[["ExtColsComp"]] <- construct_issue( |
244 | 131x |
value = identify_extra_cols(COMP, BASE), |
245 | 131x |
message = "There are columns in COMPARE that are not in BASE !!" |
246 |
) |
|
247 | ||
248 | ||
249 | 131x |
VALUE_DIFFERENCES <- identify_differences( |
250 | 131x |
BASE, COMP, KEYS, exclude_cols, |
251 | 131x |
tolerance = tolerance, |
252 | 131x |
scale = scale |
253 |
) |
|
254 | ||
255 | ||
256 | ||
257 |
## Summarise the number of mismatching rows per variable |
|
258 | ||
259 | 131x |
if (length(VALUE_DIFFERENCES)) { |
260 | 130x |
NDIFF <- sapply(VALUE_DIFFERENCES, nrow) |
261 | 130x |
COMPARE[["NumDiff"]] <- construct_issue( |
262 | 130x |
value = convert_to_issue(NDIFF), |
263 | 130x |
message = "Not all Values Compared Equal" |
264 |
) |
|
265 |
} |
|
266 | ||
267 | ||
268 | 131x |
for (i in names(VALUE_DIFFERENCES)){ |
269 | 993x |
COMPARE[[paste0("VarDiff_", i)]] <- construct_issue( |
270 | 993x |
value = VALUE_DIFFERENCES[[i]], |
271 | 993x |
message = "" |
272 |
) |
|
273 |
} |
|
274 | ||
275 |
## Get all issue messages, remove blank message, and collapse into single string |
|
276 | 131x |
ISSUE_MSGS <- sapply(COMPARE, function(x) get_issue_message(x)) |
277 | 131x |
ISSUE_MSGS <- ISSUE_MSGS[ISSUE_MSGS != ""] |
278 | ||
279 | 131x |
if (length(ISSUE_MSGS) != 0) { |
280 | 97x |
if (!SUPWARN) { |
281 | 36x |
ISSUE_MSGS <- paste(ISSUE_MSGS, collapse = "\n") |
282 | 36x |
warning(c("\n", ISSUE_MSGS)) |
283 |
} |
|
284 |
} |
|
285 | ||
286 | ||
287 | 131x |
if (!is.null(file)) { |
288 | ! |
x <- print(COMPARE, as_string = TRUE) |
289 | ||
290 | ! |
tryCatch( |
291 |
{ |
|
292 | ! |
sink(file) |
293 | ! |
cat(x, sep = "\n") |
294 | ! |
sink() |
295 |
}, |
|
296 | ! |
warning = function(w) { |
297 | ! |
sink() |
298 | ! |
warning(w) |
299 |
}, |
|
300 | ! |
error = function(e) { |
301 | ! |
sink() |
302 | ! |
stop(e) |
303 |
} |
|
304 |
) |
|
305 | ! |
return(invisible(COMPARE)) |
306 |
} |
|
307 | ||
308 | 131x |
return(COMPARE) |
309 |
} |
|
310 | ||
311 | ||
312 | ||
313 | ||
314 |
#' diffdf_has_issues |
|
315 |
#' |
|
316 |
#' Utility function which returns TRUE if an diffdf |
|
317 |
#' object has issues or FALSE if an diffdf object does not have issues |
|
318 |
#' @param x diffdf object |
|
319 |
#' @examples |
|
320 |
#' |
|
321 |
#' # Example with no issues |
|
322 |
#' x <- diffdf( iris, iris ) |
|
323 |
#' diffdf_has_issues(x) |
|
324 |
#' |
|
325 |
#' # Example with issues |
|
326 |
#' iris2 <- iris |
|
327 |
#' iris2[2,2] <- NA |
|
328 |
#' x <- diffdf( iris , iris2 , suppress_warnings = TRUE) |
|
329 |
#' diffdf_has_issues(x) |
|
330 |
#' @export |
|
331 |
diffdf_has_issues <- function(x) { |
|
332 | ! |
if (class(x)[[1]] != "diffdf") stop("x is not an diffdf object") |
333 | 21x |
return(length(x) != 0) |
334 |
} |
1 | ||
2 | ||
3 |
#' Pad String |
|
4 |
#' |
|
5 |
#' Utility function used to replicate `str_pad`. Adds white space to either end |
|
6 |
#' of a string to get it to equal the desired length |
|
7 |
#' @param x string |
|
8 |
#' @param width desired length |
|
9 |
string_pad <- function(x, width) { |
|
10 | ! |
if (nchar(x) >= width) return(x) |
11 | 1114x |
width <- width - nchar(x) |
12 | 1114x |
left <- paste0(rep(" ", floor(width / 2)), collapse = "") |
13 | 1114x |
right <- paste0(rep(" ", ceiling(width / 2)), collapse = "") |
14 | 1114x |
paste0(left, x, right, collapse = "") |
15 |
} |
|
16 | ||
17 | ||
18 |
#' recursive_reduce |
|
19 |
#' |
|
20 |
#' Utility function used to replicated `purrr::reduce`. Recursively applies a |
|
21 |
#' function to a list of elements until only 1 element remains |
|
22 |
#' @param .l list of values to apply a function to |
|
23 |
#' @param .f function to apply to each each element of the list in turn. See details. |
|
24 |
#' @details |
|
25 |
#' This function is essentially performing the following operation: |
|
26 |
#' ``` |
|
27 |
#' .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]]) |
|
28 |
#' ``` |
|
29 |
recursive_reduce <- function(.l, .f) { |
|
30 | 629x |
if (length(.l) != 1) { |
31 | 425x |
.l[[2]] <- .f(.l[[1]], .l[[2]]) |
32 | 425x |
return(recursive_reduce(.l[-1], .f)) |
33 |
} else { |
|
34 | 204x |
return(.l[[1]]) |
35 |
} |
|
36 |
} |
|
37 | ||
38 |
#' invert |
|
39 |
#' |
|
40 |
#' Utility function used to replicated `purrr::transpose`. Turns a list inside |
|
41 |
#' out. |
|
42 |
#' @param x list |
|
43 |
invert <- function(x) { |
|
44 | 64x |
x2 <- list() |
45 | 64x |
cnames <- names(x) |
46 | 64x |
tnames <- names(x[[1]]) |
47 | 64x |
for (i in tnames){ |
48 | 192x |
x2[[i]] <- list() |
49 | 192x |
for (j in cnames){ |
50 | 609x |
x2[[i]][[j]] <- x[[j]][[i]] |
51 |
} |
|
52 |
} |
|
53 | 64x |
return(x2) |
54 |
} |
|
55 | ||
56 | ||
57 | ||
58 | ||
59 |
#' as_ascii_table |
|
60 |
#' |
|
61 |
#' This function takes a `data.frame` and attempts to convert it into |
|
62 |
#' a simple ascii format suitable for printing to the screen |
|
63 |
#' It is assumed all variable values have a `as.character()` method |
|
64 |
#' in order to cast them to character. |
|
65 |
#' @param dat Input dataset to convert into a ascii table |
|
66 |
#' @param line_prefix Symbols to prefix in front of every line of the table |
|
67 |
as_ascii_table <- function(dat, line_prefix = " ") { |
|
68 | ||
69 | ||
70 |
## Convert every value to character and crop to a suitable length |
|
71 | 64x |
dat <- as_tibble(apply(dat, c(1, 2), as_cropped_char)) |
72 | ||
73 | 64x |
hold <- list() |
74 | 64x |
COLS <- colnames(dat) |
75 | ||
76 |
### For each column extract core elements (width, values , title) and pad out |
|
77 |
### each string to be a suitable length |
|
78 | 64x |
for (i in seq_len(ncol(dat))) { |
79 | 203x |
COL <- COLS[i] |
80 | 203x |
VALUES <- dat[[i]] |
81 | ||
82 | 203x |
JOINT <- c(COL, VALUES) |
83 | 203x |
WIDTH <- max(sapply(JOINT, nchar)) + 2 |
84 | ||
85 | 203x |
hold[[COL]] <- list() |
86 | 203x |
hold[[COL]]$WIDTH <- WIDTH |
87 | 203x |
hold[[COL]]$VALUES <- sapply(VALUES, string_pad, width = WIDTH) |
88 | 203x |
hold[[COL]]$HEADER <- sapply(COL, string_pad, width = WIDTH) |
89 |
} |
|
90 | ||
91 |
### Collapse into a single value per component ( title , values, width ) |
|
92 | 64x |
thold <- invert(hold) |
93 | 64x |
tvals <- recursive_reduce(thold$VALUES, paste0) |
94 | 64x |
thead <- recursive_reduce(thold$HEADER, paste0) |
95 | 64x |
twidth <- recursive_reduce(thold$WIDTH, sum) |
96 | ||
97 |
### Create header and footer lines |
|
98 | 64x |
TLINE <- paste0(rep("=", twidth), collapse = "") |
99 | 64x |
LINE <- paste0(rep("-", twidth), collapse = "") |
100 | 64x |
FVALS <- paste0(line_prefix, tvals, collapse = "\n") |
101 | ||
102 |
### Output table |
|
103 | 64x |
paste0( |
104 | 64x |
"\n", |
105 | 64x |
line_prefix, TLINE, "\n", |
106 | 64x |
line_prefix, thead, "\n", |
107 | 64x |
line_prefix, LINE, "\n", |
108 | 64x |
FVALS, "\n", |
109 | 64x |
line_prefix, LINE |
110 |
) |
|
111 |
} |
|
112 | ||
113 | ||
114 | ||
115 | ||
116 | ||
117 | ||
118 | ||
119 | ||
120 |
#' as_cropped_char |
|
121 |
#' |
|
122 |
#' Makes any character string above x chars |
|
123 |
#' Reduce down to a x char string with ... |
|
124 |
#' @param inval a single element value |
|
125 |
#' @param crop_at character limit |
|
126 |
as_cropped_char <- function(inval, crop_at = 30) { |
|
127 | ||
128 | 911x |
if (is.null(inval)) { |
129 | ! |
inval <- "<NULL>" |
130 | 911x |
} else if (is.na(inval)) { |
131 | 1x |
inval <- "<NA>" |
132 |
} else { |
|
133 | 910x |
inval <- as.character(inval) |
134 |
} |
|
135 | ||
136 | 911x |
charlength <- sapply(inval, nchar) |
137 | ||
138 | 911x |
if (charlength > crop_at) { |
139 | 10x |
outval <- substr(inval, 1, crop_at) |
140 | 10x |
outval <- paste0(outval, "...") |
141 |
} else { |
|
142 | 901x |
outval <- inval |
143 |
} |
|
144 | ||
145 | 911x |
outval |
146 |
} |
|
147 | ||
148 | ||
149 | ||
150 |
#' get_table |
|
151 |
#' |
|
152 |
#' Generate nice looking table from a data frame |
|
153 |
#' @param dsin dataset |
|
154 |
#' @inheritParams print.diffdf |
|
155 |
get_table <- function(dsin, row_limit = 10) { |
|
156 | ||
157 | 64x |
if (nrow(dsin) == 0) { |
158 | ! |
return("") |
159 |
} |
|
160 | 64x |
if (!is.null(row_limit)) { |
161 | 62x |
if (length(row_limit) != 1) { |
162 | ! |
stop("row_limit should have a length of 1") |
163 |
} |
|
164 | 62x |
if (!is.numeric(row_limit)) { |
165 | ! |
stop("row_limit should be a numeric value or NULL") |
166 |
} |
|
167 | 62x |
if (row_limit <= 0) { |
168 | ! |
stop("row_limit should be a positive integer") |
169 |
} |
|
170 |
} |
|
171 | 64x |
if (is.null(row_limit)) { |
172 | 2x |
display_table <- dsin |
173 |
} else { |
|
174 | 62x |
display_table <- subset(dsin, seq_len(nrow(dsin)) < (row_limit + 1)) |
175 |
} |
|
176 | ||
177 | 64x |
if (!is.null(row_limit) && nrow(dsin) > row_limit) { |
178 | 6x |
add_message <- paste0( |
179 | 6x |
"First ", |
180 | 6x |
row_limit, |
181 | 6x |
" of ", |
182 | 6x |
nrow(dsin), |
183 | 6x |
" rows are shown in table below" |
184 |
) |
|
185 | ||
186 |
} else { |
|
187 | 58x |
add_message <- "All rows are shown in table below" |
188 |
} |
|
189 | ||
190 | 64x |
msg <- paste( |
191 | 64x |
c( |
192 | 64x |
add_message, |
193 | 64x |
as_ascii_table(display_table), |
194 | 64x |
"\n" |
195 |
), |
|
196 | 64x |
collapse = "\n" |
197 |
) |
|
198 | 64x |
return(msg) |
199 |
} |
1 | ||
2 |
#' sort_then_join |
|
3 |
#' |
|
4 |
#' Convenience function to sort two strings and paste them together |
|
5 |
#' @param string1 first string |
|
6 |
#' @param string2 second string |
|
7 |
sort_then_join <- function(string1, string2) { |
|
8 | 14x |
paste0(sort(c(string1, string2)), collapse = "") |
9 |
} |
|
10 | ||
11 | ||
12 |
#' class_merge |
|
13 |
#' |
|
14 |
#' Convenience function to put all classes an object has into one string |
|
15 |
#' @param x an object |
|
16 |
class_merge <- function(x) { |
|
17 | 304x |
paste(class(x), collapse = "_") |
18 |
} |
|
19 | ||
20 | ||
21 |
get_message <- function(colname, whichdat, totype) { |
|
22 | 12x |
message(paste0( |
23 | 12x |
"NOTE: Variable ", colname, " in ", tolower(whichdat), " was casted to ", totype |
24 |
)) |
|
25 |
} |
|
26 | ||
27 | ||
28 |
#' get_casted_vector |
|
29 |
#' |
|
30 |
#' casts a vector depending on its type and input |
|
31 |
#' @param colin column to cast |
|
32 |
#' @param colname name of vector |
|
33 |
#' @param whichdat whether base or compare is being casted (used for messages) |
|
34 |
get_casted_vector <- function(colin, colname, whichdat) { |
|
35 | ||
36 | 24x |
if (class(colin) == "factor") { |
37 | 6x |
get_message(colname, whichdat, "character") |
38 | 6x |
return(as.character(colin)) |
39 |
} |
|
40 | ||
41 | 18x |
if (class(colin) == "integer") { |
42 | 6x |
get_message(colname, whichdat, "numeric") |
43 | 6x |
return(as.numeric(colin)) |
44 |
} |
|
45 | ||
46 | 12x |
colin |
47 |
} |
|
48 | ||
49 | ||
50 | ||
51 |
#' get_casted_dataset |
|
52 |
#' |
|
53 |
#' Internal utility function to loop across a dataset casting all target |
|
54 |
#' variables |
|
55 |
#' @param df dataset to be casted |
|
56 |
#' @param columns columns to be casted |
|
57 |
#' @param whichdat whether base or compare is being casted (used for messages) |
|
58 |
get_casted_dataset <- function(df, columns, whichdat) { |
|
59 | 22x |
for (col in columns) { |
60 | 24x |
df[[col]] <- get_casted_vector(df[[col]], col, whichdat) |
61 |
} |
|
62 | 22x |
return(df) |
63 |
} |
|
64 | ||
65 | ||
66 | ||
67 | ||
68 | ||
69 | ||
70 |
#' cast_variables |
|
71 |
#' |
|
72 |
#' Function to cast datasets columns if they have differing types |
|
73 |
#' Restricted to specific cases, currently integer and double, and character and factor |
|
74 |
#' @param BASE base dataset |
|
75 |
#' @param COMPARE comparison dataset |
|
76 |
#' @param ignore_vars Variables not to be considered for casting |
|
77 |
#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles |
|
78 |
#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters |
|
79 |
cast_variables <- function( |
|
80 |
BASE, |
|
81 |
COMPARE, |
|
82 |
ignore_vars = NULL, |
|
83 |
cast_integers = FALSE, |
|
84 |
cast_factors = FALSE |
|
85 |
) { |
|
86 | ||
87 | 13x |
allowed_class_casts <- c("integernumeric", "characterfactor")[c(cast_integers, cast_factors)] |
88 | ||
89 | 13x |
BASE_class <- data.frame( |
90 | 13x |
class_BASE = sapply(BASE, class_merge), |
91 | 13x |
colname = names(BASE), |
92 | 13x |
stringsAsFactors = FALSE |
93 |
) |
|
94 | 13x |
BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars, , drop = FALSE] |
95 | ||
96 | ||
97 | 13x |
COMPARE_class <- data.frame( |
98 | 13x |
class_COMPARE = sapply(COMPARE, class_merge), |
99 | 13x |
colname = names(COMPARE), |
100 | 13x |
stringsAsFactors = FALSE |
101 |
) |
|
102 | 13x |
COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars, , drop = FALSE] |
103 | ||
104 | 13x |
common_class <- merge( |
105 | 13x |
x = BASE_class, |
106 | 13x |
y = COMPARE_class, |
107 | 13x |
by = "colname" |
108 |
) |
|
109 | ||
110 | ||
111 | 13x |
diff_class <- common_class[common_class[["class_BASE"]] != common_class[["class_COMPARE"]], , drop = FALSE] |
112 | ||
113 | ||
114 | 13x |
diff_class$classmerge <- mapply( |
115 | 13x |
sort_then_join, |
116 | 13x |
diff_class$class_COMPARE, |
117 | 13x |
diff_class$class_BASE |
118 |
) |
|
119 | ||
120 | ||
121 | 13x |
cast_columns <- diff_class[diff_class[["classmerge"]] %in% allowed_class_casts, , drop = FALSE] |
122 | ||
123 | ||
124 | 13x |
DATASETS <- list( |
125 | 13x |
"BASE" = BASE, |
126 | 13x |
"COMPARE" = COMPARE |
127 |
) |
|
128 | ||
129 | ||
130 | 13x |
if (nrow(cast_columns) == 0) { |
131 | 2x |
return(DATASETS) |
132 |
} |
|
133 | ||
134 | ||
135 | 11x |
for (i in names(DATASETS)){ |
136 | 22x |
DATASETS[[i]] <- get_casted_dataset(DATASETS[[i]], cast_columns$colname, i) |
137 |
} |
|
138 | ||
139 | 11x |
return(DATASETS) |
140 |
} |
1 | ||
2 | ||
3 |
#' identify_extra_rows |
|
4 |
#' |
|
5 |
#' Identifies rows that are in a baseline dataset but not in a comparator dataset |
|
6 |
#' @param DS1 Baseline dataset (data frame) |
|
7 |
#' @param DS2 Comparator dataset (data frame) |
|
8 |
#' @param KEYS List of variables that define a unique row within the datasets (strings) |
|
9 |
identify_extra_rows <- function(DS1, DS2, KEYS) { |
|
10 | 262x |
DS2[["..FLAG.."]] <- "Y" |
11 | 262x |
dat <- merge( |
12 | 262x |
subset(DS1, select = KEYS), |
13 | 262x |
subset(DS2, select = c(KEYS, "..FLAG..")), |
14 | 262x |
by = KEYS, all.x = TRUE, |
15 | 262x |
sort = TRUE |
16 |
) |
|
17 | 262x |
dat <- dat[do.call("order", dat[KEYS]), ] |
18 | ||
19 | 262x |
dat[is.na(dat[["..FLAG.."]]), KEYS, drop = FALSE] |
20 |
} |
|
21 | ||
22 | ||
23 | ||
24 |
#' identify_extra_cols |
|
25 |
#' |
|
26 |
#' Identifies columns that are in a baseline dataset but not in a comparator dataset |
|
27 |
#' @param DS1 Baseline dataset (data frame) |
|
28 |
#' @param DS2 Comparator dataset (data frame) |
|
29 |
#' @importFrom tibble tibble |
|
30 |
identify_extra_cols <- function(DS1, DS2) { |
|
31 | 262x |
match.cols <- sapply(names(DS1), "%in%", names(DS2)) |
32 | 262x |
if (!all(is.logical(match.cols))) { |
33 | ! |
stop("Assumption of logical return type is not true") |
34 |
} |
|
35 | 262x |
tibble( |
36 | 262x |
COLUMNS = names(DS1)[!match.cols] |
37 |
) |
|
38 |
} |
|
39 | ||
40 | ||
41 | ||
42 | ||
43 | ||
44 | ||
45 | ||
46 |
#' identify_matching_cols |
|
47 |
#' |
|
48 |
#' Identifies columns with the same name in two data frames |
|
49 |
#' @param DS1 Input dataset 1 (data frame) |
|
50 |
#' @param DS2 Input dataset 2 (data frame) |
|
51 |
#' @param EXCLUDE Columns to ignore |
|
52 |
identify_matching_cols <- function(DS1, DS2, EXCLUDE = "") { |
|
53 | 834x |
match_cols <- sapply(names(DS1), "%in%", names(DS2)) |
54 | 834x |
exclude_cols <- sapply(names(DS1), "%in%", EXCLUDE) |
55 | 834x |
names(DS1)[match_cols & !exclude_cols] |
56 |
} |
|
57 | ||
58 | ||
59 | ||
60 | ||
61 | ||
62 |
#' identify_unsupported_cols |
|
63 |
#' |
|
64 |
#' Identifies any columns for which the package is not setup to handle |
|
65 |
#' @param dsin input dataset |
|
66 |
identify_unsupported_cols <- function(dsin) { |
|
67 | 262x |
dat <- subset( |
68 | 262x |
identify_properties(dsin), |
69 | 262x |
select = c("VARIABLE", "MODE") |
70 |
) |
|
71 | ||
72 | 262x |
dat[!dat[["MODE"]] %in% c("numeric", "character", "logical"), , drop = FALSE] |
73 | ||
74 |
} |
|
75 | ||
76 | ||
77 | ||
78 |
#' identify_mode_differences |
|
79 |
#' |
|
80 |
#' Identifies any mode differences between two data frames |
|
81 |
#' @param BASE Base dataset for comparison (data.frame) |
|
82 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
83 |
identify_mode_differences <- function(BASE, COMP) { |
|
84 | ||
85 | 431x |
matching_cols <- identify_matching_cols(BASE, COMP) |
86 | ||
87 | 431x |
dat <- merge( |
88 | 431x |
x = identify_properties(BASE), |
89 | 431x |
y = identify_properties(COMP), |
90 | 431x |
by = "VARIABLE", |
91 | 431x |
all = TRUE, |
92 | 431x |
suffixes = c(".BASE", ".COMP"), |
93 | 431x |
sort = TRUE |
94 |
) |
|
95 | 431x |
dat <- subset(dat, select = c("VARIABLE", "MODE.BASE", "MODE.COMP")) |
96 | ||
97 | 431x |
KEEP1 <- dat[["VARIABLE"]] %in% matching_cols |
98 | 431x |
KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]] |
99 | ||
100 | 431x |
dat[KEEP1 & KEEP2, , drop = FALSE] |
101 | ||
102 |
} |
|
103 | ||
104 | ||
105 | ||
106 |
#' identify_class_differences |
|
107 |
#' |
|
108 |
#' Identifies any class differences between two data frames |
|
109 |
#' @param BASE Base dataset for comparison (data.frame) |
|
110 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
111 |
identify_class_differences <- function(BASE, COMP) { |
|
112 | ||
113 | 131x |
matching_cols <- identify_matching_cols(BASE, COMP) |
114 | ||
115 | 131x |
dat <- merge( |
116 | 131x |
x = identify_properties(BASE), |
117 | 131x |
y = identify_properties(COMP), |
118 | 131x |
by = "VARIABLE", |
119 | 131x |
all = TRUE, |
120 | 131x |
sort = TRUE, |
121 | 131x |
suffixes = c(".BASE", ".COMP") |
122 |
) |
|
123 | ||
124 | 131x |
dat <- subset(dat, select = c("VARIABLE", "CLASS.BASE", "CLASS.COMP")) |
125 | ||
126 | 131x |
KEEP1 <- dat[["VARIABLE"]] %in% matching_cols |
127 | 131x |
KEEP2 <- !mapply( |
128 | 131x |
identical, |
129 | 131x |
dat[["CLASS.BASE"]], |
130 | 131x |
dat[["CLASS.COMP"]] |
131 |
) |
|
132 | ||
133 | 131x |
dat[KEEP1 & KEEP2, , drop = FALSE] |
134 | ||
135 |
} |
|
136 | ||
137 | ||
138 | ||
139 |
#' Identify differences in attributes |
|
140 |
#' |
|
141 |
#' Identifies any attribute differences between two data frames |
|
142 |
#' @param BASE Base dataset for comparison (data.frame) |
|
143 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
144 |
#' @param exclude_cols Columns to exclude from comparison |
|
145 |
#' @importFrom tibble tibble |
|
146 |
identify_att_differences <- function(BASE, COMP, exclude_cols = "") { |
|
147 | ||
148 | 141x |
matching_cols <- identify_matching_cols(BASE, COMP, exclude_cols) |
149 | ||
150 | 141x |
PROPS <- merge( |
151 | 141x |
x = identify_properties(BASE), |
152 | 141x |
y = identify_properties(COMP), |
153 | 141x |
by = "VARIABLE", |
154 | 141x |
all = TRUE, |
155 | 141x |
sort = TRUE, |
156 | 141x |
suffixes = c(".BASE", ".COMP") |
157 |
) |
|
158 | ||
159 | 141x |
PROPS <- subset(PROPS, select = c("VARIABLE", "ATTRIBS.BASE", "ATTRIBS.COMP")) |
160 | ||
161 | 141x |
PROPS <- PROPS[PROPS[["VARIABLE"]] %in% matching_cols, , drop = FALSE] |
162 | ||
163 | ||
164 |
### Setup dummy return value |
|
165 | 141x |
RETURN <- tibble( |
166 | 141x |
VARIABLE = character(), |
167 | 141x |
ATTR_NAME = character(), |
168 | 141x |
VALUES.BASE = list(), |
169 | 141x |
VALUES.COMP = list() |
170 |
) |
|
171 | ||
172 | 141x |
for (i in PROPS[["VARIABLE"]]) { |
173 | ||
174 | 1241x |
PROPS_filt <- PROPS[PROPS[["VARIABLE"]] == i, , drop = FALSE] |
175 | ||
176 |
### Get a vector of all available attributes across both variables |
|
177 | 1241x |
ATTRIB_NAMES <- unique(c( |
178 | 1241x |
names(PROPS_filt[["ATTRIBS.BASE"]][[1]]), |
179 | 1241x |
names(PROPS_filt[["ATTRIBS.COMP"]][[1]]) |
180 |
)) |
|
181 | ||
182 |
### If variable has no attributes move onto the next variable |
|
183 | 908x |
if (is.null(ATTRIB_NAMES)) next() |
184 | ||
185 |
### Loop over each attribute checking if they are identical and outputing |
|
186 |
### anyones that arn't |
|
187 | 333x |
for (j in ATTRIB_NAMES){ |
188 | ||
189 | 574x |
ATTRIB_BASE <- PROPS_filt[["ATTRIBS.BASE"]][[1]][j] |
190 | 574x |
ATTRIB_COMP <- PROPS_filt[["ATTRIBS.COMP"]][[1]][j] |
191 | ||
192 | 574x |
if (!identical(ATTRIB_BASE, ATTRIB_COMP)) { |
193 | ||
194 | 89x |
ATT_DIFFS <- tibble( |
195 | 89x |
VARIABLE = i, |
196 | 89x |
ATTR_NAME = j, |
197 | 89x |
VALUES.BASE = ifelse(is.null(ATTRIB_BASE), list(), ATTRIB_BASE), |
198 | 89x |
VALUES.COMP = ifelse(is.null(ATTRIB_COMP), list(), ATTRIB_COMP) |
199 |
) |
|
200 | ||
201 | 89x |
RETURN <- rbind(RETURN, ATT_DIFFS) |
202 |
} |
|
203 |
} |
|
204 |
} |
|
205 | 141x |
return(RETURN) |
206 |
} |
|
207 | ||
208 | ||
209 | ||
210 | ||
211 | ||
212 |
#' identify_differences |
|
213 |
#' |
|
214 |
#' Compares each column within 2 datasets to identify any values which they |
|
215 |
#' mismatch on. |
|
216 |
#' @param BASE Base dataset for comparison (data.frame) |
|
217 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
218 |
#' @param KEYS List of variables that define a unique row within the datasets (strings) |
|
219 |
#' @param exclude_cols Columns to exclude from comparison |
|
220 |
#' @param tolerance Level of tolerance for numeric differences between two variables |
|
221 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
222 |
identify_differences <- function( |
|
223 |
BASE, |
|
224 |
COMP, |
|
225 |
KEYS, |
|
226 |
exclude_cols, |
|
227 |
tolerance = sqrt(.Machine$double.eps), |
|
228 |
scale = NULL |
|
229 |
) { |
|
230 | ||
231 | 131x |
matching_cols <- identify_matching_cols(BASE, COMP, c(KEYS, exclude_cols)) |
232 | ||
233 | 1x |
if (length(matching_cols) == 0) return(tibble()) |
234 | ||
235 | 130x |
DAT <- merge( |
236 | 130x |
x = BASE, |
237 | 130x |
y = COMP, |
238 | 130x |
by = KEYS, |
239 | 130x |
suffix = c(".x", ".y"), |
240 | 130x |
sort = TRUE |
241 |
) |
|
242 | 130x |
DAT <- DAT[do.call("order", DAT[KEYS]), ] |
243 | ||
244 | 130x |
matching_list <- mapply( |
245 | 130x |
is_variable_different, |
246 | 130x |
matching_cols, |
247 | 130x |
MoreArgs = list( |
248 | 130x |
keynames = KEYS, |
249 | 130x |
datain = DAT, |
250 | 130x |
tolerance = tolerance, |
251 | 130x |
scale = scale |
252 |
), |
|
253 | 130x |
SIMPLIFY = FALSE |
254 |
) |
|
255 | ||
256 | 130x |
matching_list |
257 |
} |
|
258 | ||
259 | ||
260 | ||
261 | ||
262 | ||
263 | ||
264 | ||
265 | ||
266 | ||
267 |
#' identify_properties |
|
268 |
#' |
|
269 |
#' Returns a dataframe of metadata for a given dataset. |
|
270 |
#' Returned values include variable names , class , mode , type & attributes |
|
271 |
#' @param dsin input dataframe that you want to get the metadata from |
|
272 |
#' @importFrom tibble tibble |
|
273 |
identify_properties <- function(dsin) { |
|
274 | ||
275 |
### If missing or null return empty dataset |
|
276 | 1668x |
if (is.null(dsin)) { |
277 | ! |
x <- tibble( |
278 | ! |
VARIABLE = character(), |
279 | ! |
CLASS = list(), |
280 | ! |
MODE = character(), |
281 | ! |
TYPE = character(), |
282 | ! |
ATTRIBS = list() |
283 |
) |
|
284 | ! |
return(x) |
285 |
} |
|
286 | ||
287 | 1668x |
tibble( |
288 | 1668x |
VARIABLE = names(dsin), |
289 | 1668x |
CLASS = lapply(dsin, class), |
290 | 1668x |
MODE = sapply(dsin, mode), |
291 | 1668x |
TYPE = sapply(dsin, typeof), |
292 | 1668x |
ATTRIBS = lapply(dsin, attributes) |
293 |
) |
|
294 |
} |
1 |
#' is_variable_different |
|
2 |
#' |
|
3 |
#' This subsets the data set on the variable name, picks out differences and returns a `tibble` |
|
4 |
#' of differences for the given variable |
|
5 |
#' @importFrom tibble as_tibble |
|
6 |
#' @param variablename name of variable being compared |
|
7 |
#' @param keynames name of keys |
|
8 |
#' @param datain Inputted dataset with base and compare vectors |
|
9 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
10 |
#' @return A boolean vector which is T if target and current are different |
|
11 |
is_variable_different <- function(variablename, keynames, datain, ...) { |
|
12 | ||
13 | 993x |
xvar <- paste0(variablename, ".x") |
14 | 993x |
yvar <- paste0(variablename, ".y") |
15 | ||
16 | 993x |
if (!xvar %in% names(datain) || ! yvar %in% names(datain)) { |
17 | ! |
stop("Variable does not exist within input dataset") |
18 |
} |
|
19 | ||
20 | 993x |
target <- datain[[xvar]] |
21 | 993x |
current <- datain[[yvar]] |
22 | 993x |
outvect <- find_difference(target, current, ...) |
23 | ||
24 | 993x |
datain[["VARIABLE"]] <- variablename |
25 | ||
26 | 993x |
names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE") |
27 | ||
28 | 993x |
x <- as_tibble( |
29 | 993x |
subset( |
30 | 993x |
datain, |
31 | 993x |
outvect, |
32 | 993x |
select = c("VARIABLE", keynames, "BASE", "COMPARE") |
33 |
) |
|
34 |
) |
|
35 | ||
36 | 993x |
return(x) |
37 |
} |
|
38 | ||
39 |
#' compare_vectors |
|
40 |
#' |
|
41 |
#' Compare two vectors looking for differences |
|
42 |
#' |
|
43 |
#' @param target the base vector |
|
44 |
#' @param current a vector to compare target to |
|
45 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
46 |
compare_vectors <- function(target, current, ...) { |
|
47 | 1026x |
UseMethod("compare_vectors") |
48 |
} |
|
49 | ||
50 | ||
51 |
#' find_difference |
|
52 |
#' |
|
53 |
#' This determines if two vectors are different. It expects vectors of the same |
|
54 |
#' length and type, and is intended to be used after checks have already been done |
|
55 |
#' Initially picks out any `NA`'s (matching `NA`'s count as a match) |
|
56 |
#' Then compares remaining vector |
|
57 |
#' |
|
58 |
#' @param target the base vector |
|
59 |
#' @param current a vector to compare target to |
|
60 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
61 |
find_difference <- function(target, current, ...) { |
|
62 | ||
63 | 1029x |
if (length(target) != length(current)) { |
64 | 2x |
warning("Inputs are not of the same length") |
65 | 2x |
return(NULL) |
66 |
} |
|
67 | ||
68 | 1027x |
if (is.null(target) || is.null(current)) { |
69 | 1x |
return(is.null(target) != is.null(current)) |
70 |
} |
|
71 | ||
72 |
### Initalise output, assume problem unless evidence otherwise |
|
73 | 1026x |
return_vector <- rep(TRUE, length(target)) |
74 | ||
75 | 1026x |
nas_t <- is.na(target) |
76 | 1026x |
nas_c <- is.na(current) |
77 | ||
78 |
## compare missing values |
|
79 | 1026x |
nacompare <- nas_t != nas_c |
80 | 1026x |
naselect <- nas_t | nas_c |
81 | 1026x |
return_vector[naselect] <- nacompare[naselect] |
82 | ||
83 |
## compare non-missing values |
|
84 | 1026x |
selectvector <- as.logical((!nas_t) * (!nas_c)) |
85 | ||
86 | 1026x |
comparevect <- compare_vectors( |
87 | 1026x |
target[selectvector], |
88 | 1026x |
current[selectvector], |
89 |
... |
|
90 |
) |
|
91 | ||
92 | 1026x |
return_vector[selectvector] <- comparevect |
93 | ||
94 | 1026x |
return(return_vector) |
95 |
} |
|
96 | ||
97 | ||
98 | ||
99 | ||
100 | ||
101 | ||
102 | ||
103 |
#' compare_vectors.default |
|
104 |
#' |
|
105 |
#' Default method, if the vector is not numeric or factor. Basic comparison |
|
106 |
#' @param target the base vector |
|
107 |
#' @param current a vector to compare target to |
|
108 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
109 |
compare_vectors.default <- function(target, current, ...) { |
|
110 | 445x |
target != current |
111 |
} |
|
112 | ||
113 | ||
114 | ||
115 | ||
116 |
#' compare_vectors.factor |
|
117 |
#' |
|
118 |
#' Compares factors. Sets them as character and then compares |
|
119 |
#' @param target the base vector |
|
120 |
#' @param current a vector to compare target to |
|
121 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
122 |
compare_vectors.factor <- function(target, current, ...) { |
|
123 | 90x |
as.character(target) != as.character(current) |
124 |
} |
|
125 | ||
126 | ||
127 | ||
128 | ||
129 | ||
130 |
#' compare_vectors.numeric |
|
131 |
#' |
|
132 |
#' This is a modified version of the all.equal function |
|
133 |
#' which returns a vector rather than a message |
|
134 |
#' @param target the base vector |
|
135 |
#' @param current a vector to compare target to |
|
136 |
#' @param tolerance Level of tolerance for differences between two variables |
|
137 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
138 |
compare_vectors.numeric <- function( |
|
139 |
target, |
|
140 |
current, |
|
141 |
tolerance = sqrt(.Machine$double.eps), |
|
142 |
scale = NULL |
|
143 |
) { |
|
144 | ||
145 | 491x |
out <- target == current |
146 | ||
147 | 491x |
if (all(out)) { |
148 | 451x |
return(!out) |
149 |
} |
|
150 | ||
151 | 40x |
if (is.integer(target) || is.integer(current)) { |
152 | 13x |
target <- as.double(target) |
153 | 13x |
current <- as.double(current) |
154 |
} |
|
155 | ||
156 | 40x |
xy <- abs(target - current) |
157 | ||
158 | 40x |
if (!is.null(scale)) { |
159 | 4x |
xy <- xy / scale |
160 |
} |
|
161 | ||
162 | 40x |
return(xy > tolerance) |
163 |
} |
|
164 | ||
165 |
#' compare_vectors.int64 |
|
166 |
#' |
|
167 |
#' Handle int64 vectors. Uses numeric comparison |
|
168 |
#' @param target the base vector |
|
169 |
#' @param current a vector to compare target to |
|
170 |
#' @param tolerance Level of tolerance for differences between two variables |
|
171 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
172 |
compare_vectors.integer64 <- function( |
|
173 |
target, |
|
174 |
current, |
|
175 |
tolerance = sqrt(.Machine$double.eps), |
|
176 |
scale = NULL |
|
177 |
) { |
|
178 | 4x |
compare_vectors.numeric(target, current, tolerance, scale) |
179 |
} |
1 | ||
2 | ||
3 | ||
4 | ||
5 | ||
6 |
#' Identify Issue Rows |
|
7 |
#' |
|
8 |
#' This function takes a `diffdf` object and a dataframe and subsets |
|
9 |
#' the `data.frame` for problem rows as identified in the comparison object. |
|
10 |
#' If \code{vars} has been specified only issue rows associated with those |
|
11 |
#' variable(s) will be returned. |
|
12 |
#' @param df dataframe to be subsetted |
|
13 |
#' @param diff diffdf object |
|
14 |
#' @param vars (optional) character vector containing names of issue variables to subset dataframe |
|
15 |
#' on. A value of NULL (default) will be taken to mean available issue variables. |
|
16 |
#' @examples |
|
17 |
#' iris2 <- iris |
|
18 |
#' for ( i in 1:3) iris2[i,i] <- 99 |
|
19 |
#' x <- diffdf( iris , iris2, suppress_warnings = TRUE) |
|
20 |
#' diffdf_issuerows( iris , x) |
|
21 |
#' diffdf_issuerows( iris2 , x) |
|
22 |
#' diffdf_issuerows( iris2 , x , vars = "Sepal.Length") |
|
23 |
#' diffdf_issuerows( iris2 , x , vars = c("Sepal.Length" , "Sepal.Width")) |
|
24 |
#' @details |
|
25 |
#' Note that `diffdf_issuerows` can be used to subset against any dataframe. The only |
|
26 |
#' requirement is that the original variables specified in the keys argument to diffdf |
|
27 |
#' are present on the dataframe you are subsetting against. However please note that if |
|
28 |
#' no keys were specified in diffdf then the row number is used. This means using |
|
29 |
#' `diffdf_issuerows` without a keys against an arbitrary dataset can easily result in |
|
30 |
#' nonsense rows being returned. It is always recommended to supply keys to diffdf. |
|
31 |
#' @export |
|
32 |
diffdf_issuerows <- function(df, diff, vars = NULL) { |
|
33 | ||
34 | 19x |
if (class(diff)[[1]] != "diffdf") { |
35 | ! |
stop("diff should be an diffdf object") |
36 |
} |
|
37 | ||
38 | 19x |
KEYS_ATT <- attr(diff, "keys") |
39 | ||
40 | 19x |
if (is.null(KEYS_ATT)) { |
41 | ! |
stop("diff is missing the keys attribute") |
42 |
} |
|
43 | ||
44 | 19x |
issue_vars <- names(diff)[grep("^VarDiff_", names(diff))] |
45 | ||
46 | 19x |
if (is.null(vars)) { |
47 | 15x |
vars <- issue_vars |
48 |
} else { |
|
49 | 4x |
vars <- paste0("VarDiff_", vars) |
50 |
} |
|
51 | ||
52 | 19x |
if (length(issue_vars) == 0 || sum(vars %in% issue_vars) == 0) { |
53 | 7x |
return(df[FALSE, ]) |
54 |
} |
|
55 | ||
56 | 12x |
KEEP <- mapply( |
57 | 12x |
FUN = get_issue_dataset, |
58 | 12x |
issue = vars, |
59 | 12x |
diff = list(diff), |
60 | 12x |
SIMPLIFY = FALSE |
61 |
) |
|
62 | ||
63 | 12x |
KEEP <- recursive_reduce(KEEP, rbind) |
64 | 12x |
KEEP <- KEEP[!duplicated(KEEP), ] |
65 | ||
66 | 12x |
if (KEYS_ATT$is_derived) { |
67 | 2x |
df[[KEYS_ATT$value]] <- seq_len(nrow(df)) |
68 |
} |
|
69 | ||
70 | 12x |
keys <- KEYS_ATT$value |
71 | ||
72 | 12x |
if (any(! keys %in% names(df))) { |
73 | 1x |
stop("df does not contain all variables specified as keys in diff") |
74 |
} |
|
75 | ||
76 | 11x |
RET <- merge( |
77 | 11x |
x = df, |
78 | 11x |
y = KEEP, |
79 | 11x |
sort = TRUE |
80 |
) |
|
81 | ||
82 | 11x |
RET <- RET[do.call("order", RET[keys]), ] |
83 | ||
84 | 11x |
if (KEYS_ATT$is_derived) { |
85 | 2x |
keep_vars <- !names(RET) %in% KEYS_ATT$value |
86 | 2x |
RET <- RET[, keep_vars, drop = FALSE] |
87 |
} |
|
88 | ||
89 | 11x |
return(RET) |
90 |
} |
|
91 | ||
92 | ||
93 | ||
94 | ||
95 |
#' get_issue_dataset |
|
96 |
#' |
|
97 |
#' Internal function used by `diffdf_issuerows` to extract the dataframe |
|
98 |
#' from each a target issue. In particular it also strips off any |
|
99 |
#' non-key variables |
|
100 |
#' @param issue name of issue to extract the dataset from diff |
|
101 |
#' @param diff diffdf object which contains issues |
|
102 |
get_issue_dataset <- function(issue, diff) { |
|
103 | 20x |
issue_df <- diff[[issue]] |
104 | 20x |
keep <- names(issue_df)[!(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))] |
105 | 20x |
issue_df[, keep, drop = FALSE] |
106 |
} |
1 | ||
2 | ||
3 |
#' Generate unique key name |
|
4 |
#' |
|
5 |
#' Function to generate a name for the keys if not provided |
|
6 |
#' |
|
7 |
#' @param BASE base dataset |
|
8 |
#' @param COMP comparison dataset |
|
9 |
#' @param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity |
|
10 |
#' |
|
11 |
generate_keyname <- function( |
|
12 |
BASE, |
|
13 |
COMP, |
|
14 |
replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..") |
|
15 |
) { |
|
16 | 213x |
if (class(replace_names) != "character") { |
17 | ! |
stop("replace_names is not a character vector") |
18 |
} |
|
19 | ||
20 | 213x |
if (length(replace_names) == 0) { |
21 | 8x |
stop("All default row names are in use in BASE/COMPARE. Please provide a KEY argument") |
22 |
} |
|
23 | ||
24 | 205x |
key_name <- replace_names[1] |
25 | ||
26 | 205x |
if (!is.null(BASE[[key_name]]) || !is.null(COMP[[key_name]])) { |
27 | 66x |
key_name <- generate_keyname(BASE, COMP, replace_names[-1]) |
28 |
} |
|
29 | 173x |
return(key_name) |
30 |
} |
1 |
#' construct_issue |
|
2 |
#' |
|
3 |
#' Make an s3 object with class issue and possible additional class, |
|
4 |
#' and assign other arguments to attributes |
|
5 |
#' @param value the value of the object |
|
6 |
#' @param message the value of the message attribute |
|
7 |
#' @param add_class additional class to add |
|
8 |
construct_issue <- function(value, message, add_class = NULL) { |
|
9 | 2302x |
x <- value |
10 | ||
11 |
### If nothing has been provided return nothing ! |
|
12 | 2092x |
if (nrow(x) == 0) return(NULL) |
13 | ||
14 | 210x |
class(x) <- c(add_class, "issue", class(x)) |
15 | 210x |
attributes(x)[["message"]] <- message |
16 | 210x |
return(x) |
17 |
} |
|
18 | ||
19 | ||
20 |
#' get_issue_message |
|
21 |
#' |
|
22 |
#' Simple function to grab the issue message |
|
23 |
#' @param object inputted object of class issue |
|
24 |
#' @param ... other arguments |
|
25 |
get_issue_message <- function(object, ...) { |
|
26 | 210x |
return(attr(object, "message")) |
27 |
} |
|
28 | ||
29 | ||
30 |
#' get_print_message |
|
31 |
#' |
|
32 |
#' Get the required text depending on type of issue |
|
33 |
#' @param object inputted object of class issue |
|
34 |
#' @param ... other arguments |
|
35 |
get_print_message <- function(object, ...) { |
|
36 | 64x |
UseMethod("get_print_message", object) |
37 |
} |
|
38 | ||
39 | ||
40 |
#' get_print_message.default |
|
41 |
#' |
|
42 |
#' Errors, as this should only ever be given an issue |
|
43 |
#' @param object issue |
|
44 |
get_print_message.default <- function(object) { |
|
45 | ! |
stop("Error: An issue has not been provided to this function!") |
46 |
} |
|
47 | ||
48 | ||
49 |
#' get_print_message.issue |
|
50 |
#' |
|
51 |
#' Get text from a basic issue, based on the class of the value of the issue |
|
52 |
#' |
|
53 |
#' @param object an object of class issue_basic |
|
54 |
#' @inheritParams print.diffdf |
|
55 |
get_print_message.issue <- function(object, row_limit, ...) { |
|
56 | 64x |
paste( |
57 | 64x |
c(attr(object, "message"), get_table(object, row_limit = row_limit)), |
58 | 64x |
collapse = "\n" |
59 |
) |
|
60 |
} |
1 | ||
2 |
#' Print diffdf objects |
|
3 |
#' |
|
4 |
#' Print nicely formatted version of an diffdf object |
|
5 |
#' @param x comparison object created by diffdf(). |
|
6 |
#' @param ... Additional arguments (not used) |
|
7 |
#' @param row_limit Max row limit for difference tables (NULL to show all rows) |
|
8 |
#' @param as_string Return printed message as an R character vector? |
|
9 |
#' @examples |
|
10 |
#' x <- subset( iris , -Species ) |
|
11 |
#' x[1,2] <- 5 |
|
12 |
#' COMPARE <- diffdf( iris, x) |
|
13 |
#' print( COMPARE ) |
|
14 |
#' print( COMPARE, row_limit = 5 ) |
|
15 |
#' @export |
|
16 |
print.diffdf <- function(x, row_limit = 10, as_string = FALSE, ...) { |
|
17 | 31x |
if (!is.null(row_limit)) { |
18 | 30x |
if (length(row_limit) != 1) { |
19 | 1x |
stop("row_limit should have a length of 1") |
20 |
} |
|
21 | 29x |
if (!is.numeric(row_limit)) { |
22 | 2x |
stop("row_limit should be a numeric value or NULL") |
23 |
} |
|
24 | 27x |
if (row_limit <= 0) { |
25 | 1x |
stop("row_limit should be a positive integer") |
26 |
} |
|
27 |
} |
|
28 | 27x |
if (!is.logical(as_string) || length(as_string) != 1) { |
29 | 2x |
stop("as_string should be a logical of length one") |
30 |
} |
|
31 | 25x |
COMPARE <- x |
32 | ||
33 | 25x |
if (length(COMPARE) == 0) { |
34 | 2x |
outtext <- "No issues were found!\n" |
35 |
} else { |
|
36 | 23x |
start_text <- paste0( |
37 | 23x |
"Differences found between the objects!\n\n", |
38 | 23x |
"A summary is given below.\n\n" |
39 |
) |
|
40 | 23x |
end_text <- lapply(COMPARE, function(x) get_print_message(x, row_limit)) |
41 | 23x |
end_text <- paste0(unlist(end_text), collapse = "") |
42 | 23x |
outtext <- paste0(start_text, end_text) |
43 |
} |
|
44 | 25x |
if (as_string) { |
45 | 25x |
return(strsplit(outtext, "\n")[[1]]) |
46 |
} else { |
|
47 | ! |
cat(outtext) |
48 | ! |
return(invisible(COMPARE)) |
49 |
} |
|
50 |
} |
1 | ||
2 | ||
3 | ||
4 |
#' factor_to_character |
|
5 |
#' |
|
6 |
#' Takes a dataframe and converts any factor variables to character |
|
7 |
#' @param dsin input dataframe |
|
8 |
#' @param vars variables to consider for conversion. Default NULL will consider |
|
9 |
#' every variable within the dataset |
|
10 |
factor_to_character <- function(dsin, vars = NULL) { |
|
11 | ||
12 | ! |
if (is.null(vars)) vars <- names(dsin) |
13 | ||
14 | 262x |
for (var in vars){ |
15 | 276x |
if (is.factor(dsin[[var]])) { |
16 | ! |
dsin[[var]] <- as.character(dsin[[var]]) |
17 |
} |
|
18 |
} |
|
19 | 262x |
return(dsin) |
20 |
} |
|
21 | ||
22 | ||
23 | ||
24 | ||
25 |
#' has_unique_rows |
|
26 |
#' |
|
27 |
#' Check if a data sets rows are unique |
|
28 |
#' @param DAT input data set (data frame) |
|
29 |
#' @param KEYS Set of keys which should be unique |
|
30 |
has_unique_rows <- function(DAT, KEYS) { |
|
31 | 263x |
DUPS <- duplicated(subset(DAT, select = KEYS)) |
32 | 263x |
NDUPS <- sum(DUPS) |
33 | 263x |
return(NDUPS == 0) |
34 |
} |
|
35 | ||
36 |
#'convert_to_issue |
|
37 |
#' |
|
38 |
#'converts the count value into the correct issue format |
|
39 |
#'@param datin data inputted |
|
40 |
#'@importFrom tibble rownames_to_column |
|
41 |
convert_to_issue <- function(datin) { |
|
42 | 130x |
datin_tibble <- tibble( |
43 | 130x |
`Variable` = names(datin), |
44 | 130x |
`No of Differences` = datin |
45 |
) |
|
46 | ||
47 | 130x |
datin_tibble_reduced <- datin_tibble[datin_tibble[["No of Differences"]] > 0, , drop = FALSE] |
48 | 130x |
return(datin_tibble_reduced) |
49 |
} |