1 | ||
2 | ||
3 |
#' identify_extra_rows |
|
4 |
#' |
|
5 |
#' Identifies rows that are in a baseline dataset but not in a comparator dataset |
|
6 |
#' @param DS1 Baseline dataset (data frame) |
|
7 |
#' @param DS2 Comparator dataset (data frame) |
|
8 |
#' @param KEYS List of variables that define a unique row within the datasets (strings) |
|
9 |
identify_extra_rows <- function(DS1, DS2 , KEYS){ |
|
10 | 258x |
DS2[["..FLAG.."]] <- "Y" |
11 | 258x |
dat <- merge( |
12 | 258x |
subset( DS1 , select = KEYS) , |
13 | 258x |
subset( DS2 , select = c(KEYS, "..FLAG..")) , |
14 | 258x |
by = KEYS, all.x = T, |
15 | 258x |
sort = TRUE |
16 |
) |
|
17 | 258x |
dat <- dat[do.call("order", dat[KEYS]), ] |
18 |
|
|
19 | 258x |
dat[ is.na(dat[["..FLAG.."]]) , KEYS, drop=FALSE] |
20 |
} |
|
21 | ||
22 | ||
23 | ||
24 |
#' identify_extra_cols |
|
25 |
#' |
|
26 |
#' Identifies columns that are in a baseline dataset but not in a comparator dataset |
|
27 |
#' @param DS1 Baseline dataset (data frame) |
|
28 |
#' @param DS2 Comparator dataset (data frame) |
|
29 |
#' @importFrom tibble tibble |
|
30 |
identify_extra_cols <- function(DS1 , DS2){ |
|
31 | 258x |
match.cols <- sapply ( names(DS1), "%in%", names(DS2)) |
32 | 258x |
if ( !all(is.logical(match.cols)) ){ |
33 | ! |
stop("Assumption of logical return type is not true") |
34 |
} |
|
35 | 258x |
tibble( |
36 | 258x |
COLUMNS = names(DS1)[ !match.cols] |
37 |
) |
|
38 |
} |
|
39 | ||
40 | ||
41 | ||
42 | ||
43 | ||
44 | ||
45 | ||
46 |
#' identify_matching_cols |
|
47 |
#' |
|
48 |
#' Identifies columns with the same name in two data frames |
|
49 |
#' @param DS1 Input dataset 1 (data frame) |
|
50 |
#' @param DS2 Input dataset 2 (data frame) |
|
51 |
#' @param EXCLUDE Columns to ignore |
|
52 |
identify_matching_cols <- function(DS1, DS2 , EXCLUDE = ""){ |
|
53 | 826x |
match_cols <- sapply( names(DS1), "%in%" , names(DS2)) |
54 | 826x |
exclude_cols <- sapply( names(DS1), "%in%" , EXCLUDE) |
55 | 826x |
names(DS1)[ match_cols & !exclude_cols ] |
56 |
} |
|
57 | ||
58 | ||
59 | ||
60 | ||
61 | ||
62 |
#' identify_unsupported_cols |
|
63 |
#' |
|
64 |
#' Identifies any columns for which the package is not setup to handle |
|
65 |
#' @param dsin input dataset |
|
66 |
identify_unsupported_cols <- function(dsin){ |
|
67 | 258x |
dat <- subset( |
68 | 258x |
identify_properties(dsin) , |
69 | 258x |
select = c("VARIABLE", "MODE") |
70 |
) |
|
71 |
|
|
72 | 258x |
dat[ !dat[["MODE"]] %in% c('numeric', 'character', 'logical') ,, drop=FALSE] |
73 | ||
74 |
} |
|
75 | ||
76 | ||
77 | ||
78 |
#' identify_mode_differences |
|
79 |
#' |
|
80 |
#' Identifies any mode differences between two data frames |
|
81 |
#' @param BASE Base dataset for comparison (data.frame) |
|
82 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
83 |
identify_mode_differences <- function( BASE, COMP ){ |
|
84 | ||
85 | 429x |
matching_cols <- identify_matching_cols( BASE , COMP ) |
86 |
|
|
87 | 429x |
dat <- merge( |
88 | 429x |
x = identify_properties(BASE), |
89 | 429x |
y = identify_properties(COMP), |
90 | 429x |
by = "VARIABLE", |
91 | 429x |
all = TRUE, |
92 | 429x |
suffixes = c(".BASE", ".COMP"), |
93 | 429x |
sort = TRUE |
94 |
) |
|
95 | 429x |
dat <- subset( dat, select = c("VARIABLE" , "MODE.BASE" , "MODE.COMP")) |
96 |
|
|
97 | 429x |
KEEP1 <- dat[["VARIABLE"]] %in% matching_cols |
98 | 429x |
KEEP2 <- dat[["MODE.BASE"]] != dat[["MODE.COMP"]] |
99 |
|
|
100 | 429x |
dat[ KEEP1 & KEEP2 ,, drop=FALSE] |
101 | ||
102 |
} |
|
103 | ||
104 | ||
105 | ||
106 |
#' identify_class_differences |
|
107 |
#' |
|
108 |
#' Identifies any class differences between two data frames |
|
109 |
#' @param BASE Base dataset for comparison (data.frame) |
|
110 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
111 |
identify_class_differences <- function( BASE, COMP ){ |
|
112 |
|
|
113 | 129x |
matching_cols <- identify_matching_cols( BASE , COMP ) |
114 |
|
|
115 | 129x |
dat <- merge( |
116 | 129x |
x = identify_properties(BASE), |
117 | 129x |
y = identify_properties(COMP), |
118 | 129x |
by = "VARIABLE", |
119 | 129x |
all = TRUE, |
120 | 129x |
sort = TRUE, |
121 | 129x |
suffixes = c(".BASE", ".COMP") |
122 |
) |
|
123 |
|
|
124 | 129x |
dat <- subset( dat , select = c("VARIABLE" , "CLASS.BASE" , "CLASS.COMP")) |
125 |
|
|
126 | 129x |
KEEP1 <- dat[["VARIABLE"]] %in% matching_cols |
127 | 129x |
KEEP2 <- !mapply( |
128 | 129x |
identical, |
129 | 129x |
dat[["CLASS.BASE"]] , |
130 | 129x |
dat[["CLASS.COMP"]] |
131 |
) |
|
132 |
|
|
133 | 129x |
dat[ KEEP1 & KEEP2 ,, drop=FALSE] |
134 | ||
135 |
} |
|
136 | ||
137 | ||
138 | ||
139 |
#' identify_att_differences |
|
140 |
#' |
|
141 |
#' Identifies any attribute differences between two data frames |
|
142 |
#' @param BASE Base dataset for comparison (data.frame) |
|
143 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
144 |
#' @param exclude_cols Columns to exclude from comparison |
|
145 |
#' @importFrom tibble tibble |
|
146 |
identify_att_differences <- function( BASE, COMP , exclude_cols = "" ){ |
|
147 |
|
|
148 | 139x |
matching_cols <- identify_matching_cols( BASE , COMP , exclude_cols ) |
149 |
|
|
150 | 139x |
PROPS <- merge( |
151 | 139x |
x = identify_properties(BASE) , |
152 | 139x |
y = identify_properties(COMP) , |
153 | 139x |
by = "VARIABLE", |
154 | 139x |
all = TRUE, |
155 | 139x |
sort = TRUE, |
156 | 139x |
suffixes = c(".BASE", ".COMP") |
157 |
) |
|
158 |
|
|
159 | 139x |
PROPS <- subset( PROPS , select = c("VARIABLE", "ATTRIBS.BASE" , "ATTRIBS.COMP")) |
160 |
|
|
161 | 139x |
PROPS <- PROPS[ PROPS[["VARIABLE"]] %in% matching_cols,, drop = FALSE] |
162 |
|
|
163 |
|
|
164 |
### Setup dummy return value |
|
165 | 139x |
RETURN <- tibble( |
166 | 139x |
VARIABLE = character(), |
167 | 139x |
ATTR_NAME = character(), |
168 | 139x |
VALUES.BASE = list(), |
169 | 139x |
VALUES.COMP = list() |
170 |
) |
|
171 |
|
|
172 | 139x |
for ( i in PROPS[["VARIABLE"]] ){ |
173 |
|
|
174 | 1237x |
PROPS_filt <- PROPS[ PROPS[["VARIABLE"]] == i ,, drop=FALSE] |
175 | ||
176 |
### Get a vector of all available attributes across both variables |
|
177 | 1237x |
ATTRIB_NAMES = unique(c( |
178 | 1237x |
names(PROPS_filt[["ATTRIBS.BASE"]][[1]]) , |
179 | 1237x |
names(PROPS_filt[["ATTRIBS.COMP"]][[1]]) |
180 |
)) |
|
181 |
|
|
182 |
### If variable has no attributes move onto the next variable |
|
183 | 904x |
if ( is.null(ATTRIB_NAMES) ) next() |
184 |
|
|
185 |
### Loop over each attribute checking if they are identical and outputing |
|
186 |
### anyones that arn't |
|
187 | 333x |
for ( j in ATTRIB_NAMES){ |
188 |
|
|
189 | 574x |
ATTRIB_BASE = PROPS_filt[["ATTRIBS.BASE"]][[1]][j] |
190 | 574x |
ATTRIB_COMP = PROPS_filt[["ATTRIBS.COMP"]][[1]][j] |
191 |
|
|
192 | 574x |
if ( !identical(ATTRIB_BASE , ATTRIB_COMP) ){ |
193 |
|
|
194 | 89x |
ATT_DIFFS <- tibble( |
195 | 89x |
VARIABLE = i , |
196 | 89x |
ATTR_NAME = j , |
197 | 89x |
VALUES.BASE = ifelse( is.null(ATTRIB_BASE) , list() , ATTRIB_BASE), |
198 | 89x |
VALUES.COMP = ifelse( is.null(ATTRIB_COMP) , list() , ATTRIB_COMP) |
199 |
) |
|
200 |
|
|
201 | 89x |
RETURN <- rbind(RETURN , ATT_DIFFS) |
202 |
} |
|
203 |
} |
|
204 |
} |
|
205 | 139x |
return(RETURN) |
206 |
} |
|
207 | ||
208 | ||
209 | ||
210 | ||
211 | ||
212 |
#' identify_differences |
|
213 |
#' |
|
214 |
#' Compares each column within 2 datasets to identify any values which they |
|
215 |
#' mismatch on. |
|
216 |
#' @param BASE Base dataset for comparison (data.frame) |
|
217 |
#' @param COMP Comparator dataset to compare base against (data.frame) |
|
218 |
#' @param KEYS List of variables that define a unique row within the datasets (strings) |
|
219 |
#' @param exclude_cols Columns to exclude from comparison |
|
220 |
#' @param tolerance Level of tolerance for numeric differences between two variables |
|
221 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
222 |
identify_differences <- function( BASE , COMP , KEYS, exclude_cols, |
|
223 |
tolerance = sqrt(.Machine$double.eps), |
|
224 |
scale = NULL ) { |
|
225 |
|
|
226 | 129x |
matching_cols <- identify_matching_cols( BASE , COMP , c(KEYS, exclude_cols)) |
227 |
|
|
228 | 1x |
if( length(matching_cols) == 0 ) return ( tibble() ) |
229 |
|
|
230 | 128x |
DAT = merge( |
231 | 128x |
x = BASE , |
232 | 128x |
y = COMP , |
233 | 128x |
by = KEYS , |
234 | 128x |
suffix = c(".x", ".y"), |
235 | 128x |
sort = TRUE |
236 |
) |
|
237 | 128x |
DAT <- DAT[do.call("order", DAT[KEYS]), ] |
238 |
|
|
239 | ||
240 | 128x |
matching_list <- mapply( |
241 | 128x |
is_variable_different , |
242 | 128x |
matching_cols, |
243 | 128x |
MoreArgs = list( |
244 | 128x |
keynames = KEYS, |
245 | 128x |
datain = DAT, |
246 | 128x |
tolerance = tolerance , |
247 | 128x |
scale = scale |
248 |
), |
|
249 | 128x |
SIMPLIFY = FALSE |
250 |
) |
|
251 |
|
|
252 | 128x |
matching_list |
253 |
} |
|
254 | ||
255 | ||
256 | ||
257 | ||
258 | ||
259 | ||
260 | ||
261 | ||
262 | ||
263 |
#' identify_properties |
|
264 |
#' |
|
265 |
#' Returns a dataframe of metadata for a given dataset. |
|
266 |
#' Returned values include variable names , class , mode , type & attributes |
|
267 |
#' @param dsin input dataframe that you want to get the metadata from |
|
268 |
#' @importFrom tibble tibble |
|
269 |
identify_properties <- function(dsin){ |
|
270 |
|
|
271 |
### If missing or null return empty dataset |
|
272 | 1652x |
if( is.null(dsin) ) { |
273 | ! |
x <- tibble( |
274 | ! |
VARIABLE = character(), |
275 | ! |
CLASS = list(), |
276 | ! |
MODE = character(), |
277 | ! |
TYPE = character() , |
278 | ! |
ATTRIBS = list() |
279 |
) |
|
280 | ! |
return(x) |
281 |
} |
|
282 |
|
|
283 | 1652x |
tibble( |
284 | 1652x |
VARIABLE = names(dsin), |
285 | 1652x |
CLASS = lapply(dsin, class), |
286 | 1652x |
MODE = sapply(dsin , mode), |
287 | 1652x |
TYPE = sapply(dsin , typeof) , |
288 | 1652x |
ATTRIBS = lapply( dsin , attributes) |
289 |
) |
|
290 |
} |
|
291 | ||
292 | ||
293 | ||
294 | ||
295 | ||
296 |
1 |
#' diffdf |
|
2 |
#' @description |
|
3 |
#' Compares 2 dataframes and outputs any differences. |
|
4 |
#' @param base input dataframe |
|
5 |
#' @param compare comparison dataframe |
|
6 |
#' @param keys vector of variables (as strings) that defines a unique row in the base and compare dataframes |
|
7 |
#' @param strict_numeric Flag for strict numeric to numeric comparisons (default = TRUE). If False diffdf will cast integer to double where required for comparisons. Note that variables specified in the keys will never be casted. |
|
8 |
#' @param strict_factor Flag for strict factor to character comparisons (default = TRUE). If False diffdf will cast factors to characters where required for comparisons. Note that variables specified in the keys will never be casted. |
|
9 |
#' @param suppress_warnings Do you want to suppress warnings? (logical) |
|
10 |
#' @param file Location and name of a text file to output the results to. Setting to NULL will cause no file to be produced. |
|
11 |
#' @param tolerance Set tolerance for numeric comparisons. Note that comparisons fail if (x-y)/scale > tolerance. |
|
12 |
#' @param scale Set scale for numeric comparisons. Note that comparisons fail if (x-y)/scale > tolerance. Setting as NULL is a slightly more efficient version of scale = 1. |
|
13 |
#' @examples |
|
14 |
#' x <- subset( iris, -Species) |
|
15 |
#' x[1,2] <- 5 |
|
16 |
#' COMPARE <- diffdf( iris, x) |
|
17 |
#' print( COMPARE ) |
|
18 |
#' print( COMPARE , "Sepal.Length" ) |
|
19 |
#' |
|
20 |
#' #### Sample data frames |
|
21 |
#' |
|
22 |
#' DF1 <- data.frame( |
|
23 |
#' id = c(1,2,3,4,5,6), |
|
24 |
#' v1 = letters[1:6], |
|
25 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA) |
|
26 |
#' ) |
|
27 |
#' |
|
28 |
#' DF2 <- data.frame( |
|
29 |
#' id = c(1,2,3,4,5,7), |
|
30 |
#' v1 = letters[1:6], |
|
31 |
#' v2 = c(NA , NA , 1 , 2 , NA , NA), |
|
32 |
#' v3 = c(NA , NA , 1 , 2 , NA , 4) |
|
33 |
#' ) |
|
34 |
#' |
|
35 |
#' diffdf(DF1 , DF1 , keys = "id") |
|
36 |
#' |
|
37 |
#' # We can control matching with scale/location for example: |
|
38 |
#' |
|
39 |
#' DF1 <- data.frame( |
|
40 |
#' id = c(1,2,3,4,5,6), |
|
41 |
#' v1 = letters[1:6], |
|
42 |
#' v2 = c(1,2,3,4,5,6) |
|
43 |
#' ) |
|
44 |
#' DF2 <- data.frame( |
|
45 |
#' id = c(1,2,3,4,5,6), |
|
46 |
#' v1 = letters[1:6], |
|
47 |
#' v2 = c(1.1,2,3,4,5,6) |
|
48 |
#' ) |
|
49 |
#' |
|
50 |
#' diffdf(DF1 , DF2 , keys = "id") |
|
51 |
#' diffdf(DF1 , DF2 , keys = "id", tolerance = 0.2) |
|
52 |
#' diffdf(DF1 , DF2 , keys = "id", scale = 10, tolerance = 0.2) |
|
53 |
#' |
|
54 |
#' # We can use strict_factor to compare factors with characters for example: |
|
55 |
#' |
|
56 |
#' DF1 <- data.frame( |
|
57 |
#' id = c(1,2,3,4,5,6), |
|
58 |
#' v1 = letters[1:6], |
|
59 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA), |
|
60 |
#' stringsAsFactors = FALSE |
|
61 |
#' ) |
|
62 |
#' |
|
63 |
#' DF2 <- data.frame( |
|
64 |
#' id = c(1,2,3,4,5,6), |
|
65 |
#' v1 = letters[1:6], |
|
66 |
#' v2 = c(NA , NA , 1 , 2 , 3 , NA) |
|
67 |
#' ) |
|
68 |
#' |
|
69 |
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = TRUE) |
|
70 |
#' diffdf(DF1 , DF2 , keys = "id", strict_factor = FALSE) |
|
71 |
#' |
|
72 |
#' @export |
|
73 |
diffdf <- function ( |
|
74 |
base , |
|
75 |
compare , |
|
76 |
keys = NULL, |
|
77 |
suppress_warnings = FALSE, |
|
78 |
strict_numeric = TRUE, |
|
79 |
strict_factor = TRUE, |
|
80 |
file = NULL, |
|
81 |
tolerance = sqrt(.Machine$double.eps), |
|
82 |
scale = NULL |
|
83 |
){ |
|
84 |
|
|
85 | 134x |
BASE = base |
86 | 134x |
COMP = compare |
87 | 134x |
KEYS = keys |
88 | 134x |
SUPWARN = suppress_warnings |
89 |
|
|
90 |
|
|
91 |
### Initatiate output object |
|
92 | 134x |
COMPARE <- list() |
93 | 134x |
class(COMPARE) <- c("diffdf" , "list") |
94 |
|
|
95 | 134x |
is_derived <- FALSE |
96 |
|
|
97 |
### If no key is suplied match values based upon row number |
|
98 | 134x |
if (is.null(KEYS)){ |
99 | 122x |
is_derived <- TRUE |
100 | 122x |
keyname <- generate_keyname(BASE, COMP) |
101 | 122x |
BASE[[keyname]] <- 1:nrow(BASE) |
102 | 122x |
COMP[[keyname]] <- 1:nrow(COMP) |
103 | 122x |
KEYS <- keyname |
104 |
} |
|
105 | 134x |
attr(COMPARE, 'keys') <- list(value = KEYS, is_derived = is_derived) |
106 |
|
|
107 |
|
|
108 |
|
|
109 | 134x |
if (!is.numeric(tolerance)) { |
110 | 2x |
stop("'tolerance' should be numeric") |
111 |
} |
|
112 |
|
|
113 | 132x |
if (!is.numeric(scale) && !is.null(scale)) { |
114 | 2x |
stop("'scale' should be numeric or NULL") |
115 |
} |
|
116 |
|
|
117 | ||
118 |
|
|
119 | 130x |
if ( !has_unique_rows(BASE , KEYS) ){ |
120 | 1x |
stop( "BY variables in BASE do not result in unique observations") |
121 |
} |
|
122 |
|
|
123 | 129x |
if ( !has_unique_rows(COMP , KEYS) ){ |
124 | ! |
stop( "BY variables in COMPARE do not result in unique observations") |
125 |
} |
|
126 |
|
|
127 |
|
|
128 | ||
129 |
#### Check essential variable properties (class & mode) |
|
130 |
|
|
131 | 129x |
COMPARE[["UnsupportedColsBase"]] <- construct_issue( |
132 | 129x |
value = identify_unsupported_cols(BASE) , |
133 | 129x |
message = "There are columns in BASE with unsupported modes !!" |
134 |
) |
|
135 |
|
|
136 |
|
|
137 | 129x |
COMPARE[["UnsupportedColsComp"]] <- construct_issue( |
138 | 129x |
value = identify_unsupported_cols(COMP) , |
139 | 129x |
message = "There are columns in COMPARE with unsupported modes !!" |
140 |
) |
|
141 | ||
142 |
|
|
143 |
# cast variables if strict is off |
|
144 | 129x |
if ( !strict_factor | !strict_numeric ){ |
145 |
|
|
146 | 9x |
casted_df <- cast_variables( |
147 | 9x |
BASE = BASE, |
148 | 9x |
COMPARE = COMP, |
149 | 9x |
ignore_vars = KEYS, |
150 | 9x |
cast_integers = !strict_numeric , |
151 | 9x |
cast_factors = !strict_factor |
152 |
) |
|
153 |
|
|
154 | 9x |
BASE <- casted_df$BASE |
155 | 9x |
COMP <- casted_df$COMP |
156 |
|
|
157 |
} |
|
158 |
|
|
159 |
|
|
160 | 129x |
COMPARE[["VarModeDiffs"]] <- construct_issue( |
161 | 129x |
value = identify_mode_differences( BASE, COMP ) , |
162 | 129x |
message = "There are columns in BASE and COMPARE with different modes !!" |
163 |
) |
|
164 |
|
|
165 |
|
|
166 | 129x |
COMPARE[["VarClassDiffs"]] <- construct_issue( |
167 | 129x |
value = identify_class_differences(BASE, COMP) , |
168 | 129x |
message = "There are columns in BASE and COMPARE with different classes !!" |
169 |
) |
|
170 |
|
|
171 | ||
172 | 129x |
exclude_cols <- c( |
173 | 129x |
COMPARE[["UnsupportedColsBase"]]$VARIABLE , |
174 | 129x |
COMPARE[["UnsupportedColsComp"]]$VARIABLE, |
175 | 129x |
COMPARE[["VarClassDiffs"]]$VARIABLE, |
176 | 129x |
COMPARE[["VarModeDiffs"]]$VARIABLE |
177 |
) |
|
178 |
|
|
179 | ||
180 |
##### Check Validity of Keys |
|
181 |
|
|
182 | 129x |
BASE_keys <- names(BASE)[names(BASE) %in% KEYS] |
183 | 129x |
COMP_keys <- names(COMP)[names(COMP) %in% KEYS] |
184 |
|
|
185 |
|
|
186 | 129x |
if ( length(BASE_keys) != length(KEYS) ){ |
187 | ! |
stop( "BASE is missing variables specified in KEYS") |
188 |
} |
|
189 |
|
|
190 | 129x |
if ( length(COMP_keys) != length(KEYS) ){ |
191 | ! |
stop( "COMP is missing variables specified in KEYS") |
192 |
} |
|
193 |
|
|
194 | 129x |
if( any(KEYS %in% exclude_cols)){ |
195 | ! |
stop("KEYS are either an invalid or contain different modes between BASE and COMP") |
196 |
} |
|
197 |
|
|
198 | ||
199 |
##### Check Attributes |
|
200 | ||
201 |
|
|
202 | 129x |
COMPARE[["AttribDiffs"]] <- construct_issue( |
203 | 129x |
value = identify_att_differences(BASE, COMP , exclude_cols) , |
204 | 129x |
message = "There are columns in BASE and COMPARE with differing attributes !!" |
205 |
) |
|
206 |
|
|
207 |
|
|
208 |
##### Check data |
|
209 |
|
|
210 | 129x |
BASE <- factor_to_character(BASE , KEYS) |
211 | 129x |
COMP <- factor_to_character(COMP , KEYS) |
212 |
|
|
213 |
|
|
214 | 129x |
COMPARE[["ExtRowsBase"]] <- construct_issue( |
215 | 129x |
value = identify_extra_rows( BASE, COMP, KEYS ) , |
216 | 129x |
message = "There are rows in BASE that are not in COMPARE !!" |
217 |
) |
|
218 |
|
|
219 |
|
|
220 | 129x |
COMPARE[["ExtRowsComp"]] <- construct_issue( |
221 | 129x |
value = identify_extra_rows( COMP, BASE, KEYS ) , |
222 | 129x |
message = "There are rows in COMPARE that are not in BASE !!" |
223 |
) |
|
224 |
|
|
225 |
|
|
226 |
|
|
227 | 129x |
COMPARE[["ExtColsBase"]] <- construct_issue( |
228 | 129x |
value = identify_extra_cols(BASE, COMP) , |
229 | 129x |
message = "There are columns in BASE that are not in COMPARE !!" |
230 |
) |
|
231 |
|
|
232 |
|
|
233 | 129x |
COMPARE[["ExtColsComp"]] <- construct_issue( |
234 | 129x |
value = identify_extra_cols(COMP, BASE) , |
235 | 129x |
message = "There are columns in COMPARE that are not in BASE !!" |
236 |
) |
|
237 |
|
|
238 |
|
|
239 | 129x |
VALUE_DIFFERENCES <- identify_differences( |
240 | 129x |
BASE, COMP , KEYS, exclude_cols, |
241 | 129x |
tolerance = tolerance, |
242 | 129x |
scale = scale |
243 |
) |
|
244 | ||
245 |
|
|
246 |
|
|
247 |
## Summarise the number of mismatching rows per variable |
|
248 | ||
249 | 129x |
if ( length(VALUE_DIFFERENCES) ){ |
250 | 128x |
NDIFF <- sapply( VALUE_DIFFERENCES , nrow ) |
251 | 128x |
COMPARE[["NumDiff"]] <- construct_issue( |
252 | 128x |
value = convert_to_issue(NDIFF), |
253 | 128x |
message = "Not all Values Compared Equal" |
254 |
) |
|
255 |
} |
|
256 | ||
257 | ||
258 | 129x |
for ( i in names(VALUE_DIFFERENCES) ){ |
259 | 991x |
COMPARE[[ paste0( "VarDiff_", i)]] <- construct_issue( |
260 | 991x |
value = VALUE_DIFFERENCES[[i]] , |
261 | 991x |
message = "" |
262 |
) |
|
263 |
} |
|
264 |
|
|
265 |
## Get all issue messages, remove blank message, and collapse into single string |
|
266 | 129x |
ISSUE_MSGS <- sapply(COMPARE, function(x) get_issue_message(x)) |
267 | 129x |
ISSUE_MSGS <- ISSUE_MSGS[ ISSUE_MSGS != ""] |
268 |
|
|
269 | 129x |
if( length(ISSUE_MSGS) != 0 ){ |
270 | 95x |
if(!SUPWARN) { |
271 | 36x |
ISSUE_MSGS <- paste(ISSUE_MSGS, collapse ='\n' ) |
272 | 36x |
warning( c("\n" , ISSUE_MSGS)) |
273 |
} |
|
274 |
} |
|
275 |
|
|
276 |
|
|
277 | 129x |
if (!is.null(file)){ |
278 | ! |
x <- print(COMPARE , as_string = TRUE) |
279 |
|
|
280 | ! |
tryCatch( |
281 |
{ |
|
282 | ! |
sink(file) |
283 | ! |
cat(x, sep = "\n") |
284 | ! |
sink() |
285 |
}, |
|
286 | ! |
warning = function(w){ |
287 | ! |
sink() |
288 | ! |
warning(w) |
289 |
}, |
|
290 | ! |
error = function(e){ |
291 | ! |
sink() |
292 | ! |
stop(e) |
293 |
} |
|
294 |
) |
|
295 | ! |
return(invisible(COMPARE)) |
296 |
|
|
297 |
} |
|
298 |
|
|
299 | 129x |
return(COMPARE) |
300 |
} |
|
301 | ||
302 | ||
303 | ||
304 | ||
305 |
#' diffdf_has_issues |
|
306 |
#' |
|
307 |
#' Utility function which returns TRUE if an diffdf |
|
308 |
#' object has issues or FALSE if an diffdf object does not have issues |
|
309 |
#' @param x diffdf object |
|
310 |
#' @examples |
|
311 |
#' |
|
312 |
#' # Example with no issues |
|
313 |
#' x <- diffdf( iris, iris ) |
|
314 |
#' diffdf_has_issues(x) |
|
315 |
#' |
|
316 |
#' # Example with issues |
|
317 |
#' iris2 <- iris |
|
318 |
#' iris2[2,2] <- NA |
|
319 |
#' x <- diffdf( iris , iris2 , suppress_warnings = TRUE) |
|
320 |
#' diffdf_has_issues(x) |
|
321 |
#' @export |
|
322 |
diffdf_has_issues <- function(x){ |
|
323 | ! |
if ( class(x)[[1]] != "diffdf" ) stop( "x is not an diffdf object") |
324 | 21x |
return( length(x) != 0 ) |
325 |
} |
|
326 |
1 | ||
2 | ||
3 | ||
4 | ||
5 | ||
6 |
#' diffdf_issuerows |
|
7 |
#' |
|
8 |
#' This function takes a diffdf object and a dataframe and subsets |
|
9 |
#' the dataframe for problem rows as identified in the comparison object. |
|
10 |
#' If \code{vars} has been specified only issue rows associated with those |
|
11 |
#' variable(s) will be returned. |
|
12 |
#' @param df dataframe to be subsetted |
|
13 |
#' @param diff diffdf object |
|
14 |
#' @param vars (optional) character vector containing names of issue variables to subset dataframe |
|
15 |
#' on. A value of NULL (default) will be taken to mean available issue variables. |
|
16 |
#' @examples |
|
17 |
#' iris2 <- iris |
|
18 |
#' for ( i in 1:3) iris2[i,i] <- 99 |
|
19 |
#' x <- diffdf( iris , iris2, suppress_warnings = TRUE) |
|
20 |
#' diffdf_issuerows( iris , x) |
|
21 |
#' diffdf_issuerows( iris2 , x) |
|
22 |
#' diffdf_issuerows( iris2 , x , vars = "Sepal.Length") |
|
23 |
#' diffdf_issuerows( iris2 , x , vars = c("Sepal.Length" , "Sepal.Width")) |
|
24 |
#' @details |
|
25 |
#' Note that diffdf_issuerows can be used to subset against any dataframe. The only |
|
26 |
#' requirement is that the original variables specified in the keys argument to diffdf |
|
27 |
#' are present on the dataframe you are subsetting against. However please note that if |
|
28 |
#' no keys were specified in diffdf then the row number is used. This means using |
|
29 |
#' diffdf_issuerows without a keys against an arbitrary dataset can easily result in |
|
30 |
#' nonsense rows being returned. It is always recommended to supply keys to diffdf. |
|
31 |
#' @export |
|
32 |
diffdf_issuerows <- function( df , diff, vars = NULL){ |
|
33 | ||
34 | 19x |
if ( class(diff)[[1]] != "diffdf") { |
35 | ! |
stop("diff should be an diffdf object") |
36 |
} |
|
37 |
|
|
38 | 19x |
KEYS_ATT = attr(diff, "keys") |
39 |
|
|
40 | 19x |
if( is.null(KEYS_ATT) ) { |
41 | ! |
stop("diff is missing the keys attribute") |
42 |
} |
|
43 |
|
|
44 | 19x |
issue_vars <- names(diff)[grep( "^VarDiff_", names(diff))] |
45 |
|
|
46 | 19x |
if ( is.null(vars)){ |
47 | 15x |
vars <- issue_vars |
48 |
} else { |
|
49 | 4x |
vars <- paste0("VarDiff_" , vars) |
50 |
} |
|
51 |
|
|
52 | 19x |
if ( length(issue_vars) == 0 | sum( vars %in% issue_vars) == 0){ |
53 | 7x |
return(df[FALSE,]) |
54 |
} |
|
55 |
|
|
56 | 12x |
KEEP <- mapply( |
57 | 12x |
FUN = get_issue_dataset, |
58 | 12x |
issue = vars, |
59 | 12x |
diff = list(diff) , |
60 | 12x |
SIMPLIFY = F |
61 |
) |
|
62 |
|
|
63 | 12x |
KEEP <- recursive_reduce( KEEP , rbind) |
64 | 12x |
KEEP <- KEEP[!duplicated(KEEP),] |
65 |
|
|
66 | 12x |
if ( KEYS_ATT$is_derived ){ |
67 | 2x |
df[[KEYS_ATT$value]] <- 1:nrow(df) |
68 |
} |
|
69 | ||
70 | 12x |
keys <- KEYS_ATT$value |
71 |
|
|
72 | 12x |
if ( any( ! keys %in% names(df))){ |
73 | 1x |
stop("df does not contain all variables specified as keys in diff") |
74 |
} |
|
75 |
|
|
76 | 11x |
RET <- merge( |
77 | 11x |
x = df, |
78 | 11x |
y = KEEP, |
79 | 11x |
sort = TRUE |
80 |
) |
|
81 |
|
|
82 | 11x |
RET <- RET[do.call("order", RET[keys]), ] |
83 |
|
|
84 | 11x |
if ( KEYS_ATT$is_derived ){ |
85 | 2x |
keep_vars <- !names(RET) %in% KEYS_ATT$value |
86 | 2x |
RET <- RET[, keep_vars , drop = FALSE] |
87 |
} |
|
88 |
|
|
89 | 11x |
return(RET) |
90 |
} |
|
91 | ||
92 | ||
93 | ||
94 | ||
95 |
#' get_issue_dataset |
|
96 |
#' |
|
97 |
#' Internal function used by diffdf_issuerows to extract the dataframe |
|
98 |
#' from each a target issue. In particular it also strips off any |
|
99 |
#' non-key variables |
|
100 |
#' @param issue name of issue to extract the dataset from diff |
|
101 |
#' @param diff diffdf object which contains issues |
|
102 |
get_issue_dataset <- function(issue, diff){ |
|
103 | 20x |
issue_df <- diff[[issue]] |
104 | 20x |
keep <- names(issue_df)[ !(names(issue_df) %in% c("BASE", "COMPARE", "VARIABLE"))] |
105 | 20x |
issue_df[,keep , drop=FALSE] |
106 |
} |
1 | ||
2 | ||
3 |
#' string_pad |
|
4 |
#' |
|
5 |
#' Utility function used to replicate str_pad. Adds white space to either end |
|
6 |
#' of a string to get it to equal the desired length |
|
7 |
#' @param x string |
|
8 |
#' @param width desired length |
|
9 |
string_pad <- function(x , width){ |
|
10 | ! |
if ( nchar(x) >= width) return(x) |
11 | 814x |
width <- width - nchar(x) |
12 | 814x |
left <- paste0( rep ( " " , floor( width/2) ) , collapse= "") |
13 | 814x |
right <- paste0( rep ( " " , ceiling( width/2) ) , collapse= "") |
14 | 814x |
paste0( left , x ,right , collapse = "") |
15 |
} |
|
16 | ||
17 | ||
18 |
#' recursive_reduce |
|
19 |
#' |
|
20 |
#' Utility function used to replicated purrr::reduce. Recursively applies a |
|
21 |
#' function to a list of elements until only 1 element remains |
|
22 |
#' @param .l list of values to apply a function to |
|
23 |
#' @param .f function to apply to each each element of the list in turn |
|
24 |
#' i.e. .l[[1]] <- .f( .l[[1]] , .l[[2]]) ; .l[[1]] <- .f( .l[[1]] , .l[[3]]) |
|
25 |
recursive_reduce <- function(.l , .f){ |
|
26 | 557x |
if (length(.l) != 1){ |
27 | 377x |
.l[[2]] <- .f( .l[[1]] , .l[[2]]) |
28 | 377x |
return( recursive_reduce( .l[-1] , .f)) |
29 |
} else { |
|
30 | 180x |
return(.l[[1]]) |
31 |
} |
|
32 |
} |
|
33 | ||
34 |
#' invert |
|
35 |
#' |
|
36 |
#' Utility function used to replicated purrr::transpose. Turns a list inside |
|
37 |
#' out. |
|
38 |
#' @param x list |
|
39 |
invert <- function(x){ |
|
40 | 56x |
x2 <- list() |
41 | 56x |
cnames <- names(x) |
42 | 56x |
tnames <- names(x[[1]]) |
43 | 56x |
for ( i in tnames ){ |
44 | 168x |
x2[[i]] <- list() |
45 | 168x |
for (j in cnames){ |
46 | 537x |
x2[[i]][[j]] <- x[[j]][[i]] |
47 |
} |
|
48 |
} |
|
49 | 56x |
return(x2) |
50 |
} |
|
51 | ||
52 | ||
53 | ||
54 | ||
55 |
#' as_ascii_table |
|
56 |
#' |
|
57 |
#' This function takes a data.frame and attempts to convert it into |
|
58 |
#' a simple ascii format suitable for printing to the screen |
|
59 |
#' It is assumed all variable values have a as.character() method |
|
60 |
#' in order to cast them to character. |
|
61 |
#' @param dat Input dataset to convert into a ascii table |
|
62 |
#' @param line_prefix Symbols to prefix infront of every line of the table |
|
63 |
as_ascii_table <- function(dat, line_prefix = " "){ |
|
64 |
|
|
65 |
|
|
66 |
## Convert every value to character and crop to a suitable length |
|
67 | 56x |
dat <- as_tibble(apply(dat, c(1, 2), as_cropped_char)) |
68 |
|
|
69 | 56x |
hold <- list() |
70 | 56x |
COLS <- colnames(dat) |
71 |
|
|
72 |
### For each column extract core elements (width, values , title) and pad out |
|
73 |
### each string to be a suitable length |
|
74 | 56x |
for ( i in 1:ncol(dat)){ |
75 | 179x |
COL <- COLS[i] |
76 | 179x |
VALUES <- dat[[i]] |
77 |
|
|
78 | 179x |
JOINT <- c(COL , VALUES) |
79 | 179x |
WIDTH <- max( sapply(JOINT, nchar)) + 2 |
80 |
|
|
81 | 179x |
hold[[COL]] <- list() |
82 | 179x |
hold[[COL]]$WIDTH <- WIDTH |
83 | 179x |
hold[[COL]]$VALUES <- sapply( VALUES ,string_pad, width = WIDTH ) |
84 | 179x |
hold[[COL]]$HEADER <- sapply( COL ,string_pad, width = WIDTH ) |
85 |
} |
|
86 |
|
|
87 |
### Collapse into a single value per component ( title , values, width ) |
|
88 | 56x |
thold <- invert(hold) |
89 | 56x |
tvals <- recursive_reduce( thold$VALUES , paste0 ) |
90 | 56x |
thead <- recursive_reduce( thold$HEADER , paste0) |
91 | 56x |
twidth <- recursive_reduce( thold$WIDTH , sum) |
92 |
|
|
93 |
### Create header and footer lines |
|
94 | 56x |
TLINE <- paste0(rep("=" , twidth), collapse = "") |
95 | 56x |
LINE <- paste0(rep("-" , twidth), collapse = "") |
96 | 56x |
FVALS <- paste0(line_prefix, tvals , collapse = "\n") |
97 |
|
|
98 |
### Output table |
|
99 | 56x |
paste0( |
100 | 56x |
"\n", |
101 | 56x |
line_prefix, TLINE, "\n", |
102 | 56x |
line_prefix, thead, "\n", |
103 | 56x |
line_prefix, LINE, "\n", |
104 | 56x |
FVALS, "\n", |
105 | 56x |
line_prefix, LINE |
106 |
) |
|
107 |
} |
|
108 | ||
109 | ||
110 | ||
111 | ||
112 | ||
113 | ||
114 | ||
115 | ||
116 |
#' as_cropped_char |
|
117 |
#' |
|
118 |
#' Makes any character string above x chars |
|
119 |
#' Reduce down to a x char string with ... |
|
120 |
#' @param inval a single element value |
|
121 |
#' @param crop_at character limit |
|
122 |
as_cropped_char <- function(inval, crop_at = 30 ){ |
|
123 |
|
|
124 | 635x |
if ( is.null(inval) ){ |
125 |
|
|
126 | ! |
inval <- "<NULL>" |
127 |
|
|
128 | 635x |
} else if ( is.na(inval)){ |
129 |
|
|
130 | 1x |
inval <- "<NA>" |
131 |
|
|
132 |
} else { |
|
133 |
|
|
134 | 634x |
inval <- as.character(inval) |
135 |
|
|
136 |
} |
|
137 |
|
|
138 | 635x |
charlength <- sapply(inval, nchar) |
139 |
|
|
140 | 635x |
if (charlength > crop_at ){ |
141 |
|
|
142 | 10x |
outval <- substr(inval, 1, crop_at ) |
143 | 10x |
outval <- paste0(outval, '...') |
144 |
|
|
145 |
} else { |
|
146 |
|
|
147 | 625x |
outval <- inval |
148 |
|
|
149 |
} |
|
150 |
|
|
151 | 635x |
outval |
152 |
} |
|
153 | ||
154 | ||
155 | ||
156 |
#' get_table |
|
157 |
#' |
|
158 |
#' Generate nice looking table from a data frame |
|
159 |
#' @param dsin dataset |
|
160 |
#' @param row_limit Maximum number of rows displayed in dataset |
|
161 |
get_table <- function(dsin , row_limit = 10){ |
|
162 |
|
|
163 | 56x |
if( nrow(dsin) == 0 ) { |
164 | ! |
return("") |
165 |
} |
|
166 |
|
|
167 | 56x |
display_table <- subset(dsin , 1:nrow(dsin) < (row_limit + 1) ) |
168 |
|
|
169 | 56x |
if ( nrow(dsin) > row_limit ){ |
170 |
|
|
171 | 4x |
add_message <- paste0( |
172 | 4x |
'First ', |
173 | 4x |
row_limit, |
174 | 4x |
" of " , |
175 | 4x |
nrow(dsin), |
176 | 4x |
' rows are shown in table below' |
177 |
) |
|
178 |
|
|
179 |
} else { |
|
180 | 52x |
add_message <- 'All rows are shown in table below' |
181 |
} |
|
182 |
|
|
183 | 56x |
msg <- paste( |
184 | 56x |
c( |
185 | 56x |
add_message, |
186 | 56x |
as_ascii_table(display_table), |
187 | 56x |
'\n' |
188 |
), |
|
189 | 56x |
collapse = '\n' |
190 |
) |
|
191 |
|
|
192 | 56x |
return(msg) |
193 |
} |
|
194 | ||
195 |
1 | ||
2 |
#' Print diffdf objects |
|
3 |
#' |
|
4 |
#' Print nicely formatted version of an diffdf object |
|
5 |
#' @param x comparison object created by diffdf(). |
|
6 |
#' @param ... Additional arguments (not used) |
|
7 |
#' @param as_string Return printed message as an R character vector? |
|
8 |
#' @examples |
|
9 |
#' x <- subset( iris , -Species ) |
|
10 |
#' x[1,2] <- 5 |
|
11 |
#' COMPARE <- diffdf( iris, x) |
|
12 |
#' print( COMPARE ) |
|
13 |
#' print( COMPARE , "Sepal.Length" ) |
|
14 |
#' @export |
|
15 |
print.diffdf <- function(x, ..., as_string = FALSE){ |
|
16 | 21x |
COMPARE <- x |
17 | ||
18 | 21x |
if ( length(COMPARE) == 0 ){ |
19 | 2x |
outtext <- "No issues were found!\n" |
20 |
|
|
21 |
} else { |
|
22 |
|
|
23 | 19x |
start_text <- paste0( |
24 | 19x |
'Differences found between the objects!\n\n', |
25 | 19x |
'A summary is given below.\n\n' |
26 |
) |
|
27 | ||
28 | 19x |
end_text <- lapply(COMPARE, function(x) get_print_message(x) ) |
29 | 19x |
end_text <- paste0(unlist(end_text), collapse = "") |
30 | ||
31 | 19x |
outtext <- paste0(start_text, end_text) |
32 |
} |
|
33 |
|
|
34 | 21x |
if ( as_string){ |
35 | 21x |
return(strsplit(outtext, '\n')[[1]]) |
36 |
} else { |
|
37 | ! |
cat(outtext) |
38 | ! |
return(invisible(COMPARE)) |
39 |
} |
|
40 |
} |
|
41 | ||
42 | ||
43 |
1 |
#' construct_issue |
|
2 |
#' |
|
3 |
#' Make an s3 object with class issue and possible additional class, |
|
4 |
#' and assign other arguments to attributes |
|
5 |
#' @param value the value of the object |
|
6 |
#' @param message the value of the message attribute |
|
7 |
#' @param add_class additional class to add |
|
8 |
construct_issue <- function(value, message, add_class = NULL){ |
|
9 | 2280x |
x <- value |
10 |
|
|
11 |
### If nothing has been provided return nothing ! |
|
12 | 2074x |
if ( nrow(x) == 0 ) return(NULL) |
13 |
|
|
14 | 206x |
class(x) <- c(add_class, "issue", class(x)) |
15 | 206x |
attributes(x)[["message"]] <- message |
16 | 206x |
return(x) |
17 |
} |
|
18 | ||
19 | ||
20 |
#' get_issue_message |
|
21 |
#' |
|
22 |
#' Simple function to grab the issue message |
|
23 |
#' @param object inputted object of class issue |
|
24 |
#' @param ... other arguments |
|
25 |
get_issue_message <- function (object, ...) { |
|
26 | 206x |
return( attr(object,"message")) |
27 |
} |
|
28 | ||
29 | ||
30 |
#' get_print_message |
|
31 |
#' |
|
32 |
#' Get the required text depending on type of issue |
|
33 |
#' @param object inputted object of class issue |
|
34 |
#' @param ... other arguments |
|
35 |
get_print_message <- function (object, ...) { |
|
36 | 56x |
UseMethod("get_print_message", object) |
37 |
} |
|
38 | ||
39 | ||
40 |
#' get_print_message.default |
|
41 |
#' |
|
42 |
#' Errors, as this should only ever be given an issue |
|
43 |
#' @param object issue |
|
44 |
get_print_message.default <- function(object) { |
|
45 | ! |
stop("Error: An issue has not been provided to this function!") |
46 |
} |
|
47 | ||
48 | ||
49 |
#' get_print_message.issue |
|
50 |
#' |
|
51 |
#' Get text from a basic issue, based on the class of the value of the issue |
|
52 |
#' |
|
53 |
#' @param object an object of class issue_basic |
|
54 |
get_print_message.issue <- function(object){ |
|
55 | 56x |
paste( |
56 | 56x |
c(attr(object, "message"), get_table(object) ), |
57 | 56x |
collapse = '\n' |
58 |
) |
|
59 |
} |
|
60 | ||
61 | ||
62 | ||
63 | ||
64 | ||
65 | ||
66 | ||
67 | ||
68 | ||
69 |
1 |
#' is_variable_different |
|
2 |
#' |
|
3 |
#' This subsets the data set on the variable name, picks out differences and returns a tibble |
|
4 |
#' of differences for the given variable |
|
5 |
#' @importFrom tibble as_tibble |
|
6 |
#' @param variablename name of variable being compared |
|
7 |
#' @param keynames name of keys |
|
8 |
#' @param datain Inputted dataset with base and compare vectors |
|
9 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
10 |
#' @return A boolean vector which is T if target and current are different |
|
11 |
is_variable_different <- function (variablename, keynames, datain, ...) { |
|
12 | ||
13 | 991x |
xvar <- paste0(variablename,'.x') |
14 | 991x |
yvar <- paste0(variablename,'.y') |
15 |
|
|
16 | 991x |
if ( ! xvar %in% names(datain) | ! yvar %in% names(datain)){ |
17 | ! |
stop("Variable does not exist within input dataset") |
18 |
} |
|
19 |
|
|
20 | 991x |
target <- datain[[xvar]] |
21 | 991x |
current <- datain[[yvar]] |
22 | 991x |
outvect <- find_difference(target, current, ...) |
23 |
|
|
24 | 991x |
datain[["VARIABLE"]] <- variablename |
25 |
|
|
26 | 991x |
names(datain)[names(datain) %in% c(xvar, yvar)] <- c("BASE", "COMPARE") |
27 |
|
|
28 | 991x |
x <- as_tibble( |
29 | 991x |
subset( |
30 | 991x |
datain, |
31 | 991x |
outvect, |
32 | 991x |
select = c("VARIABLE", keynames, "BASE", "COMPARE") |
33 |
) |
|
34 |
) |
|
35 |
|
|
36 | 991x |
return(x) |
37 |
} |
|
38 | ||
39 |
#' compare_vectors |
|
40 |
#' |
|
41 |
#' Compare two vectors looking for differences |
|
42 |
#' |
|
43 |
#' @param target the base vector |
|
44 |
#' @param current a vector to compare target to |
|
45 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
46 |
compare_vectors <- function (target, current, ...) { |
|
47 | 1020x |
UseMethod("compare_vectors") |
48 |
} |
|
49 | ||
50 | ||
51 |
#' find_difference |
|
52 |
#' |
|
53 |
#' This determines if two vectors are different. It expects vectors of the same |
|
54 |
#' length and type, and is intended to be used after checks have already been done |
|
55 |
#' Initially picks out any nas (matching nas count as a match) |
|
56 |
#' Then compares remaining vector |
|
57 |
#' |
|
58 |
#' @param target the base vector |
|
59 |
#' @param current a vector to compare target to |
|
60 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
61 |
find_difference <- function (target, current, ...) { |
|
62 |
|
|
63 | 1023x |
if( length(target) != length(current)){ |
64 | 2x |
warning("Inputs are not of the same length") |
65 | 2x |
return(NULL) |
66 |
} |
|
67 |
|
|
68 | 1021x |
if( is.null(target) | is.null(current) ){ |
69 | 1x |
return( is.null(target) != is.null(current) ) |
70 |
} |
|
71 |
|
|
72 |
### Initalise output, assume problem unless evidence otherwise |
|
73 | 1020x |
return_vector <- rep(TRUE, length(target)) |
74 |
|
|
75 | 1020x |
nas_t <- is.na(target) |
76 | 1020x |
nas_c <- is.na(current) |
77 |
|
|
78 |
## compare missing values |
|
79 | 1020x |
nacompare <- nas_t != nas_c |
80 | 1020x |
naselect <- nas_t|nas_c |
81 | 1020x |
return_vector[naselect] <- nacompare[naselect] |
82 |
|
|
83 |
## compare non-missing values |
|
84 | 1020x |
selectvector <- as.logical( (!nas_t) * (!nas_c) ) |
85 |
|
|
86 | 1020x |
comparevect <- compare_vectors( |
87 | 1020x |
target[selectvector] , |
88 | 1020x |
current[selectvector], |
89 |
... |
|
90 |
) |
|
91 |
|
|
92 | 1020x |
return_vector[selectvector] <- comparevect |
93 |
|
|
94 | 1020x |
return(return_vector) |
95 |
} |
|
96 | ||
97 | ||
98 | ||
99 | ||
100 | ||
101 | ||
102 | ||
103 |
#' compare_vectors.default |
|
104 |
#' |
|
105 |
#' Default method, if the vector is not numeric or factor. Basic comparison |
|
106 |
#' @param target the base vector |
|
107 |
#' @param current a vector to compare target to |
|
108 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
109 |
compare_vectors.default <- function(target, current, ...){ |
|
110 | 445x |
target != current |
111 |
} |
|
112 | ||
113 | ||
114 | ||
115 | ||
116 |
#' compare_vectors.factor |
|
117 |
#' |
|
118 |
#' Compares factors. Sets them as character and then compares |
|
119 |
#' @param target the base vector |
|
120 |
#' @param current a vector to compare target to |
|
121 |
#' @param ... Additional arguments which might be passed through (numerical accuracy) |
|
122 |
compare_vectors.factor <- function(target, current, ...){ |
|
123 | 90x |
as.character(target) != as.character(current) |
124 |
} |
|
125 | ||
126 | ||
127 | ||
128 | ||
129 | ||
130 |
#' compare_vectors.numeric |
|
131 |
#' |
|
132 |
#' This is a modified version of the all.equal function |
|
133 |
#' which returns a vector rather than a message |
|
134 |
#' @param target the base vector |
|
135 |
#' @param current a vector to compare target to |
|
136 |
#' @param tolerance Level of tolerance for differences between two variables |
|
137 |
#' @param scale Scale that tolerance should be set on. If NULL assume absolute |
|
138 |
compare_vectors.numeric <- function( |
|
139 |
target, |
|
140 |
current, |
|
141 |
tolerance = sqrt(.Machine$double.eps), |
|
142 |
scale = NULL |
|
143 |
){ |
|
144 | ||
145 | 485x |
out <- target == current |
146 |
|
|
147 | 485x |
if (all(out)) { |
148 | 451x |
return(!out) |
149 |
} |
|
150 |
|
|
151 | 34x |
if (is.integer(target) || is.integer(current)){ |
152 | 11x |
target <- as.double(target) |
153 | 11x |
current <- as.double(current) |
154 |
} |
|
155 |
|
|
156 | 34x |
xy <- abs(target - current) |
157 |
|
|
158 | 34x |
if (!is.null(scale)) { |
159 | 2x |
xy <- xy/scale |
160 |
} |
|
161 |
|
|
162 | 34x |
return(xy > tolerance) |
163 | ||
164 |
} |
|
165 | ||
166 | ||
167 | ||
168 |
1 |
#'generate_keyname |
|
2 |
#' |
|
3 |
#'Function to generate a name for the keys if not provided |
|
4 |
#' |
|
5 |
#'@param BASE base dataset |
|
6 |
#'@param COMP comparison dataset |
|
7 |
#'@param replace_names a vector of replacement names. Used for recursion, should be edited in function for clarity |
|
8 |
#' |
|
9 |
generate_keyname <- function( |
|
10 |
BASE, |
|
11 |
COMP, |
|
12 |
replace_names = c("..ROWNUMBER..", "..RN..", "..ROWN..", "..N..") |
|
13 |
){ |
|
14 |
|
|
15 | 213x |
if ( class(replace_names) != "character"){ |
16 | ! |
stop( "replace_names is not a character vector") |
17 |
} |
|
18 |
|
|
19 | 213x |
if (length(replace_names) == 0) { |
20 | 8x |
stop("All default row names are in use in BASE/COMPARE. Please provide a KEY argument") |
21 |
} |
|
22 |
|
|
23 | 205x |
key_name <- replace_names[1] |
24 |
|
|
25 | 205x |
if (!is.null(BASE[[key_name]]) | !is.null( COMP[[key_name]])){ |
26 | 66x |
key_name <- generate_keyname(BASE, COMP, replace_names[-1]) |
27 |
} |
|
28 |
|
|
29 | 173x |
return(key_name) |
30 |
|
|
31 |
} |
1 | ||
2 |
#' sort_then_join |
|
3 |
#' |
|
4 |
#' Convenience function to sort two strings and paste them together |
|
5 |
#' @param string1 first string |
|
6 |
#' @param string2 second string |
|
7 |
sort_then_join <- function(string1, string2){ |
|
8 | 14x |
paste0(sort(c(string1, string2)), collapse = '') |
9 |
} |
|
10 | ||
11 | ||
12 |
#' class_merge |
|
13 |
#' |
|
14 |
#' Convenience function to put all classes an object has into one string |
|
15 |
#' @param x an object |
|
16 |
class_merge <- function(x){ |
|
17 | 304x |
paste(class(x), collapse = "_") |
18 |
} |
|
19 | ||
20 | ||
21 |
get_message <- function( colname , whichdat, totype){ |
|
22 | 12x |
message(paste0( |
23 | 12x |
"NOTE: Variable " , colname, " in " , tolower(whichdat) , " was casted to " , totype |
24 |
)) |
|
25 |
} |
|
26 | ||
27 | ||
28 |
#' get_casted_vector |
|
29 |
#' |
|
30 |
#' casts a vector depending on its type and input |
|
31 |
#' @param colin column to cast |
|
32 |
#' @param colname name of vector |
|
33 |
#' @param whichdat whether base or compare is being casted (used for messages) |
|
34 |
get_casted_vector <- function(colin, colname, whichdat){ |
|
35 |
|
|
36 | 24x |
if ( class(colin) == "factor"){ |
37 | 6x |
get_message( colname , whichdat , "character") |
38 | 6x |
return(as.character(colin)) |
39 |
} |
|
40 |
|
|
41 | 18x |
if ( class(colin) == "integer"){ |
42 | 6x |
get_message( colname , whichdat , "numeric") |
43 | 6x |
return(as.numeric(colin)) |
44 |
} |
|
45 |
|
|
46 | 12x |
colin |
47 |
} |
|
48 | ||
49 | ||
50 | ||
51 |
#' get_casted_dataset |
|
52 |
#' |
|
53 |
#' Internal utility function to loop across a dataset casting all target |
|
54 |
#' variables |
|
55 |
#' @param df dataset to be casted |
|
56 |
#' @param columns columns to be casted |
|
57 |
#' @param whichdat whether base or compare is being casted (used for messages) |
|
58 |
get_casted_dataset <- function(df , columns , whichdat){ |
|
59 | 22x |
for ( col in columns ){ |
60 | 24x |
df[[col]] <- get_casted_vector( df[[col]] , col , whichdat ) |
61 |
} |
|
62 | 22x |
return(df) |
63 |
} |
|
64 | ||
65 | ||
66 | ||
67 | ||
68 | ||
69 | ||
70 |
#' cast_variables |
|
71 |
#' |
|
72 |
#' Function to cast datasets columns if they have differing types |
|
73 |
#' Restricted to specific cases, currently integer and double, and character and factor |
|
74 |
#' @param BASE base dataset |
|
75 |
#' @param COMPARE comparison dataset |
|
76 |
#' @param ignore_vars Variables not to be considered for casting |
|
77 |
#' @param cast_integers Logical - Whether integers should be cased to double when compared to doubles |
|
78 |
#' @param cast_factors Logical - Whether characters should be casted to characters when compared to characters |
|
79 |
cast_variables <- function( |
|
80 |
BASE, |
|
81 |
COMPARE, |
|
82 |
ignore_vars = NULL, |
|
83 |
cast_integers = FALSE , |
|
84 |
cast_factors = FALSE |
|
85 |
){ |
|
86 | ||
87 | 13x |
allowed_class_casts <- c( "integernumeric" , "characterfactor")[c(cast_integers, cast_factors)] |
88 |
|
|
89 | 13x |
BASE_class <- data.frame( |
90 | 13x |
class_BASE = sapply(BASE, class_merge), |
91 | 13x |
colname = names(BASE), |
92 | 13x |
stringsAsFactors = FALSE |
93 |
) |
|
94 | 13x |
BASE_class <- BASE_class[!BASE_class[["colname"]] %in% ignore_vars,, drop=FALSE] |
95 |
|
|
96 |
|
|
97 | 13x |
COMPARE_class <- data.frame( |
98 | 13x |
class_COMPARE = sapply(COMPARE, class_merge), |
99 | 13x |
colname = names(COMPARE), |
100 | 13x |
stringsAsFactors = FALSE |
101 |
) |
|
102 | 13x |
COMPARE_class <- COMPARE_class[!COMPARE_class[["colname"]] %in% ignore_vars ,, drop=FALSE] |
103 |
|
|
104 | 13x |
common_class <- merge( |
105 | 13x |
x = BASE_class, |
106 | 13x |
y = COMPARE_class, |
107 | 13x |
by = "colname" |
108 |
) |
|
109 |
|
|
110 |
|
|
111 | 13x |
diff_class <- common_class[ common_class[["class_BASE"]] != common_class[["class_COMPARE"]] ,,drop=FALSE] |
112 | ||
113 |
|
|
114 | 13x |
diff_class$classmerge <- mapply( |
115 | 13x |
sort_then_join, |
116 | 13x |
diff_class$class_COMPARE, |
117 | 13x |
diff_class$class_BASE |
118 |
) |
|
119 |
|
|
120 |
|
|
121 | 13x |
cast_columns <- diff_class[ diff_class[["classmerge"]] %in% allowed_class_casts ,,drop=FALSE] |
122 |
|
|
123 |
|
|
124 | 13x |
DATASETS <- list( |
125 | 13x |
"BASE" = BASE, |
126 | 13x |
"COMPARE" = COMPARE |
127 |
) |
|
128 |
|
|
129 | ||
130 | 13x |
if( nrow(cast_columns) == 0 ){ |
131 | 2x |
return( DATASETS ) |
132 |
} |
|
133 |
|
|
134 |
|
|
135 | 11x |
for ( i in names(DATASETS)){ |
136 | 22x |
DATASETS[[i]] <- get_casted_dataset( DATASETS[[i]] , cast_columns$colname , i) |
137 |
} |
|
138 |
|
|
139 |
|
|
140 | 11x |
return(DATASETS) |
141 |
} |
|
142 | ||
143 | ||
144 | ||
145 | ||
146 | ||
147 | ||
148 | ||
149 | ||
150 | ||
151 | ||
152 | ||
153 | ||
154 | ||
155 | ||
156 | ||
157 | ||
158 | ||
159 | ||
160 | ||
161 | ||
162 | ||
163 | ||
164 | ||
165 | ||
166 | ||
167 | ||
168 | ||
169 | ||
170 | ||
171 | ||
172 | ||
173 |
1 | ||
2 | ||
3 | ||
4 |
#' factor_to_character |
|
5 |
#' |
|
6 |
#' Takes a dataframe and converts any factor variables to character |
|
7 |
#' @param dsin input dataframe |
|
8 |
#' @param vars variables to consider for conversion. Default NULL will consider |
|
9 |
#' every variable within the dataset |
|
10 |
factor_to_character <- function( dsin , vars = NULL){ |
|
11 |
|
|
12 | ! |
if ( is.null(vars) ) vars = names(dsin) |
13 |
|
|
14 | 258x |
for (var in vars){ |
15 | 272x |
if( is.factor(dsin[[var]])){ |
16 | ! |
dsin[[var]] <- as.character(dsin[[var]]) |
17 |
} |
|
18 |
} |
|
19 | 258x |
return(dsin) |
20 |
} |
|
21 | ||
22 | ||
23 | ||
24 | ||
25 |
#' has_unique_rows |
|
26 |
#' |
|
27 |
#' Check if a data sets rows are unique |
|
28 |
#' @param DAT input data set (data frame) |
|
29 |
#' @param KEYS Set of keys which should be unique |
|
30 |
has_unique_rows <- function(DAT , KEYS){ |
|
31 | 259x |
DUPS <- duplicated( subset(DAT , select= KEYS) ) |
32 | 259x |
NDUPS <- sum( DUPS) |
33 | 259x |
return( NDUPS == 0 ) |
34 |
} |
|
35 | ||
36 |
#'convert_to_issue |
|
37 |
#' |
|
38 |
#'converts the count value into the correct issue format |
|
39 |
#'@param datin data inputted |
|
40 |
#'@importFrom tibble rownames_to_column |
|
41 |
convert_to_issue <- function(datin){ |
|
42 | 128x |
datin_tibble <- tibble( |
43 | 128x |
`Variable` = names(datin), |
44 | 128x |
`No of Differences` = datin |
45 |
) |
|
46 |
|
|
47 | 128x |
datin_tibble_reduced <- datin_tibble[ datin_tibble[["No of Differences"]] > 0, , drop = FALSE] |
48 | 128x |
return(datin_tibble_reduced) |
49 |
} |
|
50 |