diff --git a/R/bmerge.R b/R/bmerge.R index f7eafd047..7505f1526 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -35,102 +35,99 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } if (nrow(i)) { - x_merge_types = vapply_1c(x[0L, ..xcols], getClass) - i_merge_types = vapply_1c(x[0L, ..icols], getClass) - xnames = paste0("x.", names(x)[xcols]) - inames = paste0("i.", names(i)[icols]) - for (a in seq_along(icols)) { - # - check that join columns have compatible types - # - do type coercions if necessary on just the shallow local copies for the purpose of join - # - handle factor columns appropriately - # Note that if i is keyed, if this coerces i's key gets dropped by set() - ic = icols[a] - xc = xcols[a] - x_merge_type = x_merge_types[a] - i_merge_type = i_merge_types[a] - xname = xnames[a] - iname = inames[a] - if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type) - if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type) - if (x_merge_type=="factor" || i_merge_type=="factor") { - if (roll!=0.0 && a==length(icols)) - stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname) - if (x_merge_type=="factor" && i_merge_type=="factor") { - if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) - set(i, j=ic, value=chmatch(levels(i[[ic]]), levels(x[[xc]]), nomatch=0L)[i[[ic]]]) # nomatch=0L otherwise a level that is missing would match to NA values - next - } else { - if (x_merge_type=="character") { - if (verbose) catf("Coercing factor column %s to type character to match type of %s.\n", iname, xname) - set(i, j=ic, value=val<-as.character(i[[ic]])) - set(callersi, j=ic, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 - next - } else if (i_merge_type=="character") { - if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname) - newvalue = chmatch(i[[ic]], levels(x[[xc]]), nomatch=0L) - if (anyNA(i[[ic]])) newvalue[is.na(i[[ic]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809 - set(i, j=ic, value=newvalue) - next - } - } - stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type) - } - if (x_merge_type == i_merge_type) { - if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname) + xhead = x[0, ..xcols] + ihead = i[0, ..icols] + xtypes = vapply_1c(xhead, getClass) + itypes = vapply_1c(ihead, getClass) + for (a in seq_along(icols)) { + # - check that join columns have compatible types + # - do type coercions if necessary on just the shallow local copies for the purpose of join + # - handle factor columns appropriately + # Note that if i is keyed, if this coerces i's key gets dropped by set() + ic = icols[a] + xc = xcols[a] + xtype = xtypes[a] + itype = itypes[a] + xname = paste0("x.", names(xhead)[a]) + iname = paste0("i.", names(ihead)[a]) + if (!xtype %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, xtype) + if (!itype %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, itype) + if (xtype=="factor" || itype=="factor") { + if (roll!=0.0 && a==length(icols)) + stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname) + if (xtype=="factor" && itype=="factor") { + if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) + set(i, j=ic, value=chmatch(levels(i[[ic]]), levels(x[[xc]]), nomatch=0L)[i[[ic]]]) # nomatch=0L otherwise a level that is missing would match to NA values + next + } else { + if (xtype=="character") { + if (verbose) catf("Coercing factor column %s to type character to match type of %s.\n", iname, xname) + set(i, j=ic, value=val<-as.character(i[[ic]])) + set(callersi, j=ic, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 + next + } else if (itype=="character") { + if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname) + newvalue = chmatch(i[[ic]], levels(x[[xc]]), nomatch=0L) + if (anyNA(i[[ic]])) newvalue[is.na(i[[ic]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809 + set(i, j=ic, value=newvalue) next } - if (x_merge_type=="character" || i_merge_type=="character" || - x_merge_type=="logical" || i_merge_type=="logical" || - x_merge_type=="factor" || i_merge_type=="factor") { - if (anyNA(i[[ic]]) && allNA(i[[ic]])) { - if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", iname, i_merge_type, x_merge_type, xname) - set(i, j=ic, value=match.fun(paste0("as.", x_merge_type))(i[[ic]])) - next - } - else if (anyNA(x[[xc]]) && allNA(x[[xc]])) { - if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", xname, x_merge_type, i_merge_type, iname) - set(x, j=xc, value=match.fun(paste0("as.", i_merge_type))(x[[xc]])) - next - } - stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type) - } - if (x_merge_type=="integer64" || i_merge_type=="integer64") { - nm = c(iname, xname) - if (x_merge_type=="integer64") { w=i; wc=ic; wclass=i_merge_type; } else { w=x; wc=xc; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce - if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) { - if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L]) - set(w, j=wc, value=bit64::as.integer64(w[[wc]])) - } else stopf("Incompatible join types: %s is type integer64 but %s is type double and contains fractions", nm[2L], nm[1L]) + } + stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, xtype, iname, itype) + } + if (xtype == itype) { + if (anyDuplicated(icols) && !all() && duplicated(icols, fromLast=TRUE)[a]) { + set(x, j=xc, value=as.double(x[[xc]])) + set(i, j=ic, value=as.double(i[[ic]])) + if (verbose) catf("%s and %s are both Dates. R does not guarentee a type for Date internally, hence, coercing to double.\n", iname, xname) + } else { + if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, xtype, xname) + } + next + } + if (xtype=="character" || itype=="character" || + xtype=="logical" || itype=="logical" || + xtype=="factor" || itype=="factor") { + if (anyNA(i[[ic]]) && allNA(i[[ic]])) { + if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", iname, itype, xtype, xname) + set(i, j=ic, value=match.fun(paste0("as.", xtype))(i[[ic]])) + next + } + else if (anyNA(x[[xc]]) && allNA(x[[xc]])) { + if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", xname, xtype, itype, iname) + set(x, j=xc, value=match.fun(paste0("as.", itype))(x[[xc]])) + next + } + stopf("Incompatible join types: %s (%s) and %s (%s)", xname, xtype, iname, itype) + } + if (xtype=="integer64" || itype=="integer64") { + nm = c(iname, xname) + if (xtype=="integer64") { w=i; wc=ic; wclass=itype; } else { w=x; wc=xc; wclass=xtype; nm=rev(nm) } # w is which to coerce + if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) { + if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L]) + set(w, j=wc, value=bit64::as.integer64(w[[wc]])) + } else stopf("Incompatible join types: %s is type integer64 but %s is type double and contains fractions", nm[2L], nm[1L]) + } else { + # just integer and double left + if (itype=="double") { + if (!isReallyReal(i[[ic]])) { + # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys + # we've always coerced to int and returned int, for convenience. + if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s.\n", iname, xname) + val = as.integer(i[[ic]]) + if (!is.null(attributes(i[[ic]]))) attributes(val) = attributes(i[[ic]]) # to retain Date for example; 3679 + set(i, j=ic, value=val) + set(callersi, j=ic, value=val) # change the shallow copy of i up in [.data.table to reflect in the result, too. } else { - # just integer and double left - if (i_merge_type=="double") { - if (!isReallyReal(i[[ic]])) { - # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys - # we've always coerced to int and returned int, for convenience. - if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s.\n", iname, xname) - val = as.integer(i[[ic]]) - if (!is.null(attributes(i[[ic]]))) attributes(val) = attributes(i[[ic]]) # to retain Date for example; 3679 - set(i, j=ic, value=val) - set(callersi, j=ic, value=val) # change the shallow copy of i up in [.data.table to reflect in the result, too. - } else { - if (verbose) catf("Coercing integer column %s to type double to match type of %s which contains fractions.\n", xname, iname) - set(x, j=xc, value=as.double(x[[xc]])) - } - } else { - if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", iname, xname) - set(i, j=ic, value=as.double(i[[ic]])) - ic_idx = which(ic == icols) - if (length(ic_idx)>1) { - for (b in which(x_merge_types[ic_idx] != "double")) { - xb = xcols[b] - if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", xnames[b], xname) - set(x, j=xb, value=as.double(x[[xb]])) - } - } - } + if (verbose) catf("Coercing integer column %s to type double to match type of %s which contains fractions.\n", xname, iname) + set(x, j=xc, value=as.double(x[[xc]])) } + } else { + if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", iname, xname) + set(i, j=ic, value=as.double(i[[ic]])) } } +}} ## after all modifications of x, check if x has a proper key on all xcols. ## If not, calculate the order. Also for non-equi joins, the order must be calculated.