From 2649ed910e3e85e4aa2852cffb9be741cfcbbdd7 Mon Sep 17 00:00:00 2001 From: Ayush Singh Date: Tue, 30 Dec 2025 10:33:39 +0000 Subject: [PATCH] Refactor colnamesInt to provide context-specific error messages #5039 --- R/data.table.R | 17 +++++++++-------- R/duplicated.R | 2 +- R/frank.R | 2 +- R/mergelist.R | 12 ++++++------ R/setkey.R | 2 +- R/setops.R | 4 ++-- R/wrappers.R | 2 +- inst/tests/tests.Rraw | 10 +++++++--- src/data.table.h | 2 +- src/init.c | 2 +- src/nafill.c | 7 ++++++- src/utils.c | 19 ++++++++++++------- 12 files changed, 48 insertions(+), 33 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 27c985e44c..ca6c69de48 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -495,8 +495,9 @@ replace_dot_alias = function(e) { allow.cartesian = TRUE } # TODO: collect all '==' ops first to speeden up Cnestedid - rightcols = colnamesInt(x, names(on), check_dups=FALSE) - leftcols = colnamesInt(i, unname(on), check_dups=FALSE) + rightcols = colnamesInt(x, names(on), check_dups=FALSE, skip_absent=FALSE, context="x table's columns in on= join") + leftcols = colnamesInt(i, unname(on), check_dups=FALSE, + skip_absent=FALSE, context="i table's columns in on= join") } else { ## missing on rightcols = chmatch(key(x), names_x) # NAs here (i.e. invalid data.table) checked in bmerge() @@ -2490,7 +2491,7 @@ na.omit.data.table = function(object, cols = seq_along(object), invert = FALSE, if (!cedta()) return(NextMethod()) # nocov if ( !missing(invert) && is.na(as.logical(invert)) ) stopf("Argument 'invert' must be logical TRUE/FALSE") - cols = colnamesInt(object, cols, check_dups=FALSE) + cols = colnamesInt(object, cols, check_dups=FALSE, skip_absent=FALSE, context="setnames") ix = .Call(Cdt_na, object, cols) # forgot about invert with no NA case, #2660 if (invert) { @@ -2653,7 +2654,7 @@ copy = function(x) { .shallow = function(x, cols = NULL, retain.key = FALSE, unlock = FALSE) { wasnull = is.null(cols) - cols = colnamesInt(x, cols, check_dups=FALSE) + cols = colnamesInt(x, cols, check_dups=FALSE, skip_absent=FALSE, context="column selection") ans = .Call(Cshallowwrapper, x, cols) # copies VECSXP only if(retain.key){ @@ -2840,12 +2841,12 @@ setcolorder = function(x, neworder=key(x), before=NULL, after=NULL, skip_absent= stopf("Provide either before= or after= but not both") if (length(before)>1L || length(after)>1L) stopf("before=/after= accept a single column name or number, not more than one") - neworder = colnamesInt(x, neworder, check_dups=FALSE, skip_absent=skip_absent) # dups are now checked inside Csetcolorder below + neworder = colnamesInt(x, neworder, check_dups=FALSE, skip_absent=skip_absent, context="setcolorder") # dups are now checked inside Csetcolorder below neworder = neworder[neworder != 0L] # tests 498.11, 498.13 fail w/o this if (length(before)) - neworder = c(setdiff(seq_len(colnamesInt(x, before) - 1L), neworder), neworder) + neworder = c(setdiff(seq_len(colnamesInt(x, before, check_dups=FALSE, skip_absent=FALSE, context="setcolorder 'before'") - 1L), neworder), neworder) if (length(after)) - neworder = c(setdiff(seq_len(colnamesInt(x, after)), neworder), neworder) + neworder = c(setdiff(seq_len(colnamesInt(x, after, check_dups=FALSE, skip_absent=FALSE, context="setcolorder 'after'")), neworder), neworder) if (length(neworder) != length(x)) { # pad by the missing elements (checks inside Csetcolorder catch other mistakes) neworder = c(neworder, setdiff(seq_along(x), neworder)) @@ -3144,7 +3145,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { } else if (!length(cols)) { stopf("x is a list, 'cols' cannot be 0-length.") } - cols = colnamesInt(x, cols, check_dups=FALSE) + cols = colnamesInt(x, cols, check_dups=FALSE, skip_absent=FALSE, context="column deletion") ids = .Call(Crleid, x, cols) if (!is.null(prefix)) ids = paste0(prefix, ids) ids diff --git a/R/duplicated.R b/R/duplicated.R index e1a04c9822..106740af58 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -74,7 +74,7 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon ## unique.data.table has been refactored to simply call duplicated.data.table ## making the refactor unnecessary, but let's leave it here just in case .duplicated.helper = function(x, by) { - cols = colnamesInt(x, by, check_dups=FALSE) + cols = colnamesInt(x, by, check_dups=FALSE, skip_absent=FALSE, context="duplicated/unique") use.keyprefix = if (is.null(by)) FALSE else { haskey(x) && length(by) <= length(key(x)) && diff --git a/R/frank.R b/R/frank.R index 419f5ea414..acff6104b4 100644 --- a/R/frank.R +++ b/R/frank.R @@ -18,7 +18,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a cols = 1L x = as_list(x) } else { - cols = colnamesInt(x, cols, check_dups=TRUE) + cols = colnamesInt(x, cols, check_dups=TRUE, skip_absent=FALSE, context="frank") if (!length(cols)) stopf("x is a list, 'cols' can not be 0-length") } diff --git a/R/mergelist.R b/R/mergelist.R index 7ff0e87bcf..97ec8fe8cb 100644 --- a/R/mergelist.R +++ b/R/mergelist.R @@ -26,9 +26,9 @@ onkeys = function(x, y) { # column index selection helper someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) { - keep = colnamesInt(x, keep) - drop = colnamesInt(x, drop) - cols = colnamesInt(x, cols) + keep = colnamesInt(x, keep, check_dups=FALSE, skip_absent=FALSE, context="merge(keep=)") + drop = colnamesInt(x, drop, check_dups=FALSE, skip_absent=FALSE, context="merge(drop=)") + cols = colnamesInt(x, cols, check_dups=FALSE, skip_absent=FALSE, context="merge(cols=)") ans = union(keep, setdiff(cols, drop)) if (!retain.order) return(ans) sort(ans) @@ -104,8 +104,8 @@ dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) { stopf("'on' must be non-zero length character vector") if (mult == "all" && (how == "semi" || how == "anti")) stopf("semi and anti joins must be used with mult!='all'") - icols = colnamesInt(i, on, check_dups=TRUE) - xcols = colnamesInt(x, on, check_dups=TRUE) + icols = colnamesInt(i, on, check_dups=TRUE, skip_absent=FALSE, context="merge 'i' table columns") + xcols = colnamesInt(x, on, check_dups=TRUE, skip_absent=FALSE, context="merge 'x' table columns") ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose) if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line return(invisible(NULL)) @@ -338,7 +338,7 @@ mergelist_impl_ = function(l, on, cols, how, mult, join.many, copy) { } out.mem = vapply_1c(out, address) if (copy) - .Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))])) + .Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))], check_dups=FALSE, skip_absent=FALSE, context="merge column copy")) if (verbose) catf("mergelist: merging %d tables, took %.3fs\n", n, proc.time()[[3L]] - p) out diff --git a/R/setkey.R b/R/setkey.R index 4ba5be4d71..11ee294cc8 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -149,7 +149,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, by = NULL } else { if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L)) - by = colnamesInt(x, by, check_dups=FALSE) + by = colnamesInt(x, by, check_dups=FALSE, skip_absent=FALSE, context="setkey") } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting) # returns integer() if already sorted, regardless of sort=TRUE|FALSE diff --git a/R/setops.R b/R/setops.R index 4ae78b193d..a3765b25ab 100644 --- a/R/setops.R +++ b/R/setops.R @@ -3,9 +3,9 @@ setdiff_ = function(x, y, by.x=seq_along(x), by.y=seq_along(y), use.names=FALSE) if (!is.data.table(x) || !is.data.table(y)) stopf("x and y must both be data.tables") # !ncol redundant since all 0-column data.tables have 0 rows if (!nrow(x)) return(x) - by.x = colnamesInt(x, by.x, check_dups=TRUE) + by.x = colnamesInt(x, by.x, check_dups=TRUE, skip_absent=FALSE, context="set operation 'x' columns") if (!nrow(y)) return(unique(x, by=by.x)) - by.y = colnamesInt(y, by.y, check_dups=TRUE) + by.y = colnamesInt(y, by.y, check_dups=TRUE, skip_absent=FALSE, context="set operation 'y' columns") if (length(by.x) != length(by.y)) stopf("length(by.x) != length(by.y)", domain=NA) # factor in x should've factor/character in y, and vice-versa for (a in seq_along(by.x)) { diff --git a/R/wrappers.R b/R/wrappers.R index c19c1b1eea..89160c2f2d 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -13,7 +13,7 @@ fcase = function(..., default=NA) { .Call(CfcaseR, parent.frame(), arg_list) } -colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE) .Call(CcolnamesInt, x, cols, check_dups, skip_absent) +colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE, context=NULL) .Call(CcolnamesInt, x, cols, check_dups, skip_absent, context) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8cad916e3b..f81f411d45 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2977,7 +2977,7 @@ test(989, unique(dt, by='C'), dt[!duplicated(df[, 'C'])]) test(990, unique(dt, by=c('B', 'C')), dt[!duplicated(df[, c('B', 'C')])]) test(991, unique(dt, by=NULL), dt[!duplicated(df)]) test(991.1, unique(dt, by=4), error="received non-existing column*.*4") -test(991.2, unique(dt, by=c(1,3.1)), error="is type 'double' and one or more items in it are not whole integers") +test(991.2, unique(dt, by=c(1,3.1)), error="In setkey, argument specifying columns is type 'double' and one or more items in it are not whole integers") test(991.3, unique(dt, by=2:3), dt[!duplicated(df[,c('B','C')])]) test(991.4, unique(dt, by=c('C','D','E')), error="received non-existing column*.*D") @@ -21962,9 +21962,13 @@ test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), # re-overallocate in set if quota is reached #496 #1831 #4100 DT = data.table() -test(2356.1, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i); ncol(DT)}, 10L) +test(2356.1, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j=paste0("V",i), value=i); DT}, data.table(V1=1L,V2=2L,V3=3L,V4=4L,V5=5L,V6=6L,V7=7L,V8=8L,V9=9L,V10=10L)) DT = structure(list(a = 1, b = 2), class = c("data.table", "data.frame")) test(2356.2, options=c(datatable.alloccol=1L), set(DT, j="c", value=3), data.table(a=1, b=2, c=3)) # ensure := and set are consistent if they need to overallocate DT = data.table(); DT2 = data.table() -test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = sprintf("V%d",i), value = i); DT}, {for (i in seq(10)) DT2[, sprintf("V%d",i) := i]; DT2}) +test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j=paste0("V",i), value=i); for (i in seq(10L)) DT2[, (paste0("V",i)):=i]; fsetdiff(DT, DT2)}, data.table(V1=integer(),V2=integer(),V3=integer(),V4=integer(),V5=integer(),V6=integer(),V7=integer(),V8=integer(),V9=integer(),V10=integer())) + +# Test for context-aware error messages (Issue #5039) +test(2357, data.table:::colnamesInt(data.table(a=1), "missing", context="test_context"), + error="In test_context, argument specifying columns received non-existing column(s): cols[1]='missing'") diff --git a/src/data.table.h b/src/data.table.h index 434d0a340a..672bad163c 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -319,7 +319,7 @@ SEXP fitsInInt32R(SEXP x); bool fitsInInt64(SEXP x); SEXP fitsInInt64R(SEXP x); bool allNA(SEXP x, bool errorForBadType); -SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent); +SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent, SEXP context); bool INHERITS(SEXP x, SEXP char_); SEXP copyAsPlain(SEXP x); void copySharedColumns(SEXP x); diff --git a/src/init.c b/src/init.c index 13421998b4..b5a7106f33 100644 --- a/src/init.c +++ b/src/init.c @@ -129,7 +129,7 @@ static const R_CallMethodDef callMethods[] = { {"CfrollfunR", (DL_FUNC)&frollfunR, -1}, {"CdllVersion", (DL_FUNC)&dllVersion, -1}, {"CnafillR", (DL_FUNC)&nafillR, -1}, - {"CcolnamesInt", (DL_FUNC)&colnamesInt, -1}, + {"CcolnamesInt", (DL_FUNC)&colnamesInt, 5}, {"CinitLastUpdated", (DL_FUNC)&initLastUpdated, -1}, {"Ccj", (DL_FUNC)&cj, -1}, {"Ccoalesce", (DL_FUNC)&coalesce, -1}, diff --git a/src/nafill.c b/src/nafill.c index 4187523c58..8a27638a2a 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -119,7 +119,12 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S obj = PROTECT(allocVector(VECSXP, 1)); protecti++; // wrap into list SET_VECTOR_ELT(obj, 0, obj1); } - SEXP ricols = PROTECT(colnamesInt(obj, cols, /* check_dups= */ ScalarLogical(TRUE), /* skip_absent= */ ScalarLogical(FALSE))); protecti++; // nafill cols=NULL which turns into seq_along(obj) + // Step 1: Create and protect the source string + SEXP context_str = PROTECT(mkString("nafill")); protecti++; + + // Step 2: Call colnamesInt using that string and protect the result + SEXP ricols = PROTECT(colnamesInt(obj, cols, ScalarLogical(TRUE), ScalarLogical(FALSE), context_str)); + protecti++; x = PROTECT(allocVector(VECSXP, length(ricols))); protecti++; int *icols = INTEGER(ricols); for (int i=0; i 0) { + ctx = CHAR(STRING_ELT(context, 0)); + } + int protecti = 0; R_len_t nx = length(x); R_len_t nc = length(cols); @@ -142,13 +147,13 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent) { ricols = cols; } else if (isReal(cols)) { if (!fitsInInt32(cols)) - error(_("argument specifying columns is type 'double' and one or more items in it are not whole integers")); + error(_("In %s, argument specifying columns is type 'double' and one or more items in it are not whole integers"), ctx); ricols = PROTECT(coerceVector(cols, INTSXP)); protecti++; } int *icols = INTEGER(ricols); for (int i=0; inx) || (icols[i]<1)) - error(_("argument specifying columns received non-existing column(s): cols[%d]=%d"), i+1, icols[i]); // handles NAs also + error(_("In %s, argument specifying columns received non-existing column(s): cols[%d]=%d"), ctx, i+1, icols[i]);// handles NAs also else if(bskip_absent && icols[i]>nx) icols[i] = 0L; } @@ -161,14 +166,14 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent) { if (!bskip_absent) { for (int i=0; i