From 2649ed910e3e85e4aa2852cffb9be741cfcbbdd7 Mon Sep 17 00:00:00 2001
From: Ayush Singh <rawatsinghayush9720@gmail.com>
Date: Tue, 30 Dec 2025 10:33:39 +0000
Subject: [PATCH] Refactor colnamesInt to provide context-specific error
 messages #5039

---
 R/data.table.R        | 17 +++++++++--------
 R/duplicated.R        |  2 +-
 R/frank.R             |  2 +-
 R/mergelist.R         | 12 ++++++------
 R/setkey.R            |  2 +-
 R/setops.R            |  4 ++--
 R/wrappers.R          |  2 +-
 inst/tests/tests.Rraw | 10 +++++++---
 src/data.table.h      |  2 +-
 src/init.c            |  2 +-
 src/nafill.c          |  7 ++++++-
 src/utils.c           | 19 ++++++++++++-------
 12 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index 27c985e44c..ca6c69de48 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -495,8 +495,9 @@ replace_dot_alias = function(e) {
           allow.cartesian = TRUE
         }
         # TODO: collect all '==' ops first to speeden up Cnestedid
-        rightcols = colnamesInt(x, names(on), check_dups=FALSE)
-        leftcols  = colnamesInt(i, unname(on), check_dups=FALSE)
+        rightcols = colnamesInt(x, names(on), check_dups=FALSE, skip_absent=FALSE, context="x table's columns in on= join")
+        leftcols = colnamesInt(i, unname(on), check_dups=FALSE,
+                                skip_absent=FALSE, context="i table's columns in on= join")
       } else {
         ## missing on
         rightcols = chmatch(key(x), names_x)   # NAs here (i.e. invalid data.table) checked in bmerge()
@@ -2490,7 +2491,7 @@ na.omit.data.table = function(object, cols = seq_along(object), invert = FALSE,
   if (!cedta()) return(NextMethod()) # nocov
   if ( !missing(invert) && is.na(as.logical(invert)) )
     stopf("Argument 'invert' must be logical TRUE/FALSE")
-  cols = colnamesInt(object, cols, check_dups=FALSE)
+  cols = colnamesInt(object, cols, check_dups=FALSE, skip_absent=FALSE, context="setnames")
   ix = .Call(Cdt_na, object, cols)
   # forgot about invert with no NA case, #2660
   if (invert) {
@@ -2653,7 +2654,7 @@ copy = function(x) {
 
 .shallow = function(x, cols = NULL, retain.key = FALSE, unlock = FALSE) {
   wasnull = is.null(cols)
-  cols = colnamesInt(x, cols, check_dups=FALSE)
+  cols = colnamesInt(x, cols, check_dups=FALSE, skip_absent=FALSE, context="column selection")
   ans = .Call(Cshallowwrapper, x, cols)  # copies VECSXP only
 
   if(retain.key){
@@ -2840,12 +2841,12 @@ setcolorder = function(x, neworder=key(x), before=NULL, after=NULL, skip_absent=
     stopf("Provide either before= or after= but not both")
   if (length(before)>1L || length(after)>1L)
     stopf("before=/after= accept a single column name or number, not more than one")
-  neworder = colnamesInt(x, neworder, check_dups=FALSE, skip_absent=skip_absent)  # dups are now checked inside Csetcolorder below
+  neworder = colnamesInt(x, neworder, check_dups=FALSE, skip_absent=skip_absent, context="setcolorder")  # dups are now checked inside Csetcolorder below
   neworder = neworder[neworder != 0L] # tests 498.11, 498.13 fail w/o this
   if (length(before))
-    neworder = c(setdiff(seq_len(colnamesInt(x, before) - 1L), neworder), neworder)
+    neworder = c(setdiff(seq_len(colnamesInt(x, before, check_dups=FALSE, skip_absent=FALSE, context="setcolorder 'before'") - 1L), neworder), neworder)
   if (length(after))
-    neworder = c(setdiff(seq_len(colnamesInt(x, after)), neworder), neworder)
+    neworder = c(setdiff(seq_len(colnamesInt(x, after, check_dups=FALSE, skip_absent=FALSE, context="setcolorder 'after'")), neworder), neworder)
   if (length(neworder) != length(x)) {
     # pad by the missing elements (checks inside Csetcolorder catch other mistakes)
     neworder = c(neworder, setdiff(seq_along(x), neworder))
@@ -3144,7 +3145,7 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) {
   } else if (!length(cols)) {
     stopf("x is a list, 'cols' cannot be 0-length.")
   }
-  cols = colnamesInt(x, cols, check_dups=FALSE)
+  cols = colnamesInt(x, cols, check_dups=FALSE, skip_absent=FALSE, context="column deletion")
   ids = .Call(Crleid, x, cols)
   if (!is.null(prefix)) ids = paste0(prefix, ids)
   ids
diff --git a/R/duplicated.R b/R/duplicated.R
index e1a04c9822..106740af58 100644
--- a/R/duplicated.R
+++ b/R/duplicated.R
@@ -74,7 +74,7 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon
 ## unique.data.table has been refactored to simply call duplicated.data.table
 ## making the refactor unnecessary, but let's leave it here just in case
 .duplicated.helper = function(x, by) {
-  cols = colnamesInt(x, by, check_dups=FALSE)
+  cols = colnamesInt(x, by, check_dups=FALSE, skip_absent=FALSE, context="duplicated/unique")
   use.keyprefix = if (is.null(by)) FALSE else {
     haskey(x) &&
       length(by) <= length(key(x)) &&
diff --git a/R/frank.R b/R/frank.R
index 419f5ea414..acff6104b4 100644
--- a/R/frank.R
+++ b/R/frank.R
@@ -18,7 +18,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a
     cols = 1L
     x = as_list(x)
   } else {
-    cols = colnamesInt(x, cols, check_dups=TRUE)
+    cols = colnamesInt(x, cols, check_dups=TRUE, skip_absent=FALSE, context="frank")
     if (!length(cols))
       stopf("x is a list, 'cols' can not be 0-length")
   }
diff --git a/R/mergelist.R b/R/mergelist.R
index 7ff0e87bcf..97ec8fe8cb 100644
--- a/R/mergelist.R
+++ b/R/mergelist.R
@@ -26,9 +26,9 @@ onkeys = function(x, y) {
 
 # column index selection helper
 someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
-  keep = colnamesInt(x, keep)
-  drop = colnamesInt(x, drop)
-  cols = colnamesInt(x, cols)
+  keep = colnamesInt(x, keep, check_dups=FALSE, skip_absent=FALSE, context="merge(keep=)")
+  drop = colnamesInt(x, drop, check_dups=FALSE, skip_absent=FALSE, context="merge(drop=)")
+  cols = colnamesInt(x, cols, check_dups=FALSE, skip_absent=FALSE, context="merge(cols=)")
   ans = union(keep, setdiff(cols, drop))
   if (!retain.order) return(ans)
   sort(ans)
@@ -104,8 +104,8 @@ dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
       stopf("'on' must be non-zero length character vector")
     if (mult == "all" && (how == "semi" || how == "anti"))
       stopf("semi and anti joins must be used with mult!='all'")
-    icols = colnamesInt(i, on, check_dups=TRUE)
-    xcols = colnamesInt(x, on, check_dups=TRUE)
+    icols = colnamesInt(i, on, check_dups=TRUE, skip_absent=FALSE, context="merge 'i' table columns")
+    xcols = colnamesInt(x, on, check_dups=TRUE, skip_absent=FALSE, context="merge 'x' table columns")
     ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
     if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line
       return(invisible(NULL))
@@ -338,7 +338,7 @@ mergelist_impl_ = function(l, on, cols, how, mult, join.many, copy) {
   }
   out.mem = vapply_1c(out, address)
   if (copy)
-    .Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))]))
+    .Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))], check_dups=FALSE, skip_absent=FALSE, context="merge column copy"))
   if (verbose)
     catf("mergelist: merging %d tables, took %.3fs\n", n, proc.time()[[3L]] - p)
   out
diff --git a/R/setkey.R b/R/setkey.R
index 4ba5be4d71..11ee294cc8 100644
--- a/R/setkey.R
+++ b/R/setkey.R
@@ -149,7 +149,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE,
     by = NULL
   } else {
     if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L))
-    by = colnamesInt(x, by, check_dups=FALSE)
+    by = colnamesInt(x, by, check_dups=FALSE, skip_absent=FALSE, context="setkey")
   }
   order = as.integer(order) # length and contents of order being +1/-1 is checked at C level
   .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting)  # returns integer() if already sorted, regardless of sort=TRUE|FALSE
diff --git a/R/setops.R b/R/setops.R
index 4ae78b193d..a3765b25ab 100644
--- a/R/setops.R
+++ b/R/setops.R
@@ -3,9 +3,9 @@ setdiff_ = function(x, y, by.x=seq_along(x), by.y=seq_along(y), use.names=FALSE)
   if (!is.data.table(x) || !is.data.table(y)) stopf("x and y must both be data.tables")
   # !ncol redundant since all 0-column data.tables have 0 rows
   if (!nrow(x)) return(x)
-  by.x = colnamesInt(x, by.x, check_dups=TRUE)
+  by.x = colnamesInt(x, by.x, check_dups=TRUE, skip_absent=FALSE, context="set operation 'x' columns")
   if (!nrow(y)) return(unique(x, by=by.x))
-  by.y = colnamesInt(y, by.y, check_dups=TRUE)
+  by.y = colnamesInt(y, by.y, check_dups=TRUE, skip_absent=FALSE, context="set operation 'y' columns")
   if (length(by.x) != length(by.y)) stopf("length(by.x) != length(by.y)", domain=NA)
   # factor in x should've factor/character in y, and vice-versa
   for (a in seq_along(by.x)) {
diff --git a/R/wrappers.R b/R/wrappers.R
index c19c1b1eea..89160c2f2d 100644
--- a/R/wrappers.R
+++ b/R/wrappers.R
@@ -13,7 +13,7 @@ fcase   = function(..., default=NA) {
   .Call(CfcaseR, parent.frame(), arg_list)
 }
 
-colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE) .Call(CcolnamesInt, x, cols, check_dups, skip_absent)
+colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE, context=NULL) .Call(CcolnamesInt, x, cols, check_dups, skip_absent, context)
 
 testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L])
 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 8cad916e3b..f81f411d45 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -2977,7 +2977,7 @@ test(989, unique(dt, by='C'), dt[!duplicated(df[, 'C'])])
 test(990, unique(dt, by=c('B', 'C')), dt[!duplicated(df[, c('B', 'C')])])
 test(991, unique(dt, by=NULL), dt[!duplicated(df)])
 test(991.1, unique(dt, by=4), error="received non-existing column*.*4")
-test(991.2, unique(dt, by=c(1,3.1)), error="is type 'double' and one or more items in it are not whole integers")
+test(991.2, unique(dt, by=c(1,3.1)), error="In setkey, argument specifying columns is type 'double' and one or more items in it are not whole integers")
 test(991.3, unique(dt, by=2:3), dt[!duplicated(df[,c('B','C')])])
 test(991.4, unique(dt, by=c('C','D','E')), error="received non-existing column*.*D")
 
@@ -21962,9 +21962,13 @@ test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"),
 
 # re-overallocate in set if quota is reached #496 #1831 #4100
 DT = data.table()
-test(2356.1, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = paste0("V",i), value = i); ncol(DT)}, 10L)
+test(2356.1, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j=paste0("V",i), value=i); DT}, data.table(V1=1L,V2=2L,V3=3L,V4=4L,V5=5L,V6=6L,V7=7L,V8=8L,V9=9L,V10=10L))
 DT = structure(list(a = 1, b = 2), class = c("data.table", "data.frame"))
 test(2356.2, options=c(datatable.alloccol=1L), set(DT, j="c", value=3), data.table(a=1, b=2, c=3))
 # ensure := and set are consistent if they need to overallocate
 DT = data.table(); DT2 = data.table()
-test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j = sprintf("V%d",i), value = i); DT}, {for (i in seq(10)) DT2[, sprintf("V%d",i) := i]; DT2})
+test(2356.3, options=c(datatable.alloccol=1L), {for (i in seq(10L)) set(DT, j=paste0("V",i), value=i); for (i in seq(10L)) DT2[, (paste0("V",i)):=i]; fsetdiff(DT, DT2)}, data.table(V1=integer(),V2=integer(),V3=integer(),V4=integer(),V5=integer(),V6=integer(),V7=integer(),V8=integer(),V9=integer(),V10=integer()))
+
+# Test for context-aware error messages (Issue #5039)
+test(2357, data.table:::colnamesInt(data.table(a=1), "missing", context="test_context"), 
+     error="In test_context, argument specifying columns received non-existing column(s): cols[1]='missing'")
diff --git a/src/data.table.h b/src/data.table.h
index 434d0a340a..672bad163c 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -319,7 +319,7 @@ SEXP fitsInInt32R(SEXP x);
 bool fitsInInt64(SEXP x);
 SEXP fitsInInt64R(SEXP x);
 bool allNA(SEXP x, bool errorForBadType);
-SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent);
+SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent, SEXP context);
 bool INHERITS(SEXP x, SEXP char_);
 SEXP copyAsPlain(SEXP x);
 void copySharedColumns(SEXP x);
diff --git a/src/init.c b/src/init.c
index 13421998b4..b5a7106f33 100644
--- a/src/init.c
+++ b/src/init.c
@@ -129,7 +129,7 @@ static const R_CallMethodDef callMethods[] = {
     {"CfrollfunR", (DL_FUNC)&frollfunR, -1},
     {"CdllVersion", (DL_FUNC)&dllVersion, -1},
     {"CnafillR", (DL_FUNC)&nafillR, -1},
-    {"CcolnamesInt", (DL_FUNC)&colnamesInt, -1},
+    {"CcolnamesInt", (DL_FUNC)&colnamesInt, 5},
     {"CinitLastUpdated", (DL_FUNC)&initLastUpdated, -1},
     {"Ccj", (DL_FUNC)&cj, -1},
     {"Ccoalesce", (DL_FUNC)&coalesce, -1},
diff --git a/src/nafill.c b/src/nafill.c
index 4187523c58..8a27638a2a 100644
--- a/src/nafill.c
+++ b/src/nafill.c
@@ -119,7 +119,12 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
     obj = PROTECT(allocVector(VECSXP, 1)); protecti++; // wrap into list
     SET_VECTOR_ELT(obj, 0, obj1);
   }
-  SEXP ricols = PROTECT(colnamesInt(obj, cols, /* check_dups= */ ScalarLogical(TRUE), /* skip_absent= */ ScalarLogical(FALSE))); protecti++; // nafill cols=NULL which turns into seq_along(obj)
+  // Step 1: Create and protect the source string
+  SEXP context_str = PROTECT(mkString("nafill")); protecti++;
+
+  // Step 2: Call colnamesInt using that string and protect the result
+  SEXP ricols = PROTECT(colnamesInt(obj, cols, ScalarLogical(TRUE), ScalarLogical(FALSE), context_str));
+  protecti++;
   x = PROTECT(allocVector(VECSXP, length(ricols))); protecti++;
   int *icols = INTEGER(ricols);
   for (int i=0; i<length(ricols); i++) {
diff --git a/src/utils.c b/src/utils.c
index e11bc09d57..083c6cdeea 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -116,13 +116,18 @@ SEXP allNAR(SEXP x) {
  *   optionally (check_dups) check for no duplicates
  *   optionally (skip_absent) skip (return 0) for numbers outside the range or not naming extant columns
  */
-SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent) {
+SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent, SEXP context) {
   if (!isNewList(x))
     error(_("'x' argument must be data.table compatible"));
   if (!IS_TRUE_OR_FALSE(check_dups))
     error(_("%s must be TRUE or FALSE"), "check_dups");
   if (!IS_TRUE_OR_FALSE(skip_absent))
     error(_("%s must be TRUE or FALSE"), "skip_absent");
+  const char *ctx = "unknown";
+  if (context != R_NilValue && isString(context) && length(context) > 0) {
+    ctx = CHAR(STRING_ELT(context, 0));
+  }
+
   int protecti = 0;
   R_len_t nx = length(x);
   R_len_t nc = length(cols);
@@ -142,13 +147,13 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent) {
         ricols = cols;
     } else if (isReal(cols)) {
       if (!fitsInInt32(cols))
-        error(_("argument specifying columns is type 'double' and one or more items in it are not whole integers"));
+  error(_("In %s, argument specifying columns is type 'double' and one or more items in it are not whole integers"), ctx);
       ricols = PROTECT(coerceVector(cols, INTSXP)); protecti++;
     }
     int *icols = INTEGER(ricols);
     for (int i=0; i<nc; ++i) {
       if ((!bskip_absent && icols[i]>nx) || (icols[i]<1))
-        error(_("argument specifying columns received non-existing column(s): cols[%d]=%d"), i+1, icols[i]); // handles NAs also
+  error(_("In %s, argument specifying columns received non-existing column(s): cols[%d]=%d"), ctx, i+1, icols[i]);// handles NAs also
       else if(bskip_absent && icols[i]>nx)
         icols[i] = 0L;
     }
@@ -161,14 +166,14 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent) {
     if (!bskip_absent) {
       for (int i=0; i<nc; ++i) {
         if (icols[i]==0)
-          error(_("argument specifying columns received non-existing column(s): cols[%d]='%s'"), i+1, CHAR(STRING_ELT(cols, i))); // handles NAs also
+  error(_("In %s, argument specifying columns received non-existing column(s): cols[%d]='%s'"), ctx, i+1, CHAR(STRING_ELT(cols, i)));// handles NAs also
       }
     }
   } else {
-    error(_("argument specifying columns must be character or numeric"));
-  }
+  error(_("In %s, argument specifying columns must be character or numeric"), ctx);
+}
   if (LOGICAL(check_dups)[0] && any_duplicated(ricols, FALSE))
-    error(_("argument specifying columns received duplicate column(s)"));
+  error(_("In %s, argument specifying columns received duplicate column(s)"), ctx);
   UNPROTECT(protecti);
   return ricols;
 }