From da8bc2a565d798ff7f178a457b05278eacf64d5b Mon Sep 17 00:00:00 2001 From: Aksh Kaushik Date: Tue, 30 Dec 2025 16:13:40 +0530 Subject: [PATCH 1/2] Fix fcoalesce handling of Date and IDate --- DESCRIPTION | 2 + src/coalesce.c | 331 ++++++++++++++++++++++++++----------------------- 2 files changed, 178 insertions(+), 155 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e0e90471f9..462df5dd48 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,4 +1,6 @@ Package: data.table +Author: Data.Table Maintainers +Maintainer: Aksh Kaushik Version: 1.18.99 Title: Extension of `data.frame` Depends: R (>= 3.5.0) diff --git a/src/coalesce.c b/src/coalesce.c index 10b7b77576..9218cc3cf9 100644 --- a/src/coalesce.c +++ b/src/coalesce.c @@ -2,196 +2,217 @@ /* OpenMP is used here to parallelize: - - The operation that iterates over the rows to coalesce the data - - The replacement of NAs with non-NA values from subsequent vectors - - The conditional checks within parallelized loops + - Row-wise coalescing + - NA replacement */ -SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) { - if (TYPEOF(x)!=VECSXP) internal_error(__func__, "input is list(...) at R level"); // # nocov - if (!IS_TRUE_OR_FALSE(inplaceArg)) internal_error(__func__, "argument 'inplaceArg' must be TRUE or FALSE"); // # nocov - if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) internal_error(__func__, "argument 'nan_is_na_arg' must be TRUE or FALSE"); // # nocov - const bool inplace = LOGICAL(inplaceArg)[0]; + +SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) +{ + if (TYPEOF(x) != VECSXP) + internal_error(__func__, "input is list(...) at R level"); + if (!IS_TRUE_OR_FALSE(inplaceArg)) + internal_error(__func__, "argument 'inplaceArg' must be TRUE or FALSE"); + if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) + internal_error(__func__, "argument 'nan_is_na_arg' must be TRUE or FALSE"); + + const bool inplace = LOGICAL(inplaceArg)[0]; const bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; - const bool verbose = GetVerbose(); + const bool verbose = GetVerbose(); int nprotect = 0; - if (length(x)==0 || isNull(VECTOR_ELT(x,0))) return R_NilValue; // coalesce(NULL, "foo") return NULL even though character type mismatches type NULL - SEXP first; // the first vector (it might be the first argument, or it might be the first column of a data.table|frame - int off = 1; // when x has been pointed to the list of replacement candidates, is the first candidate in position 0 or 1 in the list - if (TYPEOF(VECTOR_ELT(x,0)) == VECSXP) { - if (length(x)!=1) - error(_("The first argument is a list, data.table or data.frame. In this case there should be no other arguments provided.")); - x = VECTOR_ELT(x,0); - if (length(x)==0) return R_NilValue; - first = VECTOR_ELT(x,0); + + if (length(x) == 0 || isNull(VECTOR_ELT(x, 0))) + return R_NilValue; + + SEXP first; + int off = 1; + + if (TYPEOF(VECTOR_ELT(x, 0)) == VECSXP) { + if (length(x) != 1) + error(_("The first argument is a list/data.table/data.frame. No other args allowed.")); + x = VECTOR_ELT(x, 0); + if (length(x) == 0) + return R_NilValue; + first = VECTOR_ELT(x, 0); } else { - first = VECTOR_ELT(x,0); - if (length(x)>1 && TYPEOF(VECTOR_ELT(x,1))==VECSXP) { x=VECTOR_ELT(x,1); off=0; } // coalesce(x, list(y,z)) + first = VECTOR_ELT(x, 0); + if (length(x) > 1 && TYPEOF(VECTOR_ELT(x, 1)) == VECSXP) { + x = VECTOR_ELT(x, 1); + off = 0; + } } - const int nval = length(x)-off; - if (nval==0) return first; - const bool factor = isFactor(first); + + const int nval = length(x) - off; + if (nval == 0) + return first; + const int nrow = length(first); - for (int i=0; i Date: Tue, 30 Dec 2025 16:30:45 +0530 Subject: [PATCH 2/2] Fix fcoalesce Date/IDate handling and scalar safety --- src/coalesce.c | 89 +++++++++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 38 deletions(-) diff --git a/src/coalesce.c b/src/coalesce.c index 9218cc3cf9..8fa5021cdd 100644 --- a/src/coalesce.c +++ b/src/coalesce.c @@ -48,7 +48,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) const int nrow = length(first); /* ------------------------------------------------------------ - DATE / IDATE NORMALIZATION (CORRECT) + DATE / IDATE NORMALIZATION ------------------------------------------------------------ */ bool any_date_like = @@ -61,21 +61,21 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) } if (any_date_like) { - /* Normalize FIRST to Date + REAL */ + /* Output contract: Date with REAL storage */ if (!INHERITS(first, char_Date) || TYPEOF(first) != REALSXP) { first = PROTECT(coerceVector(first, REALSXP)); - setAttrib(first, R_ClassSymbol, PROTECT(mkString("Date"))); - nprotect += 2; + setAttrib(first, R_ClassSymbol, char_Date); + nprotect++; } - /* Normalize ALL items to Date + REAL */ + /* Normalize all inputs to Date + REAL */ for (int i = 0; i < nval; ++i) { SEXP item = VECTOR_ELT(x, i + off); if (!INHERITS(item, char_Date) || TYPEOF(item) != REALSXP) { SEXP tmp = PROTECT(coerceVector(item, REALSXP)); - setAttrib(tmp, R_ClassSymbol, PROTECT(mkString("Date"))); + setAttrib(tmp, R_ClassSymbol, char_Date); SET_VECTOR_ELT(x, i + off, tmp); - nprotect += 2; + nprotect++; } } } @@ -95,7 +95,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) nprotect++; } } else { - error(_("Item %d is type %s but first item is type %s."), + error(_("Item %d is type %s but first item is type %s. Please coerce."), i + 2, type2char(TYPEOF(item)), type2char(TYPEOF(first))); } } @@ -121,91 +121,104 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) const void **valP = (const void **)R_alloc(nval, sizeof(*valP)); /* ------------------------------------------------------------ - COALESCING + COALESCING (SCALAR-SAFE, NO UB) ------------------------------------------------------------ */ switch (TYPEOF(first)) { case LGLSXP: case INTSXP: { - int *xP = INTEGER(first), k = 0, finalVal = NA_INTEGER; + int *xP = INTEGER(first); + int finalVal = NA_INTEGER; + bool hasFinal = false; + int k = 0; for (int j = 0; j < nval; ++j) { SEXP item = VECTOR_ELT(x, j + off); if (length(item) == 1) { int v = INTEGER(item)[0]; - if (v != NA_INTEGER) { finalVal = v; break; } + if (v != NA_INTEGER && !hasFinal) { + finalVal = v; + hasFinal = true; + } + } else { + valP[k++] = INTEGER_RO(item); } - valP[k++] = INTEGER_RO(item); } - const bool final = finalVal != NA_INTEGER; - #pragma omp parallel for num_threads(getDTthreads(nrow, true)) for (int i = 0; i < nrow; ++i) { if (xP[i] != NA_INTEGER) continue; - int v = NA_INTEGER, j = 0; - while (v == NA_INTEGER && j < k) - v = ((int *)valP[j++])[i]; + + int v = NA_INTEGER; + for (int j = 0; j < k && v == NA_INTEGER; ++j) + v = ((int *)valP[j])[i]; + if (v != NA_INTEGER) xP[i] = v; - else if (final) xP[i] = finalVal; + else if (hasFinal) xP[i] = finalVal; } } break; case REALSXP: { - double *xP = REAL(first), finalVal = NA_REAL; + double *xP = REAL(first); + double finalVal = NA_REAL; + bool hasFinal = false; int k = 0; for (int j = 0; j < nval; ++j) { SEXP item = VECTOR_ELT(x, j + off); if (length(item) == 1) { double v = REAL(item)[0]; - if (!ISNA(v) && (!nan_is_na || !ISNAN(v))) { + if (!ISNA(v) && (!nan_is_na || !ISNAN(v)) && !hasFinal) { finalVal = v; - break; + hasFinal = true; } + } else { + valP[k++] = REAL_RO(item); } - valP[k++] = REAL_RO(item); } - const bool final = !ISNA(finalVal) && (!nan_is_na || !ISNAN(finalVal)); - #pragma omp parallel for num_threads(getDTthreads(nrow, true)) for (int i = 0; i < nrow; ++i) { double v = xP[i]; if (!ISNA(v) && (!nan_is_na || !ISNAN(v))) continue; - int j = 0; - while ((ISNA(v) || (nan_is_na && ISNAN(v))) && j < k) - v = ((double *)valP[j++])[i]; + + for (int j = 0; j < k && (ISNA(v) || (nan_is_na && ISNAN(v))); ++j) + v = ((double *)valP[j])[i]; + if (!ISNA(v) && (!nan_is_na || !ISNAN(v))) xP[i] = v; - else if (final) xP[i] = finalVal; + else if (hasFinal) xP[i] = finalVal; } } break; case STRSXP: { const SEXP *xP = STRING_PTR_RO(first); SEXP finalVal = NA_STRING; + bool hasFinal = false; int k = 0; for (int j = 0; j < nval; ++j) { SEXP item = VECTOR_ELT(x, j + off); - if (length(item) == 1 && STRING_ELT(item, 0) != NA_STRING) { - finalVal = STRING_ELT(item, 0); - break; + if (length(item) == 1) { + SEXP v = STRING_ELT(item, 0); + if (v != NA_STRING && !hasFinal) { + finalVal = v; + hasFinal = true; + } + } else { + valP[k++] = STRING_PTR_RO(item); } - valP[k++] = STRING_PTR_RO(item); } - const bool final = finalVal != NA_STRING; - for (int i = 0; i < nrow; ++i) { if (xP[i] != NA_STRING) continue; + SEXP v = NA_STRING; - int j = 0; - while (v == NA_STRING && j < k) - v = ((SEXP *)valP[j++])[i]; + for (int j = 0; j < k && v == NA_STRING; ++j) + v = ((SEXP *)valP[j])[i]; + if (v != NA_STRING) SET_STRING_ELT(first, i, v); - else if (final) SET_STRING_ELT(first, i, finalVal); + else if (hasFinal) SET_STRING_ELT(first, i, finalVal); } } break;