diff --git a/NAMESPACE b/NAMESPACE index d712b318ab..67993db4c7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,7 +11,7 @@ export(setindex, setindexv, indices) export(as.data.table,is.data.table,test.data.table) export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%") export(timetaken) -export(truelength, setalloccol, alloc.col, ":=", let) +export(truelength, setalloccol, setallocrow, alloc.col, ":=", let) export(setattr, setnames, setcolorder, set, setDT, setDF) export(setorder, setorderv) export(setNumericRounding, getNumericRounding) @@ -28,7 +28,7 @@ export(tstrsplit) export(frank) export(frankv) export(address) -export(.SD,.N,.I,.GRP,.NGRP,.BY,.EACHI, measure, measurev, patterns) +export(.SD,.N,.I,.GRP,.NGRP,.BY,.EACHI,.ROW, measure, measurev, patterns) # TODO(#6197): Export these. # export(., J) export(rleid) diff --git a/R/data.table.R b/R/data.table.R index f05220a62b..9ba17b7c9b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -11,7 +11,7 @@ methods::setPackageName("data.table",.global) # (1) add to man/special-symbols.Rd # (2) export() in NAMESPACE # (3) add to vignettes/datatable-importing.Rmd#globals section -.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = NULL +.SD = .N = .I = .GRP = .NGRP = .BY = .EACHI = .ROW = NULL # These are exported to prevent NOTEs from R CMD check, and checkUsage via compiler. # But also exporting them makes it clear (to users and other packages) that data.table uses these as symbols. # And NULL makes it clear (to the R's mask check on loading) that they're variables not functions. @@ -1182,6 +1182,21 @@ replace_dot_alias = function(e) { names(jsub)="" jsub[[1L]]=as.name("list") } + + # Check for .ROW := NULL pattern (delete rows by reference) + if ((is.character(lhs) && length(lhs)==1L && lhs==".ROW") || + (is.name(lhs) && identical(lhs, quote(.ROW)))) { + if (is.null(jsub) || identical(jsub, quote(NULL))) { + if (is.null(irows)) + stopf(".ROW := NULL requires i= condition to specify rows to delete") + if (!missingby) + stopf(".ROW := NULL with 'by' or 'keyby' is not supported yet") + .Call(CdeleteRows, x, irows) + return(suppPrint(x)) + } else { + stopf(".ROW can only be used with := NULL to delete rows") + } + } av = all.vars(jsub,TRUE) if (!is.atomic(lhs)) stopf("LHS of := must be a symbol, or an atomic vector (column names or positions).") if (is.character(lhs)) { @@ -2719,6 +2734,10 @@ selfrefok = function(DT,verbose=getOption("datatable.verbose")) { .Call(Cselfrefokwrapper,DT,verbose) } +setallocrow = function(DT, n=0L) { + invisible(.Call(Callocrowwrapper, DT, as.integer(n))) +} + truelength = function(x) .Call(Ctruelength,x) # deliberately no "truelength<-" method. setalloccol is the mechanism for that. # settruelength() no longer need (and so removed) now that data.table depends on R 2.14.0 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c7205e52ae..8c2de805f5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21959,3 +21959,96 @@ test(2355.1, fread(txt, skip=0), data.table(V1 = c("b1", "c1"), a1 test(2355.2, fread(txt, skip=0, header=TRUE), data.table(V1 = c("b1", "c1"), a1 = c("b2", "c2"), a2 = c("b3", "c3")), warning="Added an extra default column name") test(2355.3, fread(txt, skip=0, header=FALSE), data.table(V1=character(), V2=character(), V3=character()), warning="Consider fill=TRUE") test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), V2 = c("a2", "b2", "c2"), V3 = c("", "b3", "c3"))) + +# delete rows by reference #635 +# atomic types and list columns +dt = data.table( + int = 1:5, + real = c(1.1, 2.2, 3.3, 4.4, 5.5), + char = letters[1:5], + lgl = c(TRUE, FALSE, TRUE, FALSE, TRUE), + cplx = as.complex(1:5), + raw_col = as.raw(1:5), + list_col = list(1L, 1:2, 1:3, 1:4, 1:5) +) +test(2356.01, copy(dt)[1L, .ROW := NULL], dt[-1]) +test(2356.02, copy(dt)[1, .ROW := NULL], dt[-1]) +test(2356.03, copy(dt)[c(TRUE, FALSE, FALSE, TRUE, FALSE), .ROW := NULL], dt[-c(1,4)]) +test(2356.04, copy(dt)[int==1L, .ROW := NULL], dt[-1]) +test(2356.05, copy(dt)[int<2L, .ROW := NULL], dt[-1]) +test(2356.06, copy(dt)[-1, .ROW := NULL], dt[1]) +# zero row or empty data.tables +dt = data.table() +test(2356.07, dt[logical(0), .ROW := NULL], dt) +dt = data.table(a=integer(0), b=character(0)) +test(2356.08, dt[logical(0), .ROW := NULL], dt) +# multirow +dt = data.table(a=1:5, b=letters[1:5]) +test(2356.09, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)]) +test(2356.10, copy(dt)[c(TRUE, FALSE, TRUE, FALSE, TRUE), .ROW := NULL], dt[c(2,4)]) +test(2356.11, copy(dt)[1:2, .ROW := NULL], dt[3:5]) +test(2356.12, copy(dt)[1:5, .ROW := NULL], dt[0]) +# NA handling and edges case +dt = data.table(a=1:5, b=letters[1:5]) +test(2356.13, copy(dt)[c(1L, NA_integer_, 3L), .ROW := NULL], dt[c(2,4,5)]) +test(2356.14, copy(dt)[c(NA_integer_, NA_integer_), .ROW := NULL], dt) +test(2356.15, copy(dt)[c(TRUE, NA, FALSE, NA, TRUE), .ROW := NULL], dt[c(2,3,4)]) +test(2356.16, copy(dt)[integer(0), .ROW := NULL], dt) +test(2356.17, copy(dt)[logical(0), .ROW := NULL], dt) +test(2356.18, copy(dt)[c(FALSE, FALSE, FALSE, FALSE, FALSE), .ROW := NULL], dt) +test(2356.19, copy(dt)[a > 100, .ROW := NULL], dt) # no matches +# Duplicate indices +dt = data.table(a=1:5, b=letters[1:5]) +test(2356.20, copy(dt)[c(1L, 1L), .ROW := NULL], dt[-1]) +test(2356.21, copy(dt)[c(1L, 1L, 2L, 2L), .ROW := NULL], dt[3:5]) +test(2356.22, copy(dt)[c(3L, 1L, 3L, 1L), .ROW := NULL], dt[c(2,4,5)]) +# integer64 +if (test_bit64) { + dt = data.table(a=1:5, b=as.integer64(11:15)) + test(2356.23, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)]) + test(2356.24, copy(dt)[1:5, .ROW := NULL], data.table(a=integer(0), b=integer64(0))) +} +# Date/IDate/ITime columns +dt = data.table(a=1:5, d=as.Date("2024-01-01") + 0:4, t=as.ITime(paste0(10:14, ":00:00")), dt=as.POSIXct("2024-01-01 12:00:00") + 3600*0:4) +test(2356.25, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)]) +test(2356.26, copy(dt)[c(2L, 4L), .ROW := NULL]$d, as.Date("2024-01-01") + c(0,2,4)) +# Factor columns +dt = data.table(a=1:5, f=factor(letters[1:5], levels=letters[1:10])) +test(2356.27, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)]) +test(2356.28, levels(copy(dt)[c(1L, 3L), .ROW := NULL]$f), letters[1:10]) +dt = data.table(a=1:5, of=ordered(letters[1:5], levels=letters[5:1])) +test(2356.29, copy(dt)[c(2L, 4L), .ROW := NULL], dt[-c(2L,4L)]) +test(2356.30, is.ordered(copy(dt)[c(2, 4L), .ROW := NULL]$of)) +# Keys - should be cleared after deletion +dt = data.table(a=5:1, b=letters[1:5], key="a") +test(2356.31, key(copy(dt)[1L, .ROW := NULL]), NULL) +test(2356.32, haskey(copy(dt)[1L, .ROW := NULL]), FALSE) +# Indices - should be cleared after deletion +dt = data.table(a=1:5, b=letters[1:5], c=5:1) +setindex(dt, b) +test(2356.33, indices(copy(dt)[1L, .ROW := NULL]), NULL) +# row names +dt = data.table(a=1:5, b=letters[1:5]) +test(2356.34, attr(copy(dt)[c(1L, 3L), .ROW := NULL], "row.names"), 1:3) +# selfref check +test(2356.35, selfrefok(copy(dt)[1L, .ROW := NULL]), 1L) +# errors +dt = data.table(a=1:4, g=1:2) +test(2356.36, dt[1L, .ROW := 1L], error=".ROW can only be used with := NULL") +test(2356.37, dt[1L, .ROW := "delete"], error=".ROW can only be used with := NULL") +test(2356.38, dt[1L, .ROW := FALSE], error=".ROW can only be used with := NULL") +test(2356.39, dt[, .ROW := NULL], error=".ROW := NULL requires i= condition") +test(2356.40, dt[1L, .ROW := NULL, by=g], error=".ROW := NULL with 'by' or 'keyby' is not supported") +# large table +dt = data.table(a=1:20000, b=rep(letters, length.out=20000)) +idx = seq(1L, 20000L, by=2L) +test(2356.41, copy(dt)[idx, .ROW := NULL], dt[-idx]) +# Chaining and complexer i expressions +dt = data.table(a=1:10, b=letters[1:10]) +test(2356.42, copy(dt)[a>2, .ROW := NULL][b=="a"], data.table(a=1L, b="a")) +test(2356.43, copy(dt)[a %% 2 == 0, .ROW := NULL], dt[a %% 2 != 0]) +test(2356.44, copy(dt)[!(a < 5 & b != "d"), .ROW := NULL], dt[1:3]) +# make columns resizable +dt = data.table(a=1:3) +test(2356.91, truelength(dt$a), 0L) +test(2356.92, {setallocrow(dt); truelength(dt$a)}, 3L) diff --git a/man/assign.Rd b/man/assign.Rd index b0c038349a..17a9af374a 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -56,12 +56,15 @@ set(x, i = NULL, j, value) DT[i, colC := mean(colB), by = colA] # update (or add) column called "colC" by reference by group. A major feature of `:=`. DT[,`:=`(new1 = sum(colB), new2 = sum(colC))] # Functional form DT[, let(new1 = sum(colB), new2 = sum(colC))] # New alias for functional form. + DT[i, .ROW := NULL] # delete rows by reference. } The \code{\link{.Last.updated}} variable contains the number of rows updated by the most recent \code{:=} or \code{set} calls, which may be useful, for example, in production settings for testing assumptions about the number of rows affected by a statement; see \code{\link{.Last.updated}} for details. Note that for efficiency no check is performed for duplicate assignments, i.e. if multiple values are passed for assignment to the same index, assignment to this index will occur repeatedly and sequentially; for a given use case, consider whether it makes sense to create your own test for duplicates, e.g. in production code. +Note that \code{.ROW := NULL} is a special case used to delete rows by reference. Unlike column assignment, this requires an \code{i} expression to specify which rows to delete, and does not support \code{by} or \code{keyby}. See \code{\link{.ROW}} or \code{\link{special-symbols}} for details. + All of the following result in a friendly error (by design) : \preformatted{ @@ -158,6 +161,13 @@ set(DT, j = c("b", "d"), value = list(200L, 300L)) ## Set values for multiple columns with multiple specified rows. set(DT, c(1L, 3L), c("b", "d"), value = list(500L, 800L)) +# Delete rows by reference +DT = data.table(a=1:10, b=letters[1:10]) +DT[c(2,4,6), .ROW := NULL] # delete rows 2, 4, and 6 +DT +DT[a>5, .ROW := NULL] # delete rows where a>5 +DT + \dontrun{ # Speed example: diff --git a/man/special-symbols.Rd b/man/special-symbols.Rd index 64e251e79d..57df26ac7c 100644 --- a/man/special-symbols.Rd +++ b/man/special-symbols.Rd @@ -9,10 +9,12 @@ \alias{.EACHI} \alias{.NGRP} \alias{.NATURAL} +\alias{.ROW} \title{ Special symbols } \description{ \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. \code{.I} can be used in \code{by} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. \code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}, \code{.NATURAL} is a symbol passed to \code{on}; i.e. \code{on=.NATURAL} + \code{.ROW} is a symbol used with \code{:= NULL} to delete rows by reference; i.e. \code{DT[i, .ROW := NULL]} deletes the rows selected by \code{i}. } \details{ The bindings of these variables are locked and attempting to assign to them will generate an error. If you wish to manipulate \code{.SD} before returning it, take a \code{\link{copy}(.SD)} first (see FAQ 4.5). Using \code{:=} in the \code{j} of \code{.SD} is reserved for future use as a (tortuously) flexible way to update \code{DT} by reference by group (even when groups are not contiguous in an ad hoc by). @@ -32,6 +34,8 @@ \code{.NATURAL} is defined as \code{NULL} but its value is not used. Its usage is \code{on=.NATURAL} (alternative of \code{X[on=Y]}) which joins two tables on their common column names, performing a natural join; see \code{\link{data.table}}'s \code{on} argument for more details. + \code{.ROW} is a symbol that can only be used with \code{:= NULL} to delete rows by reference. When you use \code{DT[i, .ROW := NULL]}, the rows matching the \code{i} expression are removed from \code{DT} in-place. This is an efficient way to delete rows without copying the entire data.table. The \code{i} argument is required and \code{by}/\code{keyby} are not supported. After deletion, any keys and indices on \code{DT} are cleared. See \code{\link{:=}} for more on reference semantics. + Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples. Note also that you should consider these symbols read-only and of limited scope -- internal data.table code might manipulate them in unexpected ways, and as such their bindings are locked. There are subtle ways to wind up with the wrong object, especially when attempting to copy their values outside a grouping context. See examples; when in doubt, \code{copy()} is your friend. @@ -72,5 +76,12 @@ DT[, .(min(.SD[,-1])), by=.I] # Do not expect this to correctly append the value of .BY in each group; copy(.BY) will work. by_tracker = list() DT[, { append(by_tracker, .BY); sum(v) }, by=x] + +# .ROW to delete rows by reference +DT = data.table(a=1:5, b=letters[1:5]) +DT[c(2,4), .ROW := NULL] +DT +DT[a>2, .ROW := NULL] +DT } \keyword{ data } diff --git a/man/truelength.Rd b/man/truelength.Rd index a85f78b1b6..73288f05d8 100644 --- a/man/truelength.Rd +++ b/man/truelength.Rd @@ -2,6 +2,7 @@ \alias{truelength} \alias{setalloccol} \alias{alloc.col} +\alias{setallocrow} \title{ Over-allocation access } \description{ These functions are experimental and somewhat advanced. By \emph{experimental} we mean their names might change and perhaps the syntax, argument names and types. So if you write a lot of code using them, you have been warned! They should work and be stable, though, so please report problems with them. \code{alloc.col} is just an alias to \code{setalloccol}. We recommend to use \code{setalloccol} (though \code{alloc.col} will continue to be supported) because the \code{set*} prefix in \code{setalloccol} makes it clear that its input argument is modified in-place. @@ -14,11 +15,14 @@ setalloccol(DT, alloc.col(DT, n = getOption("datatable.alloccol"), # default: 1024L verbose = getOption("datatable.verbose")) # default: FALSE +setallocrow(DT, n = 0L) } \arguments{ \item{x}{ Any type of vector, including \code{data.table} which is a \code{list} vector of column pointers. } \item{DT}{ A \code{data.table}. } -\item{n}{ The number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. } +\item{n}{ For \code{setalloccol} and \code{alloc.col}: the number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. + + For \code{setallocrow}: the number of rows to over-allocate. If \code{n > 0}, allocates capacity for current rows plus \code{n} additional rows. If \code{n == 0} (default), shrinks columns to exact current size to free excess memory. } \item{verbose}{ Output status and information. } } \details{ @@ -34,6 +38,12 @@ alloc.col(DT, (perhaps in your .Rprofile); e.g., \code{options(datatable.alloccol=10000L)}. Please note: over-allocation of the column pointer vector is not for efficiency \emph{per se}; it is so that \code{:=} can add columns by reference without a shallow copy. + + \code{setallocrow} is a utility function that prepares columns for fast row operations (delete or insert (not implemented yet)) by reference and manages row capacity. + Before deleting or inserting rows by reference, columns must be resizable. + \code{setallocrow} ensures all columns are in the appropriate state by converting ALTREP columns to materialized form and reallocating + columns to have the target capacity. When \code{n > 0}, columns are over-allocated with extra capacity for future row additions. + When \code{n == 0}, columns are shrunk to exact size to free unused memory. This operation modifies \code{DT} by reference. } \value{ \code{truelength(x)} returns the length of the vector allocated in memory. \code{length(x)} of those items are in use. Currently, it is just the list vector of column @@ -43,6 +53,8 @@ alloc.col(DT, \code{setalloccol} \emph{reallocates} \code{DT} by reference. This may be useful for efficiency if you know you are about to going to add a lot of columns in a loop. It also returns the new \code{DT}, for convenience in compound queries. + + \code{setallocrow} modifies \code{DT} by reference to ensure all columns are resizable. } \seealso{ \code{\link{copy}} } \examples{ diff --git a/src/assign.c b/src/assign.c index 849cb08f2a..4d262ca86f 100644 --- a/src/assign.c +++ b/src/assign.c @@ -496,7 +496,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d ALTREP==%d, but then is being plonked. length(values)==%d; length(cols)==%d\n"), i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), ALTREP(thisvalue), length(values), length(cols)); } - thisvalue = copyAsPlain(thisvalue); // PROTECT not needed as assigned as element to protected list below. + thisvalue = copyAsPlain(thisvalue, -1); // PROTECT not needed as assigned as element to protected list below. } else { if (verbose) Rprintf(_("Direct plonk of unnamed RHS, no copy. MAYBE_REFERENCED==%d, MAYBE_SHARED==%d\n"), MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue)); // e.g. DT[,a:=as.character(a)] as tested by 754.5 } diff --git a/src/coalesce.c b/src/coalesce.c index 10b7b77576..cd07581093 100644 --- a/src/coalesce.c +++ b/src/coalesce.c @@ -52,7 +52,7 @@ SEXP coalesce(SEXP x, SEXP inplaceArg, SEXP nan_is_na_arg) { error(_("Item %d is length %d but the first item is length %d. Only singletons are recycled."), i+2, length(item), nrow); } if (!inplace) { - first = PROTECT(copyAsPlain(first)); nprotect++; + first = PROTECT(copyAsPlain(first, -1)); nprotect++; if (verbose) Rprintf(_("coalesce copied first item (inplace=FALSE)\n")); } const void **valP = (const void **)R_alloc(nval, sizeof(*valP)); diff --git a/src/data.table.h b/src/data.table.h index 434d0a340a..79f27e8b8c 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -183,6 +183,10 @@ void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA); SEXP subsetVector(SEXP x, SEXP idx); const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_out); +// deleterows.c +SEXP deleteRows(SEXP dt, SEXP rows_to_delete); +SEXP allocrow(SEXP dt, R_xlen_t n); + // fcast.c SEXP int_vec_init(R_len_t n, int val); @@ -321,7 +325,7 @@ SEXP fitsInInt64R(SEXP x); bool allNA(SEXP x, bool errorForBadType); SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent); bool INHERITS(SEXP x, SEXP char_); -SEXP copyAsPlain(SEXP x); +SEXP copyAsPlain(SEXP x, R_xlen_t overalloc); void copySharedColumns(SEXP x); SEXP lock(SEXP x); SEXP unlock(SEXP x); @@ -394,6 +398,7 @@ SEXP assign(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP copy(SEXP); SEXP setdt_nrows(SEXP); SEXP alloccolwrapper(SEXP, SEXP, SEXP); +SEXP allocrowwrapper(SEXP, SEXP); SEXP selfrefokwrapper(SEXP, SEXP); SEXP truelength(SEXP); SEXP setcharvec(SEXP, SEXP, SEXP); diff --git a/src/deleterows.c b/src/deleterows.c new file mode 100644 index 0000000000..31192e8095 --- /dev/null +++ b/src/deleterows.c @@ -0,0 +1,231 @@ +#include "data.table.h" + +static void computePrefixSum(const int *keep, int *dest, R_xlen_t n, int nthreads); +static void compactVectorRaw(SEXP col, const int *dest, const int *keep, R_xlen_t new_nrow, R_xlen_t old_nrow); + +// Helper function to make columns resizable for delete by reference +SEXP allocrow(SEXP dt, R_xlen_t n) { + if (!INHERITS(dt, char_datatable)) + error(_("input to allocrow is not a data.table")); + + if (n < 0) + error(_("n must be non-negative in allocrow")); + + if (!xlength(dt)) return dt; // zero-column data.table + + const bool verbose = GetVerbose(); + int n_modified = 0; + + for (R_xlen_t i = 0; i < length(dt); i++) { + SEXP col = VECTOR_ELT(dt, i); + if (!isVector(col)) + error("Cannot make non-vector column %lld resizable", (long long)(i + 1)); + + const R_xlen_t currentLength = length(col); + const R_xlen_t currentCapacity = R_isResizable(col) ? R_maxLength(col) : currentLength; + const R_xlen_t targetCapacity = currentLength + n; + + // Only reallocate if not resizable, or capacity differs from target + if (!R_isResizable(col) || currentCapacity != targetCapacity) { + SEXP newcol = PROTECT(copyAsPlain(col, n)); + SET_VECTOR_ELT(dt, i, newcol); + UNPROTECT(1); + n_modified++; + } + } + + if (verbose) { + if (n_modified > 0) { + if (n > 0) { + Rprintf(Pl_(n_modified, + "Modified %d column (overallocated %lld rows)\n", + "Modified %d columns (overallocated %lld rows)\n"), + n_modified, (long long)n); + } else { + Rprintf(Pl_(n_modified, + "Modified %d column (shrunk to exact size)\n", + "Modified %d columns (shrunk to exact size)\n"), + n_modified); + } + } else { + Rprintf(_("allocrow had no effect, all columns already at target size\n")); + } + } + + return dt; +} + +SEXP deleteRows(SEXP dt, SEXP rows_to_delete) { + if (!isNewList(dt)) + error("Internal error: deleteRows received non-list dt"); // #nocov + if (!xlength(dt)) return dt; // zero-column data.table + + const R_xlen_t ncol = length(dt); + const R_xlen_t old_nrow = length(VECTOR_ELT(dt, 0)); + int nprotect = 0; + + if (old_nrow == 0) return dt; + + if (!isInteger(rows_to_delete) && !isLogical(rows_to_delete)) + internal_error(__func__, "rows_to_delete must be logical, integer, or numeric"); // #nocov + + int *keep = (int *)R_alloc(old_nrow, sizeof(int)); + const R_xlen_t n = length(rows_to_delete); + for (R_xlen_t i = 0; i < old_nrow; i++) keep[i] = 1; + int *idx = INTEGER(rows_to_delete); + for (R_xlen_t j = 0; j < n; j++) { + if (idx[j] == NA_INTEGER) continue; + // should be checked from irows in [ + if (idx[j] < 1 || idx[j] > old_nrow) internal_error(__func__, "Row index %d out of range [1, %lld]", idx[j], (long long)old_nrow); //# nocov + keep[idx[j] - 1] = 0; + } + + R_xlen_t new_nrow = 0; + for (R_xlen_t i = 0; i < old_nrow; i++) new_nrow += keep[i]; + if (new_nrow == old_nrow) return dt; + + int *dest = (int *)R_alloc(old_nrow, sizeof(int)); + const int nthreads = getDTthreads(old_nrow, true); + computePrefixSum(keep, dest, old_nrow, nthreads); + + // Compact each column + for (R_xlen_t j = 0; j < ncol; j++) { + SEXP col = VECTOR_ELT(dt, j); + if (!R_isResizable(col)) { + // catered for ALTREP above + SEXP newcol = PROTECT(copyAsPlain(col, 0)); nprotect++; + SET_VECTOR_ELT(dt, j, newcol); + col = newcol; + } + compactVectorRaw(col, dest, keep, new_nrow, old_nrow); + R_resizeVector(col, new_nrow); + SET_VECTOR_ELT(dt, j, col); + } + + SEXP rownames = PROTECT(getAttrib(dt, R_RowNamesSymbol)); nprotect++; + if (!isNull(rownames)) { + // create them from scratch like in dogroups or subset to avoid R internal issues + SEXP rn = PROTECT(allocVector(INTSXP, 2)); nprotect++; + INTEGER(rn)[0] = NA_INTEGER; + INTEGER(rn)[1] = -(int)new_nrow; + setAttrib(dt, R_RowNamesSymbol, rn); + } + + // Clear key and indices + setAttrib(dt, install("sorted"), R_NilValue); + setAttrib(dt, install("index"), R_NilValue); + + UNPROTECT(nprotect); + return dt; +} + +// Parallel prefix sum (exclusive scan) +// Two-pass algorithm: first count per thread, then scan, then local prefix sum +static void computePrefixSum(const int *keep, int *dest, R_xlen_t n, int nthreads) { + if (nthreads == 1) { + // Sequential version + int sum = 0; + for (R_xlen_t i = 0; i < n; i++) { + dest[i] = sum; + sum += keep[i]; + } + return; + } + + // Parallel version with two passes + int *thread_counts = (int *)R_alloc(nthreads, sizeof(int)); + + // Pass 1: Count keeps per thread + #pragma omp parallel num_threads(nthreads) + { + const int tid = omp_get_thread_num(); + const R_xlen_t chunk_size = (n + nthreads - 1) / nthreads; + const R_xlen_t start = tid * chunk_size; + const R_xlen_t end = (start + chunk_size > n) ? n : start + chunk_size; + + int local_count = 0; + for (R_xlen_t i = start; i < end; i++) { + local_count += keep[i]; + } + thread_counts[tid] = local_count; + } + + // Sequential scan of thread counts to get offsets + int *thread_offsets = (int *)R_alloc(nthreads, sizeof(int)); + thread_offsets[0] = 0; + for (int t = 1; t < nthreads; t++) { + thread_offsets[t] = thread_offsets[t-1] + thread_counts[t-1]; + } + + // Pass 2: Compute local prefix sum with offset + #pragma omp parallel num_threads(nthreads) + { + const int tid = omp_get_thread_num(); + const R_xlen_t chunk_size = (n + nthreads - 1) / nthreads; + const R_xlen_t start = tid * chunk_size; + const R_xlen_t end = (start + chunk_size > n) ? n : start + chunk_size; + + int local_sum = thread_offsets[tid]; + for (R_xlen_t i = start; i < end; i++) { + dest[i] = local_sum; + local_sum += keep[i]; + } + } +} + +#define COMPACT(CTYPE, ACCESSOR) { \ + CTYPE *p = ACCESSOR(col); \ + R_xlen_t i = 0; \ + while (i < old_nrow) { \ + if (!keep[i]) { \ + i++; \ + continue; \ + } \ + R_xlen_t run_start = i; \ + int target_idx = dest[i]; \ + while (i < old_nrow && keep[i]) i++; \ + size_t run_len = i - run_start; \ + if (target_idx != run_start) { \ + memmove(p + target_idx, p + run_start, run_len * sizeof(CTYPE)); \ + } \ + } \ +} + + +// Type-specific stream compaction +static void compactVectorRaw(SEXP col, const int *dest, const int *keep, + R_xlen_t new_nrow, R_xlen_t old_nrow) { + switch(TYPEOF(col)) { + case INTSXP: + case LGLSXP: { + COMPACT(int, INTEGER); + break; + } + case REALSXP: { + COMPACT(double, REAL); + break; + } + case CPLXSXP: { + COMPACT(Rcomplex, COMPLEX); + break; + } + case RAWSXP: { + COMPACT(Rbyte, RAW); + break; + } + case STRSXP: { + for (R_xlen_t i = 0; i < old_nrow; i++) { + if (keep[i]) SET_STRING_ELT(col, dest[i], STRING_ELT(col, i)); + } + break; + } + case VECSXP: { + for (R_xlen_t i = 0; i < old_nrow; i++) { + if (keep[i]) SET_VECTOR_ELT(col, dest[i], VECTOR_ELT(col, i)); + } + break; + } + default: + error("Unsupported column type %s", type2char(TYPEOF(col))); + } +} diff --git a/src/dogroups.c b/src/dogroups.c index 5200d33f36..b10f8398f9 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -334,7 +334,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX target = VECTOR_ELT(dt, colj); bool copied = false; if (isNewList(target) && anySpecialStatic(RHS, specials)) { // see comments in anySpecialStatic() - RHS = PROTECT(copyAsPlain(RHS)); + RHS = PROTECT(copyAsPlain(RHS, -1)); copied = true; } const char *warn = memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS, 0, -1, 0, ""); @@ -440,7 +440,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } bool copied = false; if (isNewList(target) && anySpecialStatic(source, specials)) { // see comments in anySpecialStatic() - source = PROTECT(copyAsPlain(source)); + source = PROTECT(copyAsPlain(source, -1)); copied = true; } memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, ""); diff --git a/src/fmelt.c b/src/fmelt.c index d6843c3ace..98de8ff6e7 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -811,7 +811,7 @@ SEXP fmelt(SEXP DT, SEXP id, SEXP measure, SEXP varfactor, SEXP valfactor, SEXP // edge case no measure.vars if (!data.lmax) { SEXP tt = PROTECT(shallowwrapper(DT, data.idcols)); protecti++; - ans = PROTECT(copyAsPlain(tt)); protecti++; + ans = PROTECT(copyAsPlain(tt, -1)); protecti++; } else { ansvals = PROTECT(getvaluecols(DT, dtnames, LOGICAL(valfactor)[0], verbose, &data)); protecti++; ansvars = PROTECT(getvarcols(DT, dtnames, LOGICAL(varfactor)[0], verbose, &data)); protecti++; diff --git a/src/init.c b/src/init.c index 13421998b4..69ae5a28fa 100644 --- a/src/init.c +++ b/src/init.c @@ -95,6 +95,8 @@ static const R_CallMethodDef callMethods[] = { {"CconvertNegAndZeroIdx", (DL_FUNC)&convertNegAndZeroIdx, -1}, {"Cfrank", (DL_FUNC)&frank, -1}, {"Cdt_na", (DL_FUNC)&dt_na, -1}, + {"CdeleteRows", (DL_FUNC)&deleteRows, 2}, + {"Callocrowwrapper", (DL_FUNC)&allocrowwrapper, 2}, {"Clookup", (DL_FUNC)&lookup, -1}, {"Coverlaps", (DL_FUNC)&overlaps, -1}, {"Cwhichwrapper", (DL_FUNC)&whichwrapper, -1}, diff --git a/src/reorder.c b/src/reorder.c index 8fe682e861..61a65b20dc 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -24,7 +24,7 @@ SEXP reorder(SEXP x, SEXP order) error(_("Column %d is length %d which differs from length of column 1 (%d). Invalid data.table."), i+1, length(v), nrow); if (RTYPE_SIZEOF(v) > maxSize) maxSize=RTYPE_SIZEOF(v); - if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v)); + if (ALTREP(v)) SET_VECTOR_ELT(x, i, copyAsPlain(v, -1)); } copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768 } else { @@ -40,7 +40,7 @@ SEXP reorder(SEXP x, SEXP order) if (length(order) != nrow) error("nrow(x)[%d]!=length(order)[%d]", nrow, length(order)); // # notranslate int nprotect = 0; - if (ALTREP(order)) { order=PROTECT(copyAsPlain(order)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand + if (ALTREP(order)) { order=PROTECT(copyAsPlain(order, -1)); nprotect++; } // TODO: if it's an ALTREP sequence some optimizations are possible rather than expand const int *restrict idx = INTEGER_RO(order); int i=0; diff --git a/src/subset.c b/src/subset.c index b3372f057b..e7b0eeefb1 100644 --- a/src/subset.c +++ b/src/subset.c @@ -313,7 +313,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) { // API change needs update NEWS.md for (int i=0; i= 0: resizable vector with capacity = length(x) + overalloc if (isNull(x)) { // deal with up front because isNewList(R_NilValue) is true @@ -227,7 +231,14 @@ SEXP copyAsPlain(SEXP x) { return duplicate(x); } const int64_t n = XLENGTH(x); - SEXP ans = PROTECT(allocVector(TYPEOF(x), n)); + SEXP ans; + if (overalloc == -1) { + ans = PROTECT(allocVector(TYPEOF(x), n)); + } else { + const R_xlen_t capacity = n + overalloc; + ans = PROTECT(R_allocResizableVector(TYPEOF(x), capacity)); + R_resizeVector(ans, n); + } // aside: unlike R's duplicate we do not copy truelength here; important for dogroups.c which uses negative truelenth to mark its specials if (ALTREP(ans)) internal_error(__func__, "copyAsPlain returning ALTREP for type '%s'", type2char(TYPEOF(x))); // # nocov @@ -258,7 +269,7 @@ SEXP copyAsPlain(SEXP x) { } break; case VECSXP: { const SEXP *xp=SEXPPTR_RO(x); - for (int64_t i=0; i