-
Notifications
You must be signed in to change notification settings - Fork 1k
delete rows by reference #7536
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
delete rows by reference #7536
Changes from all commits
9a278cd
6d2182a
b9e76eb
703456a
6f4867c
67d6505
4888272
8d5e869
4936003
e026e8d
1309d52
57a4bdc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21959,3 +21959,96 @@ test(2355.1, fread(txt, skip=0), data.table(V1 = c("b1", "c1"), a1 | |
| test(2355.2, fread(txt, skip=0, header=TRUE), data.table(V1 = c("b1", "c1"), a1 = c("b2", "c2"), a2 = c("b3", "c3")), warning="Added an extra default column name") | ||
| test(2355.3, fread(txt, skip=0, header=FALSE), data.table(V1=character(), V2=character(), V3=character()), warning="Consider fill=TRUE") | ||
| test(2355.4, fread(txt, skip=0, fill=TRUE), data.table(V1 = c("a1", "b1", "c1"), V2 = c("a2", "b2", "c2"), V3 = c("", "b3", "c3"))) | ||
|
|
||
| # delete rows by reference #635 | ||
| # atomic types and list columns | ||
| dt = data.table( | ||
| int = 1:5, | ||
| real = c(1.1, 2.2, 3.3, 4.4, 5.5), | ||
| char = letters[1:5], | ||
| lgl = c(TRUE, FALSE, TRUE, FALSE, TRUE), | ||
| cplx = as.complex(1:5), | ||
| raw_col = as.raw(1:5), | ||
| list_col = list(1L, 1:2, 1:3, 1:4, 1:5) | ||
| ) | ||
| test(2356.01, copy(dt)[1L, .ROW := NULL], dt[-1]) | ||
| test(2356.02, copy(dt)[1, .ROW := NULL], dt[-1]) | ||
| test(2356.03, copy(dt)[c(TRUE, FALSE, FALSE, TRUE, FALSE), .ROW := NULL], dt[-c(1,4)]) | ||
| test(2356.04, copy(dt)[int==1L, .ROW := NULL], dt[-1]) | ||
| test(2356.05, copy(dt)[int<2L, .ROW := NULL], dt[-1]) | ||
| test(2356.06, copy(dt)[-1, .ROW := NULL], dt[1]) | ||
| # zero row or empty data.tables | ||
| dt = data.table() | ||
| test(2356.07, dt[logical(0), .ROW := NULL], dt) | ||
| dt = data.table(a=integer(0), b=character(0)) | ||
| test(2356.08, dt[logical(0), .ROW := NULL], dt) | ||
| # multirow | ||
| dt = data.table(a=1:5, b=letters[1:5]) | ||
| test(2356.09, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)]) | ||
| test(2356.10, copy(dt)[c(TRUE, FALSE, TRUE, FALSE, TRUE), .ROW := NULL], dt[c(2,4)]) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this looks the same as 2356.03? |
||
| test(2356.11, copy(dt)[1:2, .ROW := NULL], dt[3:5]) | ||
| test(2356.12, copy(dt)[1:5, .ROW := NULL], dt[0]) | ||
| # NA handling and edge cases | ||
| dt = data.table(a=1:5, b=letters[1:5]) | ||
| test(2356.13, copy(dt)[c(1L, NA_integer_, 3L), .ROW := NULL], dt[c(2,4,5)]) | ||
| test(2356.14, copy(dt)[c(NA_integer_, NA_integer_), .ROW := NULL], dt) | ||
| test(2356.15, copy(dt)[c(TRUE, NA, FALSE, NA, TRUE), .ROW := NULL], dt[c(2,3,4)]) | ||
| test(2356.16, copy(dt)[integer(0), .ROW := NULL], dt) | ||
| test(2356.17, copy(dt)[logical(0), .ROW := NULL], dt) | ||
| test(2356.18, copy(dt)[c(FALSE, FALSE, FALSE, FALSE, FALSE), .ROW := NULL], dt) | ||
| test(2356.19, copy(dt)[a > 100, .ROW := NULL], dt) # no matches | ||
| # Duplicate indices | ||
| dt = data.table(a=1:5, b=letters[1:5]) | ||
| test(2356.20, copy(dt)[c(1L, 1L), .ROW := NULL], dt[-1]) | ||
| test(2356.21, copy(dt)[c(1L, 1L, 2L, 2L), .ROW := NULL], dt[3:5]) | ||
| test(2356.22, copy(dt)[c(3L, 1L, 3L, 1L), .ROW := NULL], dt[c(2,4,5)]) | ||
| # integer64 | ||
| if (test_bit64) { | ||
| dt = data.table(a=1:5, b=as.integer64(11:15)) | ||
| test(2356.23, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)]) | ||
| test(2356.24, copy(dt)[1:5, .ROW := NULL], data.table(a=integer(0), b=integer64(0))) | ||
| } | ||
| # Date/IDate/ITime columns | ||
| dt = data.table(a=1:5, d=as.Date("2024-01-01") + 0:4, t=as.ITime(paste0(10:14, ":00:00")), dt=as.POSIXct("2024-01-01 12:00:00") + 3600*0:4) | ||
| test(2356.25, copy(dt)[c(1L, 3L), .ROW := NULL], dt[c(2,4,5)]) | ||
| test(2356.26, copy(dt)[c(2L, 4L), .ROW := NULL]$d, as.Date("2024-01-01") + c(0,2,4)) | ||
| # Factor columns | ||
| dt = data.table(a=1:5, f=factor(letters[1:5], levels=letters[1:10])) | ||
| test(2356.27, copy(dt)[c(1L, 3L), .ROW := NULL], dt[-c(1L,3L)]) | ||
| test(2356.28, levels(copy(dt)[c(1L, 3L), .ROW := NULL]$f), letters[1:10]) | ||
| dt = data.table(a=1:5, of=ordered(letters[1:5], levels=letters[5:1])) | ||
| test(2356.29, copy(dt)[c(2L, 4L), .ROW := NULL], dt[-c(2L,4L)]) | ||
| test(2356.30, is.ordered(copy(dt)[c(2, 4L), .ROW := NULL]$of)) | ||
| # Keys - should be cleared after deletion | ||
| dt = data.table(a=5:1, b=letters[1:5], key="a") | ||
| test(2356.31, key(copy(dt)[1L, .ROW := NULL]), NULL) | ||
| test(2356.32, haskey(copy(dt)[1L, .ROW := NULL]), FALSE) | ||
| # Indices - should be cleared after deletion | ||
| dt = data.table(a=1:5, b=letters[1:5], c=5:1) | ||
| setindex(dt, b) | ||
| test(2356.33, indices(copy(dt)[1L, .ROW := NULL]), NULL) | ||
| # row names | ||
| dt = data.table(a=1:5, b=letters[1:5]) | ||
| test(2356.34, attr(copy(dt)[c(1L, 3L), .ROW := NULL], "row.names"), 1:3) | ||
| # selfref check | ||
| test(2356.35, selfrefok(copy(dt)[1L, .ROW := NULL]), 1L) | ||
| # errors | ||
| dt = data.table(a=1:4, g=1:2) | ||
| test(2356.36, dt[1L, .ROW := 1L], error=".ROW can only be used with := NULL") | ||
| test(2356.37, dt[1L, .ROW := "delete"], error=".ROW can only be used with := NULL") | ||
| test(2356.38, dt[1L, .ROW := FALSE], error=".ROW can only be used with := NULL") | ||
| test(2356.39, dt[, .ROW := NULL], error=".ROW := NULL requires i= condition") | ||
| test(2356.40, dt[1L, .ROW := NULL, by=g], error=".ROW := NULL with 'by' or 'keyby' is not supported") | ||
| # large table | ||
| dt = data.table(a=1:20000, b=rep(letters, length.out=20000)) | ||
| idx = seq(1L, 20000L, by=2L) | ||
| test(2356.41, copy(dt)[idx, .ROW := NULL], dt[-idx]) | ||
| # Chaining and more complex i expressions | ||
| dt = data.table(a=1:10, b=letters[1:10]) | ||
| test(2356.42, copy(dt)[a>2, .ROW := NULL][b=="a"], data.table(a=1L, b="a")) | ||
| test(2356.43, copy(dt)[a %% 2 == 0, .ROW := NULL], dt[a %% 2 != 0]) | ||
| test(2356.44, copy(dt)[!(a < 5 & b != "d"), .ROW := NULL], dt[1:3]) | ||
| # make columns resizable | ||
| dt = data.table(a=1:3) | ||
| test(2356.91, truelength(dt$a), 0L) | ||
| test(2356.92, {setallocrow(dt); truelength(dt$a)}, 3L) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,12 +56,15 @@ set(x, i = NULL, j, value) | |
| DT[i, colC := mean(colB), by = colA] # update (or add) column called "colC" by reference by group. A major feature of `:=`. | ||
| DT[,`:=`(new1 = sum(colB), new2 = sum(colC))] # Functional form | ||
| DT[, let(new1 = sum(colB), new2 = sum(colC))] # New alias for functional form. | ||
| DT[i, .ROW := NULL] # delete rows by reference. | ||
| } | ||
|
|
||
| The \code{\link{.Last.updated}} variable contains the number of rows updated by the most recent \code{:=} or \code{set} calls, which may be useful, for example, in production settings for testing assumptions about the number of rows affected by a statement; see \code{\link{.Last.updated}} for details. | ||
|
|
||
| Note that for efficiency no check is performed for duplicate assignments, i.e. if multiple values are passed for assignment to the same index, assignment to this index will occur repeatedly and sequentially; for a given use case, consider whether it makes sense to create your own test for duplicates, e.g. in production code. | ||
|
|
||
| Note that \code{.ROW := NULL} is a special case used to delete rows by reference. Unlike column assignment, this requires an \code{i} expression to specify which rows to delete, and does not support \code{by} or \code{keyby}. See \code{\link{.ROW}} or \code{\link{special-symbols}} for details. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. aside for future consideration -- another use case would be
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Oh, I misunderstood |
||
|
|
||
| All of the following result in a friendly error (by design) : | ||
|
|
||
| \preformatted{ | ||
|
|
@@ -158,6 +161,13 @@ set(DT, j = c("b", "d"), value = list(200L, 300L)) | |
| ## Set values for multiple columns with multiple specified rows. | ||
| set(DT, c(1L, 3L), c("b", "d"), value = list(500L, 800L)) | ||
|
|
||
| # Delete rows by reference | ||
| DT = data.table(a=1:10, b=letters[1:10]) | ||
| DT[c(2,4,6), .ROW := NULL] # delete rows 2, 4, and 6 | ||
| DT | ||
| DT[a>5, .ROW := NULL] # delete rows where a>5 | ||
| DT | ||
|
|
||
| \dontrun{ | ||
| # Speed example: | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,10 +9,12 @@ | |
| \alias{.EACHI} | ||
| \alias{.NGRP} | ||
| \alias{.NATURAL} | ||
| \alias{.ROW} | ||
| \title{ Special symbols } | ||
| \description{ | ||
| \code{.SD}, \code{.BY}, \code{.N}, \code{.I}, \code{.GRP}, and \code{.NGRP} are \emph{read-only} symbols for use in \code{j}. \code{.N} can be used in \code{i} as well. \code{.I} can be used in \code{by} as well. See the vignettes, Details and Examples here and in \code{\link{data.table}}. | ||
| \code{.EACHI} is a symbol passed to \code{by}; i.e. \code{by=.EACHI}. \code{.NATURAL} is a symbol passed to \code{on}; i.e. \code{on=.NATURAL}. | ||
| \code{.ROW} is a symbol used with \code{:= NULL} to delete rows by reference; i.e. \code{DT[i, .ROW := NULL]} deletes the rows selected by \code{i}. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe just point to |
||
| } | ||
| \details{ | ||
| The bindings of these variables are locked and attempting to assign to them will generate an error. If you wish to manipulate \code{.SD} before returning it, take a \code{\link{copy}(.SD)} first (see FAQ 4.5). Using \code{:=} in the \code{j} of \code{.SD} is reserved for future use as a (tortuously) flexible way to update \code{DT} by reference by group (even when groups are not contiguous in an ad hoc by). | ||
|
|
@@ -32,6 +34,8 @@ | |
|
|
||
| \code{.NATURAL} is defined as \code{NULL} but its value is not used. Its usage is \code{on=.NATURAL} (alternative of \code{X[on=Y]}) which joins two tables on their common column names, performing a natural join; see \code{\link{data.table}}'s \code{on} argument for more details. | ||
|
|
||
| \code{.ROW} is a symbol that can only be used with \code{:= NULL} to delete rows by reference. When you use \code{DT[i, .ROW := NULL]}, the rows matching the \code{i} expression are removed from \code{DT} in-place. This is an efficient way to delete rows without copying the entire data.table. The \code{i} argument is required and \code{by}/\code{keyby} are not supported. After deletion, any keys and indices on \code{DT} are cleared. See \code{\link{:=}} for more on reference semantics. | ||
|
|
||
| Note that \code{.N} in \code{i} is computed up-front, while that in \code{j} applies \emph{after filtering in \code{i}}. That means that even absent grouping, \code{.N} in \code{i} can be different from \code{.N} in \code{j}. See Examples. | ||
|
|
||
| Note also that you should consider these symbols read-only and of limited scope -- internal data.table code might manipulate them in unexpected ways, and as such their bindings are locked. There are subtle ways to wind up with the wrong object, especially when attempting to copy their values outside a grouping context. See examples; when in doubt, \code{copy()} is your friend. | ||
|
|
@@ -72,5 +76,12 @@ DT[, .(min(.SD[,-1])), by=.I] | |
| # Do not expect this to correctly append the value of .BY in each group; copy(.BY) will work. | ||
| by_tracker = list() | ||
| DT[, { append(by_tracker, .BY); sum(v) }, by=x] | ||
|
|
||
| # .ROW to delete rows by reference | ||
| DT = data.table(a=1:5, b=letters[1:5]) | ||
| DT[c(2,4), .ROW := NULL] | ||
| DT | ||
| DT[a>2, .ROW := NULL] | ||
| DT | ||
| } | ||
| \keyword{ data } | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |||||
| \alias{truelength} | ||||||
| \alias{setalloccol} | ||||||
| \alias{alloc.col} | ||||||
| \alias{setallocrow} | ||||||
| \title{ Over-allocation access } | ||||||
| \description{ | ||||||
| These functions are experimental and somewhat advanced. By \emph{experimental} we mean their names might change and perhaps the syntax, argument names and types. So if you write a lot of code using them, you have been warned! They should work and be stable, though, so please report problems with them. \code{alloc.col} is just an alias to \code{setalloccol}. We recommend using \code{setalloccol} (though \code{alloc.col} will continue to be supported) because the \code{set*} prefix in \code{setalloccol} makes it clear that its input argument is modified in-place. | ||||||
|
|
@@ -14,11 +15,14 @@ setalloccol(DT, | |||||
| alloc.col(DT, | ||||||
| n = getOption("datatable.alloccol"), # default: 1024L | ||||||
| verbose = getOption("datatable.verbose")) # default: FALSE | ||||||
| setallocrow(DT, n = 0L) | ||||||
| } | ||||||
| \arguments{ | ||||||
| \item{x}{ Any type of vector, including \code{data.table} which is a \code{list} vector of column pointers. } | ||||||
| \item{DT}{ A \code{data.table}. } | ||||||
| \item{n}{ The number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. } | ||||||
| \item{n}{ For \code{setalloccol} and \code{alloc.col}: the number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. | ||||||
|
|
||||||
| For \code{setallocrow}: the number of rows to over-allocate. If \code{n > 0}, allocates capacity for current rows plus \code{n} additional rows. If \code{n == 0} (default), shrinks columns to exact current size to free excess memory. } | ||||||
| \item{verbose}{ Output status and information. } | ||||||
| } | ||||||
| \details{ | ||||||
|
|
@@ -34,6 +38,12 @@ alloc.col(DT, | |||||
| (perhaps in your .Rprofile); e.g., \code{options(datatable.alloccol=10000L)}. | ||||||
|
|
||||||
| Please note: over-allocation of the column pointer vector is not for efficiency \emph{per se}; it is so that \code{:=} can add columns by reference without a shallow copy. | ||||||
|
|
||||||
| \code{setallocrow} is a utility function that prepares columns for fast row operations by reference and manages row capacity. Currently only row deletion is implemented; row insertion is not implemented yet. | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. current wording doesn't make clear that delete is implemented, insert is not.
Suggested change
|
||||||
| Before deleting or inserting rows by reference, columns must be resizable. | ||||||
| \code{setallocrow} ensures all columns are in the appropriate state by converting ALTREP columns to materialized form and reallocating | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this needs a bit of clarification -- when I read it, my thought was "oh, we can't do OTOH, I think (?) we strive to expand all |
||||||
| columns to have the target capacity. When \code{n > 0}, columns are over-allocated with extra capacity for future row additions. | ||||||
| When \code{n == 0}, columns are shrunk to exact size to free unused memory. This operation modifies \code{DT} by reference. | ||||||
| } | ||||||
| \value{ | ||||||
| \code{truelength(x)} returns the length of the vector allocated in memory. \code{length(x)} of those items are in use. Currently, it is just the list vector of column | ||||||
|
|
@@ -43,6 +53,8 @@ alloc.col(DT, | |||||
|
|
||||||
| \code{setalloccol} \emph{reallocates} \code{DT} by reference. This may be useful for efficiency if you know you are about to add a lot of columns in a loop. | ||||||
| It also returns the new \code{DT}, for convenience in compound queries. | ||||||
|
|
||||||
| \code{setallocrow} modifies \code{DT} by reference to ensure all columns are resizable. | ||||||
| } | ||||||
| \seealso{ \code{\link{copy}} } | ||||||
| \examples{ | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggest unnesting: