Skip to content

Commit 205ceb9

Browse files
authored
GH-34775: [R] arrow_table: as.data.frame() sometimes returns a tbl and sometimes a data.frame (#35173)
Features of this PR:
* Ensures that calling `as.data.frame()` on Arrow objects returns base R `data.frame` objects.
* Drops the `class` attribute metadata of input objects of `data.frame` class (i.e. objects that don't inherit from any additional classes other than `data.frame`). This results in us sacrificing roundtrip class fidelity for `data.frame` objects (i.e. if we input a base R data.frame, convert it to an Arrow Table, and then convert it back to R, we get a tibble). However, we now have consistency in the type of returned objects, retain roundtrip fidelity for other (non-class) metadata, and guarantee that `as.data.frame()` returns a base R data.frame. Users who wish to input and return a `data.frame` object can call `as.data.frame()` on the returned object.
* Implements `dplyr::collect()` for StructArrays so that these objects can still be returned as tibbles if needed.
* Renames `expect_data_frame()` to `expect_equal_data_frame()` for clarity, and updates it to convert both the object and expected object to data.frames.
* Closes: #34775

Authored-by: Nic Crane <thisisnic@gmail.com>
Signed-off-by: Nic Crane <thisisnic@gmail.com>
1 parent 2ee0345 commit 205ceb9

27 files changed

Lines changed: 229 additions & 184 deletions

r/R/array.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ dim.StructArray <- function(x, ...) c(length(x), x$type$num_fields)
474474

475475
#' @export
476476
as.data.frame.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) {
477-
as.vector(x)
477+
as.data.frame(collect.StructArray(x), row.names = row.names, optional = optional, ...)
478478
}
479479

480480
#' @rdname array

r/R/arrow-tabular.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ ArrowTabular <- R6Class("ArrowTabular",
9494
#' @export
9595
as.data.frame.ArrowTabular <- function(x, row.names = NULL, optional = FALSE, ...) {
9696
df <- x$to_data_frame()
97-
apply_arrow_r_metadata(df, x$metadata$r)
97+
out <- apply_arrow_r_metadata(df, x$metadata$r)
98+
as.data.frame(out, row.names = row.names, optional = optional, ...)
9899
}
99100

100101
#' @export

r/R/csv.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ read_delim_arrow <- function(file,
248248
}
249249

250250
if (isTRUE(as_data_frame)) {
251-
tab <- as.data.frame(tab)
251+
tab <- collect.ArrowTabular(tab)
252252
}
253253

254254
tab

r/R/dplyr-collect.R

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
2424
}
2525
collect.ArrowTabular <- function(x, as_data_frame = TRUE, ...) {
2626
if (as_data_frame) {
27-
as.data.frame(x, ...)
27+
df <- x$to_data_frame()
28+
apply_arrow_r_metadata(df, x$metadata$r)
2829
} else {
2930
x
3031
}
@@ -34,6 +35,10 @@ collect.Dataset <- function(x, as_data_frame = TRUE, ...) {
3435
}
3536
collect.RecordBatchReader <- collect.Dataset
3637

38+
collect.StructArray <- function(x, row.names = NULL, optional = FALSE, ...) {
39+
as.vector(x)
40+
}
41+
3742
compute.ArrowTabular <- function(x, ...) x
3843
compute.arrow_dplyr_query <- function(x, ...) {
3944
# TODO: should this tryCatch move down into as_arrow_table()?

r/R/dplyr.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,8 @@ unique.RecordBatchReader <- unique.arrow_dplyr_query
216216

217217
#' @export
218218
as.data.frame.arrow_dplyr_query <- function(x, row.names = NULL, optional = FALSE, ...) {
219-
collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
219+
out <- collect.arrow_dplyr_query(x, as_data_frame = TRUE, ...)
220+
as.data.frame(out)
220221
}
221222

222223
#' @export

r/R/feather.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T
196196
)
197197

198198
if (isTRUE(as_data_frame)) {
199-
out <- as.data.frame(out)
199+
df <- out$to_data_frame()
200+
out <- apply_arrow_r_metadata(df, out$metadata$r)
200201
}
201202
out
202203
}

r/R/ipc-stream.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ read_ipc_stream <- function(file, as_data_frame = TRUE, ...) {
106106
# https://issues.apache.org/jira/browse/ARROW-6830
107107
out <- RecordBatchStreamReader$create(file)$read_table()
108108
if (as_data_frame) {
109-
out <- as.data.frame(out)
109+
out <- collect.ArrowTabular(out)
110110
}
111111
out
112112
}

r/R/json.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ read_json_arrow <- function(file,
8484
}
8585

8686
if (isTRUE(as_data_frame)) {
87-
tab <- as.data.frame(tab)
87+
tab <- collect.ArrowTabular(tab)
8888
}
8989
tab
9090
}

r/R/metadata.R

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,14 @@
2222
# drop problems attributes (most likely from readr)
2323
x[["attributes"]][["problems"]] <- NULL
2424

25+
# remove the class if it's just data.frame
26+
if (identical(x$attributes$class, "data.frame")) {
27+
x$attributes <- x$attributes[names(x$attributes) != "class"]
28+
if (is_empty(x$attributes)) {
29+
x <- x[names(x) != "attributes"]
30+
}
31+
}
32+
2533
out <- serialize(x, NULL, ascii = TRUE)
2634

2735
# if the metadata is over 100 kB, compress
@@ -62,6 +70,7 @@ apply_arrow_r_metadata <- function(x, r_metadata) {
6270
expr = {
6371
columns_metadata <- r_metadata$columns
6472
if (is.data.frame(x)) {
73+
# if columns metadata exists, apply it here
6574
if (length(names(x)) && !is.null(columns_metadata)) {
6675
for (name in intersect(names(columns_metadata), names(x))) {
6776
x[[name]] <- apply_arrow_r_metadata(x[[name]], columns_metadata[[name]])

r/R/parquet.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ read_parquet <- function(file,
7070
}
7171

7272
if (as_data_frame) {
73-
tab <- as.data.frame(tab)
73+
tab <- collect.ArrowTabular(tab)
7474
}
7575
tab
7676
}

0 commit comments

Comments
 (0)