diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp
index 08af1c79769..c02c27422bc 100644
--- a/c_glib/arrow-glib/writer.cpp
+++ b/c_glib/arrow-glib/writer.cpp
@@ -18,6 +18,7 @@
  */
 
 #include
+#include
 #include
 #include
 #include
@@ -300,6 +301,330 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
   }
 }
 
+struct GArrowCSVWriteOptionsPrivate
+{
+  arrow::csv::WriteOptions write_options;
+};
+
+enum {
+  PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER = 1,
+  PROP_CSV_WRITE_OPTIONS_BATCH_SIZE,
+  PROP_CSV_WRITE_OPTIONS_DELIMITER,
+  PROP_CSV_WRITE_OPTIONS_NULL_STRING,
+  PROP_CSV_WRITE_OPTIONS_EOL,
+  PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE,
+  PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER,
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GArrowCSVWriteOptions, garrow_csv_write_options, G_TYPE_OBJECT)
+
+#define GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object) \
+  static_cast<GArrowCSVWriteOptionsPrivate *>( \
+    garrow_csv_write_options_get_instance_private(GARROW_CSV_WRITE_OPTIONS(object)))
+
+static void
+garrow_csv_write_options_finalize(GObject *object)
+{
+  auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object);
+  // GObject frees the private area without running C++ destructors, so destroy
+  // the options explicitly to release the strings they own.
+  priv->write_options.~WriteOptions();
+  G_OBJECT_CLASS(garrow_csv_write_options_parent_class)->finalize(object);
+}
+
+static void
+garrow_csv_write_options_set_property(GObject *object,
+                                      guint prop_id,
+                                      const GValue *value,
+                                      GParamSpec *pspec)
+{
+  auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER:
+    priv->write_options.include_header = g_value_get_boolean(value);
+    break;
+  case PROP_CSV_WRITE_OPTIONS_BATCH_SIZE:
+    priv->write_options.batch_size = g_value_get_int(value);
+    break;
+  case PROP_CSV_WRITE_OPTIONS_DELIMITER:
+    priv->write_options.delimiter = g_value_get_schar(value);
+    break;
+  case PROP_CSV_WRITE_OPTIONS_NULL_STRING:
+    {
+      // The property may be set to NULL, which std::string can't be assigned from.
+      auto null_string = g_value_get_string(value);
+      priv->write_options.null_string = null_string ? null_string : "";
+    }
+    break;
+  case PROP_CSV_WRITE_OPTIONS_EOL:
+    {
+      // The property may be set to NULL, which std::string can't be assigned from.
+      auto eol = g_value_get_string(value);
+      priv->write_options.eol = eol ? eol : "";
+    }
+    break;
+  case PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE:
+    priv->write_options.quoting_style =
+      static_cast<arrow::csv::QuotingStyle>(g_value_get_enum(value));
+    break;
+  case PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER:
+    priv->write_options.quoting_header =
+      static_cast<arrow::csv::QuotingStyle>(g_value_get_enum(value));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_csv_write_options_get_property(GObject *object,
+                                      guint prop_id,
+                                      GValue *value,
+                                      GParamSpec *pspec)
+{
+  auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER:
+    g_value_set_boolean(value, priv->write_options.include_header);
+    break;
+  case PROP_CSV_WRITE_OPTIONS_BATCH_SIZE:
+    g_value_set_int(value, priv->write_options.batch_size);
+    break;
+  case PROP_CSV_WRITE_OPTIONS_DELIMITER:
+    g_value_set_schar(value, priv->write_options.delimiter);
+    break;
+  case PROP_CSV_WRITE_OPTIONS_NULL_STRING:
+    g_value_set_string(value, priv->write_options.null_string.c_str());
+    break;
+  case PROP_CSV_WRITE_OPTIONS_EOL:
+    g_value_set_string(value, priv->write_options.eol.c_str());
+    break;
+  case PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE:
+    g_value_set_enum(
+      value,
+      static_cast<GArrowCSVQuotingStyle>(priv->write_options.quoting_style));
+    break;
+  case PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER:
+    g_value_set_enum(
+      value,
+      static_cast<GArrowCSVQuotingStyle>(priv->write_options.quoting_header));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_csv_write_options_init(GArrowCSVWriteOptions *object)
+{
+  auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(object);
+  // The private area is raw memory handed out by GObject: construct the C++
+  // object in place instead of assigning to a not yet constructed one.
+  new (&priv->write_options)
+    arrow::csv::WriteOptions(arrow::csv::WriteOptions::Defaults());
+}
+
+static void
+garrow_csv_write_options_class_init(GArrowCSVWriteOptionsClass *klass)
+{
+  GParamSpec *spec;
+
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize = garrow_csv_write_options_finalize;
+  gobject_class->set_property = garrow_csv_write_options_set_property;
+  gobject_class->get_property = garrow_csv_write_options_get_property;
+
+  auto write_options = arrow::csv::WriteOptions::Defaults();
+
+  /**
+   * GArrowCSVWriteOptions:include-header:
+   *
+   * Whether to write an initial header line with column names.
+   *
+   * Since: 23.0.0
+   */
+  spec = g_param_spec_boolean("include-header",
+                              "Include header",
+                              "Whether to write an initial header line with column names",
+                              write_options.include_header,
+                              static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class,
+                                  PROP_CSV_WRITE_OPTIONS_INCLUDE_HEADER,
+                                  spec);
+
+  /**
+   * GArrowCSVWriteOptions:batch-size:
+   *
+   * Maximum number of rows processed at a time.
+   *
+   * The CSV writer converts and writes data in batches of N rows. This number can impact
+   * performance.
+   *
+   * Since: 23.0.0
+   */
+  spec = g_param_spec_int("batch-size",
+                          "Batch size",
+                          "Maximum number of rows processed at a time",
+                          1,
+                          G_MAXINT32,
+                          write_options.batch_size,
+                          static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_CSV_WRITE_OPTIONS_BATCH_SIZE, spec);
+
+  /**
+   * GArrowCSVWriteOptions:delimiter:
+   *
+   * Field delimiter.
+   *
+   * Since: 23.0.0
+   */
+  spec = g_param_spec_char("delimiter",
+                           "Delimiter",
+                           "Field delimiter",
+                           0,
+                           G_MAXINT8,
+                           write_options.delimiter,
+                           static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_CSV_WRITE_OPTIONS_DELIMITER, spec);
+
+  /**
+   * GArrowCSVWriteOptions:null-string:
+   *
+   * The string to write for null values. Quotes are not allowed in this string.
+   * Setting %NULL is the same as setting an empty string.
+   *
+   * Since: 23.0.0
+   */
+  spec = g_param_spec_string("null-string",
+                             "Null string",
+                             "The string to write for null values",
+                             "",
+                             static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class,
+                                  PROP_CSV_WRITE_OPTIONS_NULL_STRING,
+                                  spec);
+
+  /**
+   * GArrowCSVWriteOptions:eol:
+   *
+   * The end of line character to use for ending rows.
+   * Setting %NULL is the same as setting an empty string.
+   *
+   * Since: 23.0.0
+   */
+  spec = g_param_spec_string("eol",
+                             "EOL",
+                             "The end of line character to use for ending rows",
+                             "\n",
+                             static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class, PROP_CSV_WRITE_OPTIONS_EOL, spec);
+
+  /**
+   * GArrowCSVWriteOptions:quoting-style:
+   *
+   * Quoting style.
+   *
+   * Since: 23.0.0
+   */
+  spec =
+    g_param_spec_enum("quoting-style",
+                      "Quoting style",
+                      "Quoting style",
+                      GARROW_TYPE_CSV_QUOTING_STYLE,
+                      static_cast<GArrowCSVQuotingStyle>(write_options.quoting_style),
+                      static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class,
+                                  PROP_CSV_WRITE_OPTIONS_QUOTING_STYLE,
+                                  spec);
+
+  /**
+   * GArrowCSVWriteOptions:quoting-header:
+   *
+   * Quoting style of header.
+   *
+   * Note that #GARROW_CSV_QUOTING_STYLE_NEEDED and #GARROW_CSV_QUOTING_STYLE_ALL_VALID
+   * have the same effect of quoting all column names.
+   *
+   * Since: 23.0.0
+   */
+  spec =
+    g_param_spec_enum("quoting-header",
+                      "Quoting header",
+                      "Quoting style of header",
+                      GARROW_TYPE_CSV_QUOTING_STYLE,
+                      static_cast<GArrowCSVQuotingStyle>(write_options.quoting_header),
+                      static_cast<GParamFlags>(G_PARAM_READWRITE));
+  g_object_class_install_property(gobject_class,
+                                  PROP_CSV_WRITE_OPTIONS_QUOTING_HEADER,
+                                  spec);
+}
+
+/**
+ * garrow_csv_write_options_new:
+ *
+ * Returns: A newly created #GArrowCSVWriteOptions.
+ *
+ * Since: 23.0.0
+ */
+GArrowCSVWriteOptions *
+garrow_csv_write_options_new(void)
+{
+  auto csv_write_options = g_object_new(GARROW_TYPE_CSV_WRITE_OPTIONS, NULL);
+  return GARROW_CSV_WRITE_OPTIONS(csv_write_options);
+}
+
+G_DEFINE_TYPE(GArrowCSVWriter, garrow_csv_writer, GARROW_TYPE_RECORD_BATCH_WRITER);
+
+static void
+garrow_csv_writer_init(GArrowCSVWriter *object)
+{
+}
+
+static void
+garrow_csv_writer_class_init(GArrowCSVWriterClass *klass)
+{
+}
+
+/**
+ * garrow_csv_writer_new:
+ * @sink: The output of the writer.
+ * @schema: The schema of the writer.
+ * @options: (nullable): Options for serialization. Defaults are used if %NULL.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GArrowCSVWriter
+ *   or %NULL on error.
+ *
+ * Since: 23.0.0
+ */
+GArrowCSVWriter *
+garrow_csv_writer_new(GArrowOutputStream *sink,
+                      GArrowSchema *schema,
+                      GArrowCSVWriteOptions *options,
+                      GError **error)
+{
+  auto arrow_sink = garrow_output_stream_get_raw(sink);
+  auto arrow_schema = garrow_schema_get_raw(schema);
+  // Fall back to the default options when the caller doesn't supply any.
+  auto arrow_write_options = arrow::csv::WriteOptions::Defaults();
+  if (options) {
+    arrow_write_options = *garrow_csv_write_options_get_raw(options);
+  }
+  auto arrow_writer_result =
+    arrow::csv::MakeCSVWriter(arrow_sink, arrow_schema, arrow_write_options);
+  if (garrow::check(error, arrow_writer_result, "[csv-writer][new]")) {
+    auto arrow_writer = *arrow_writer_result;
+    return garrow_csv_writer_new_raw(&arrow_writer);
+  } else {
+    return NULL;
+  }
+}
+
 G_END_DECLS
 
 GArrowRecordBatchWriter *
@@ -343,3 +668,20 @@ garrow_record_batch_file_writer_new_raw(
                  NULL));
   return writer;
 }
+
+// Wraps the Arrow C++ writer pointed to by arrow_writer in a new GArrowCSVWriter.
+GArrowCSVWriter *
+garrow_csv_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer)
+{
+  auto writer = GARROW_CSV_WRITER(
+    g_object_new(GARROW_TYPE_CSV_WRITER, "record-batch-writer", arrow_writer, NULL));
+  return writer;
+}
+
+// Returns a pointer to the Arrow C++ options stored inside options (not a copy).
+arrow::csv::WriteOptions *
+garrow_csv_write_options_get_raw(GArrowCSVWriteOptions *options)
+{
+  auto priv = GARROW_CSV_WRITE_OPTIONS_GET_PRIVATE(options);
+  return &priv->write_options;
+}
diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h
index cea8390d902..fc5fe0c2c73 100644
--- a/c_glib/arrow-glib/writer.h
+++ b/c_glib/arrow-glib/writer.h
@@ -94,4 +94,49 @@ garrow_record_batch_file_writer_new(GArrowOutputStream *sink,
                                     GArrowSchema *schema,
                                     GError **error);
 
+/**
+ * GArrowCSVQuotingStyle:
+ * @GARROW_CSV_QUOTING_STYLE_NEEDED: Only enclose values in quotes which need them.
+ * @GARROW_CSV_QUOTING_STYLE_ALL_VALID: Enclose all valid values in quotes.
+ * @GARROW_CSV_QUOTING_STYLE_NONE: Do not enclose any values in quotes.
+ *
+ * They are corresponding to `arrow::csv::QuotingStyle` values.
+ *
+ * Since: 23.0.0
+ */
+typedef enum {
+  GARROW_CSV_QUOTING_STYLE_NEEDED,
+  GARROW_CSV_QUOTING_STYLE_ALL_VALID,
+  GARROW_CSV_QUOTING_STYLE_NONE,
+} GArrowCSVQuotingStyle;
+
+#define GARROW_TYPE_CSV_WRITE_OPTIONS (garrow_csv_write_options_get_type())
+GARROW_AVAILABLE_IN_23_0
+G_DECLARE_DERIVABLE_TYPE(
+  GArrowCSVWriteOptions, garrow_csv_write_options, GARROW, CSV_WRITE_OPTIONS, GObject)
+struct _GArrowCSVWriteOptionsClass
+{
+  GObjectClass parent_class;
+};
+
+GARROW_AVAILABLE_IN_23_0
+GArrowCSVWriteOptions *
+garrow_csv_write_options_new(void);
+
+#define GARROW_TYPE_CSV_WRITER (garrow_csv_writer_get_type())
+GARROW_AVAILABLE_IN_23_0
+G_DECLARE_DERIVABLE_TYPE(
+  GArrowCSVWriter, garrow_csv_writer, GARROW, CSV_WRITER, GArrowRecordBatchWriter)
+struct _GArrowCSVWriterClass
+{
+  GArrowRecordBatchWriterClass parent_class;
+};
+
+GARROW_AVAILABLE_IN_23_0
+GArrowCSVWriter *
+garrow_csv_writer_new(GArrowOutputStream *sink,
+                      GArrowSchema *schema,
+                      GArrowCSVWriteOptions *options,
+                      GError **error);
+
 G_END_DECLS
diff --git a/c_glib/arrow-glib/writer.hpp b/c_glib/arrow-glib/writer.hpp
index 1d85ac52f88..553d83a3828 100644
--- a/c_glib/arrow-glib/writer.hpp
+++ b/c_glib/arrow-glib/writer.hpp
@@ -20,6 +20,8 @@
 #pragma once
 
 #include
+#include
+#include
 #include
 
 #include
@@ -42,3 +44,13 @@ GARROW_AVAILABLE_IN_ALL
 GArrowRecordBatchFileWriter *
 garrow_record_batch_file_writer_new_raw(
   std::shared_ptr *arrow_writer);
+
+// Wraps an Arrow C++ record batch writer in a new GArrowCSVWriter.
+GARROW_AVAILABLE_IN_23_0
+GArrowCSVWriter *
+garrow_csv_writer_new_raw(std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer);
+
+// Returns the Arrow C++ write options wrapped by a GArrowCSVWriteOptions.
+GARROW_AVAILABLE_IN_23_0
+arrow::csv::WriteOptions *
+garrow_csv_write_options_get_raw(GArrowCSVWriteOptions *options);
diff --git a/c_glib/test/test-csv-writer.rb b/c_glib/test/test-csv-writer.rb
new file mode 100644
index 00000000000..1eb3c1e8d9c
--- /dev/null
+++ b/c_glib/test/test-csv-writer.rb
@@ -0,0 +1,174 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class TestCSVWriter < Test::Unit::TestCase
+  include Helper::Buildable
+
+  # Builds the schema shared by the write tests: a string "message" column
+  # and an int64 "count" column.
+  def message_count_schema
+    message_field = Arrow::Field.new("message", Arrow::StringDataType.new)
+    count_field = Arrow::Field.new("count", Arrow::Int64DataType.new)
+    Arrow::Schema.new([message_field, count_field])
+  end
+
+  # Yields a CSV writer backed by an in-memory buffer, closes the writer and
+  # the stream even when the block fails, and returns the written CSV text.
+  def write_csv(schema, options)
+    buffer = Arrow::ResizableBuffer.new(0)
+    output = Arrow::BufferOutputStream.new(buffer)
+    begin
+      csv_writer = Arrow::CSVWriter.new(output, schema, options)
+      begin
+        yield(csv_writer)
+      ensure
+        csv_writer.close
+        assert do
+          csv_writer.closed?
+        end
+      end
+    ensure
+      output.close
+    end
+    buffer.data.to_s
+  end
+
+  def test_write_record_batch
+    message_data = ["Start", "Shutdown"]
+    count_data = [2, 9]
+    schema = message_count_schema
+    csv_output = write_csv(schema, nil) do |csv_writer|
+      record_batch = Arrow::RecordBatch.new(schema,
+                                            message_data.size,
+                                            [
+                                              build_string_array(message_data),
+                                              build_int64_array(count_data),
+                                            ])
+      csv_writer.write_record_batch(record_batch)
+    end
+
+    expected = <<~CSV
+      "message","count"
+      "Start",2
+      "Shutdown",9
+    CSV
+    assert_equal(expected, csv_output)
+  end
+
+  def test_write_table
+    message_data = ["Start", "Shutdown", "Reboot"]
+    count_data = [2, 9, 5]
+    schema = message_count_schema
+    csv_output = write_csv(schema, nil) do |csv_writer|
+      table = Arrow::Table.new(schema,
+                               [
+                                 build_string_array(message_data),
+                                 build_int64_array(count_data),
+                               ])
+      csv_writer.write_table(table)
+    end
+
+    expected = <<~CSV
+      "message","count"
+      "Start",2
+      "Shutdown",9
+      "Reboot",5
+    CSV
+    assert_equal(expected, csv_output)
+  end
+
+  sub_test_case("options") do
+    def setup
+      @options = Arrow::CSVWriteOptions.new
+    end
+
+    def test_include_header
+      assert do
+        @options.include_header?
+      end
+      @options.include_header = false
+      assert do
+        not @options.include_header?
+      end
+    end
+
+    def test_batch_size
+      assert_equal(1024, @options.batch_size)
+      @options.batch_size = 2048
+      assert_equal(2048, @options.batch_size)
+    end
+
+    def test_delimiter
+      assert_equal(44, @options.delimiter) # 44 is the ASCII code for comma
+      @options.delimiter = ";".ord
+      assert_equal(59, @options.delimiter) # 59 is the ASCII code for semicolon
+    end
+
+    def test_null_string
+      assert_equal("", @options.null_string)
+      @options.null_string = "NULL"
+      assert_equal("NULL", @options.null_string)
+    end
+
+    def test_eol
+      assert_equal("\n", @options.eol)
+      @options.eol = "\r\n"
+      assert_equal("\r\n", @options.eol)
+    end
+
+    def test_quoting_style
+      assert_equal(Arrow::CSVQuotingStyle::NEEDED, @options.quoting_style)
+      @options.quoting_style = Arrow::CSVQuotingStyle::ALL_VALID
+      assert_equal(Arrow::CSVQuotingStyle::ALL_VALID, @options.quoting_style)
+    end
+
+    def test_quoting_header
+      assert_equal(Arrow::CSVQuotingStyle::NEEDED, @options.quoting_header)
+      @options.quoting_header = Arrow::CSVQuotingStyle::NONE
+      assert_equal(Arrow::CSVQuotingStyle::NONE, @options.quoting_header)
+    end
+
+    def test_write_with_options
+      message_data = ["Start", nil, "Reboot"]
+      count_data = [2, 9, 5]
+      schema = message_count_schema
+
+      options = Arrow::CSVWriteOptions.new
+      options.include_header = false
+      options.delimiter = ";".ord
+      options.quoting_style = Arrow::CSVQuotingStyle::NONE
+      options.null_string = "NULL"
+
+      csv_output = write_csv(schema, options) do |csv_writer|
+        record_batch = Arrow::RecordBatch.new(schema,
+                                              message_data.size,
+                                              [
+                                                build_string_array(message_data),
+                                                build_int64_array(count_data),
+                                              ])
+        csv_writer.write_record_batch(record_batch)
+      end
+
+      expected = <<~CSV
+        Start;2
+        NULL;9
+        Reboot;5
+      CSV
+      assert_equal(expected, csv_output)
+    end
+  end
+end
diff --git a/ruby/red-arrow/lib/arrow/csv-write-options.rb b/ruby/red-arrow/lib/arrow/csv-write-options.rb
new file mode 100644
index 00000000000..ff477784896
--- /dev/null
+++ b/ruby/red-arrow/lib/arrow/csv-write-options.rb
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module Arrow
+  class CSVWriteOptions
+    # Read the delimiter as a 1 byte String; accept a 1 byte String or a raw byte.
+    alias_method :delimiter_raw, :delimiter
+    def delimiter
+      delimiter_raw.chr
+    end
+
+    alias_method :delimiter_raw=, :delimiter=
+    def delimiter=(delimiter)
+      if delimiter.is_a?(String)
+        unless delimiter.bytesize == 1
+          message = "delimiter must be 1 byte character: #{delimiter.inspect}"
+          raise ArgumentError, message
+        end
+        delimiter = delimiter.ord
+      end
+      self.delimiter_raw = delimiter
+    end
+  end
+end
diff --git a/ruby/red-arrow/lib/arrow/libraries.rb b/ruby/red-arrow/lib/arrow/libraries.rb
index 8135a2d4e7d..3bbf621f2ba 100644
--- a/ruby/red-arrow/lib/arrow/libraries.rb
+++ b/ruby/red-arrow/lib/arrow/libraries.rb
@@ -39,6 +39,7 @@
 require_relative "compression-type"
 require_relative "csv-loader"
 require_relative "csv-read-options"
+require_relative "csv-write-options"
 require_relative "data-type"
 require_relative "date32-array"
 require_relative "date32-array-builder"
diff --git a/ruby/red-arrow/test/test-csv-writer.rb b/ruby/red-arrow/test/test-csv-writer.rb
new file mode 100644
index 00000000000..946cb99357e
--- /dev/null
+++ b/ruby/red-arrow/test/test-csv-writer.rb
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+class CSVWriterTest < Test::Unit::TestCase
+  sub_test_case("CSVWriteOptions") do
+    def setup
+      @options = Arrow::CSVWriteOptions.new
+    end
+
+    # The delimiter is read and written as a String here.
+    def test_delimiter
+      assert_equal(",", @options.delimiter)
+      @options.delimiter = ";"
+      assert_equal(";", @options.delimiter)
+    end
+  end
+
+  # Writes a table holding a nil value with ";" as the delimiter.
+  def test_write_table
+    write_options = Arrow::CSVWriteOptions.new
+    write_options.delimiter = ";"
+
+    table = Arrow::Table.new({
+      message: ["Start", nil, "Reboot"],
+      count: [2, 9, 5],
+    })
+
+    buffer = Arrow::ResizableBuffer.new(0)
+    Arrow::BufferOutputStream.open(buffer) do |output|
+      Arrow::CSVWriter.open(output, table.schema, write_options) do |writer|
+        writer.write_table(table)
+      end
+    end
+
+    assert_equal(<<~CSV, buffer.data.to_s)
+      "message";"count"
+      "Start";2
+      ;9
+      "Reboot";5
+    CSV
+  end
+end