From b9129244df55bbca71077d83d32082bc3134edf6 Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Sun, 4 Jan 2026 20:47:29 +0800 Subject: [PATCH 1/5] feat: implement add column/delete column --- src/iceberg/json_internal.cc | 4 +- src/iceberg/schema_field.cc | 21 +- src/iceberg/schema_field.h | 12 +- src/iceberg/test/CMakeLists.txt | 1 + src/iceberg/test/meson.build | 9 + src/iceberg/test/update_schema_test.cc | 741 +++++++++++++++++++++++++ src/iceberg/type.cc | 10 +- src/iceberg/type.h | 1 + src/iceberg/update/update_schema.cc | 408 +++++++++++++- src/iceberg/update/update_schema.h | 22 +- src/iceberg/util/visit_type.h | 36 ++ src/iceberg/util/visitor_generate.h | 17 + 12 files changed, 1245 insertions(+), 37 deletions(-) create mode 100644 src/iceberg/test/update_schema_test.cc diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index a52f418e4..a37bfd89b 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -227,6 +227,7 @@ nlohmann::json ToJson(const SchemaField& field) { json[kName] = field.name(); json[kRequired] = !field.optional(); json[kType] = ToJson(*field.type()); + json[kDoc] = field.doc(); return json; } @@ -463,9 +464,10 @@ Result> FieldFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto field_id, GetJsonValue(json, kId)); ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue(json, kName)); ICEBERG_ASSIGN_OR_RAISE(auto required, GetJsonValue(json, kRequired)); + ICEBERG_ASSIGN_OR_RAISE(auto doc, GetJsonValueOrDefault(json, kDoc)); return std::make_unique(field_id, std::move(name), std::move(type), - !required); + !required, doc); } Result> SchemaFromJson(const nlohmann::json& json) { diff --git a/src/iceberg/schema_field.cc b/src/iceberg/schema_field.cc index 04b55a025..206915ec2 100644 --- a/src/iceberg/schema_field.cc +++ b/src/iceberg/schema_field.cc @@ -20,28 +20,29 @@ #include "iceberg/schema_field.h" #include +#include #include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { -SchemaField::SchemaField(int32_t field_id, std::string name, std::shared_ptr type, - bool optional, std::string doc) +SchemaField::SchemaField(int32_t field_id, std::string_view name, + std::shared_ptr type, bool optional, std::string_view doc) : field_id_(field_id), - name_(std::move(name)), + name_(name), type_(std::move(type)), optional_(optional), - doc_(std::move(doc)) {} + doc_(doc) {} -SchemaField SchemaField::MakeOptional(int32_t field_id, std::string name, - std::shared_ptr type, std::string doc) { - return {field_id, std::move(name), std::move(type), true, std::move(doc)}; +SchemaField SchemaField::MakeOptional(int32_t field_id, std::string_view name, + std::shared_ptr type, std::string_view doc) { + return {field_id, name, std::move(type), true, doc}; } -SchemaField SchemaField::MakeRequired(int32_t field_id, std::string name, - std::shared_ptr type, std::string doc) { - return {field_id, std::move(name), std::move(type), false, std::move(doc)}; +SchemaField SchemaField::MakeRequired(int32_t field_id, std::string_view name, + std::shared_ptr type, std::string_view doc) { + return {field_id, name, std::move(type), false, doc}; } int32_t SchemaField::field_id() const { return field_id_; } diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index c7c826d87..fd20226a5 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -46,15 +46,15 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { /// \param[in] type The field type. /// \param[in] optional Whether values of this field are required or nullable. /// \param[in] doc Optional documentation string for the field. - SchemaField(int32_t field_id, std::string name, std::shared_ptr type, - bool optional, std::string doc = {}); + SchemaField(int32_t field_id, std::string_view name, std::shared_ptr type, + bool optional, std::string_view doc = {}); /// \brief Construct an optional (nullable) field. - static SchemaField MakeOptional(int32_t field_id, std::string name, - std::shared_ptr type, std::string doc = {}); + static SchemaField MakeOptional(int32_t field_id, std::string_view name, + std::shared_ptr type, std::string_view doc = {}); /// \brief Construct a required (non-null) field. - static SchemaField MakeRequired(int32_t field_id, std::string name, - std::shared_ptr type, std::string doc = {}); + static SchemaField MakeRequired(int32_t field_id, std::string_view name, + std::shared_ptr type, std::string_view doc = {}); /// \brief Get the field ID. [[nodiscard]] int32_t field_id() const; diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index a32bbe4de..c7b64df2b 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -165,6 +165,7 @@ if(ICEBERG_BUILD_BUNDLE) transaction_test.cc update_partition_spec_test.cc update_properties_test.cc + update_schema_test.cc update_sort_order_test.cc) add_iceberg_test(data_writer_test USE_BUNDLE SOURCES data_writer_test.cc) diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index 378182819..8f7821110 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -93,6 +93,15 @@ iceberg_tests = { ), }, 'roaring_test': {'sources': files('roaring_test.cc')}, + 'table_update_test': { + 'sources': files( + 'transaction_test.cc', + 'update_partition_spec_test.cc', + 'update_properties_test.cc', + 'update_schema_test.cc', + 'update_sort_order_test.cc', + ), + }, } if get_option('rest').enabled() diff --git a/src/iceberg/test/update_schema_test.cc b/src/iceberg/test/update_schema_test.cc new file mode 100644 index 000000000..97d3b15b0 --- /dev/null +++ b/src/iceberg/test/update_schema_test.cc @@ -0,0 +1,741 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/update/update_schema.h" + +#include + +#include + +#include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/test/matchers.h" +#include "iceberg/test/update_test_base.h" +#include "iceberg/type.h" + +namespace iceberg { + +class UpdateSchemaTest : public UpdateTestBase {}; + +// Test adding a simple optional column +TEST_F(UpdateSchemaTest, AddOptionalColumn) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32(), "A new integer column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify the new column was added + ICEBERG_UNWRAP_OR_FAIL(auto new_field_opt, result.schema->FindFieldByName("new_col")); + ASSERT_TRUE(new_field_opt.has_value()); + + const auto& new_field = new_field_opt->get(); + EXPECT_EQ(new_field.name(), "new_col"); + EXPECT_EQ(new_field.type(), int32()); + EXPECT_TRUE(new_field.optional()); + EXPECT_EQ(new_field.doc(), "A new integer column"); +} + +// Test adding a required column (should fail without AllowIncompatibleChanges) +TEST_F(UpdateSchemaTest, AddRequiredColumnFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddRequiredColumn("required_col", string(), "A required string column"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Incompatible change")); +} + +// Test adding a required column with AllowIncompatibleChanges +TEST_F(UpdateSchemaTest, AddRequiredColumnWithAllowIncompatible) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AllowIncompatibleChanges().AddRequiredColumn("required_col", string(), + "A required string column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify the new column was added + ICEBERG_UNWRAP_OR_FAIL(auto new_field_opt, + result.schema->FindFieldByName("required_col")); + ASSERT_TRUE(new_field_opt.has_value()); + + const auto& new_field = new_field_opt->get(); + EXPECT_EQ(new_field.name(), "required_col"); + EXPECT_EQ(new_field.type(), string()); + EXPECT_FALSE(new_field.optional()); + EXPECT_EQ(new_field.doc(), "A required string column"); +} + +// Test adding multiple columns +TEST_F(UpdateSchemaTest, AddMultipleColumns) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("col1", int32(), "First column") + .AddColumn("col2", string(), "Second column") + .AddColumn("col3", boolean(), "Third column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify all columns were added + ICEBERG_UNWRAP_OR_FAIL(auto col1_opt, result.schema->FindFieldByName("col1")); + ICEBERG_UNWRAP_OR_FAIL(auto col2_opt, result.schema->FindFieldByName("col2")); + ICEBERG_UNWRAP_OR_FAIL(auto col3_opt, result.schema->FindFieldByName("col3")); + + ASSERT_TRUE(col1_opt.has_value()); + ASSERT_TRUE(col2_opt.has_value()); + ASSERT_TRUE(col3_opt.has_value()); + + EXPECT_EQ(col1_opt->get().type(), int32()); + EXPECT_EQ(col2_opt->get().type(), string()); + EXPECT_EQ(col3_opt->get().type(), boolean()); +} + +// Test adding column with dot in name should fail for top-level +TEST_F(UpdateSchemaTest, AddColumnWithDotInNameFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("col.with.dots", int32(), "Column with dots"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot add column with ambiguous name")); +} + +// Test adding column to nested struct +TEST_F(UpdateSchemaTest, AddColumnToNestedStruct) { + // First, add a struct column to the table + auto struct_type = std::make_shared(std::vector{ + SchemaField(100, "nested_field", int32(), true, "Nested field")}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type, "A struct column"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Reload table and add column to the nested struct + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("struct_col", "new_nested_field", string(), "New nested field"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify the nested field was added + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("struct_col")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + ASSERT_TRUE(struct_field.type()->is_nested()); + + const auto& nested_struct = static_cast(*struct_field.type()); + ICEBERG_UNWRAP_OR_FAIL(auto nested_field_opt, + nested_struct.GetFieldByName("new_nested_field")); + ASSERT_TRUE(nested_field_opt.has_value()); + + const auto& nested_field = nested_field_opt->get(); + EXPECT_EQ(nested_field.name(), "new_nested_field"); + EXPECT_EQ(nested_field.type(), string()); +} + +// Test adding column to non-existent parent fails +TEST_F(UpdateSchemaTest, AddColumnToNonExistentParentFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("non_existent_parent", "new_field", int32(), "New field"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot find parent struct")); +} + +// Test adding column to non-struct parent fails +TEST_F(UpdateSchemaTest, AddColumnToNonStructParentFails) { + // First, add a primitive column + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("primitive_col", int32(), "A primitive column"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Try to add column to the primitive column (should fail) + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("primitive_col", "nested_field", string(), "Should fail"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot add to non-struct column")); +} + +// Test adding duplicate column name fails +TEST_F(UpdateSchemaTest, AddDuplicateColumnNameFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("duplicate_col", int32(), "First column") + .AddColumn("duplicate_col", string(), "Duplicate column"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + EXPECT_THAT(result, HasErrorMessage("Duplicate path found")); +} + +// Test column ID assignment +TEST_F(UpdateSchemaTest, ColumnIdAssignment) { + ICEBERG_UNWRAP_OR_FAIL(auto original_schema, table_->schema()); + int32_t original_last_id = table_->metadata()->last_column_id; + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col1", int32(), "First new column") + .AddColumn("new_col2", string(), "Second new column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify new last column ID is incremented + EXPECT_EQ(result.new_last_column_id, original_last_id + 2); + + // Verify new columns have sequential IDs + ICEBERG_UNWRAP_OR_FAIL(auto col1_opt, result.schema->FindFieldByName("new_col1")); + ICEBERG_UNWRAP_OR_FAIL(auto col2_opt, result.schema->FindFieldByName("new_col2")); + + ASSERT_TRUE(col1_opt.has_value()); + ASSERT_TRUE(col2_opt.has_value()); + + EXPECT_EQ(col1_opt->get().field_id(), original_last_id + 1); + EXPECT_EQ(col2_opt->get().field_id(), original_last_id + 2); +} + +// Test adding nested struct with multiple fields +TEST_F(UpdateSchemaTest, AddNestedStructColumn) { + auto nested_struct = std::make_shared(std::vector{ + SchemaField(100, "field1", int32(), true, "First nested field"), + SchemaField(101, "field2", string(), false, "Second nested field")}); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("complex_struct", nested_struct, "A complex struct column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Verify the struct column was added + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("complex_struct")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + EXPECT_EQ(struct_field.name(), "complex_struct"); + EXPECT_TRUE(struct_field.type()->is_nested()); + EXPECT_TRUE(struct_field.optional()); + + // Verify nested fields exist + const auto& nested_type = static_cast(*struct_field.type()); + EXPECT_EQ(nested_type.fields().size(), 2); + + ICEBERG_UNWRAP_OR_FAIL(auto field1_opt, nested_type.GetFieldByName("field1")); + ICEBERG_UNWRAP_OR_FAIL(auto field2_opt, nested_type.GetFieldByName("field2")); + + ASSERT_TRUE(field1_opt.has_value()); + ASSERT_TRUE(field2_opt.has_value()); + + EXPECT_EQ(field1_opt->get().type(), int32()); + EXPECT_EQ(field2_opt->get().type(), string()); + EXPECT_TRUE(field1_opt->get().optional()); + EXPECT_FALSE(field2_opt->get().optional()); +} + +// Test case sensitivity +TEST_F(UpdateSchemaTest, CaseSensitiveColumnNames) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->CaseSensitive(true) + .AddColumn("Column", int32(), "Uppercase column") + .AddColumn("column", string(), "Lowercase column"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + ASSERT_TRUE(result.schema != nullptr); + + // Both columns should exist with case-sensitive search + ICEBERG_UNWRAP_OR_FAIL(auto upper_opt, result.schema->FindFieldByName("Column", true)); + ICEBERG_UNWRAP_OR_FAIL(auto lower_opt, result.schema->FindFieldByName("column", true)); + + ASSERT_TRUE(upper_opt.has_value()); + ASSERT_TRUE(lower_opt.has_value()); + + EXPECT_EQ(upper_opt->get().type(), int32()); + EXPECT_EQ(lower_opt->get().type(), string()); +} + +// Test case insensitive duplicate detection +TEST_F(UpdateSchemaTest, CaseInsensitiveDuplicateDetection) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->CaseSensitive(false) + .AddColumn("Column", int32(), "First column") + .AddColumn("COLUMN", string(), "Duplicate column"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kInvalidSchema)); + EXPECT_THAT(result, HasErrorMessage("Duplicate path found")); +} + +// Test empty update +TEST_F(UpdateSchemaTest, EmptyUpdate) { + ICEBERG_UNWRAP_OR_FAIL(auto original_schema, table_->schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Schema should be unchanged + EXPECT_EQ(*result.schema, *original_schema); + EXPECT_EQ(result.new_last_column_id, table_->metadata()->last_column_id); +} + +// Test commit success +TEST_F(UpdateSchemaTest, CommitSuccess) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("committed_col", int64(), "A committed column"); + + EXPECT_THAT(update->Commit(), IsOk()); + + // Reload table and verify column exists + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto schema, reloaded->schema()); + + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, schema->FindFieldByName("committed_col")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + EXPECT_EQ(field.name(), "committed_col"); + auto type_id = field.type()->type_id(); + + EXPECT_EQ(*field.type(), *int64()); + EXPECT_TRUE(field.optional()); + EXPECT_EQ(field.doc(), "A committed column"); +} + +// Test adding fields to map value and list element structs +TEST_F(UpdateSchemaTest, AddFieldsToMapAndList) { + // Create a schema with map and list of structs + auto map_key_struct = std::make_shared( + std::vector{SchemaField(20, "address", string(), false), + SchemaField(21, "city", string(), false)}); + + auto map_value_struct = std::make_shared( + std::vector{SchemaField(12, "lat", float32(), false), + SchemaField(13, "long", float32(), false)}); + + auto map_type = + std::make_shared(SchemaField(10, "key", map_key_struct, false), + SchemaField(11, "value", map_value_struct, false)); + + auto list_element_struct = std::make_shared(std::vector{ + SchemaField(15, "x", int64(), false), SchemaField(16, "y", int64(), false)}); + + auto list_type = + std::make_shared(SchemaField(14, "element", list_element_struct, true)); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("locations", map_type, "map of address to coordinate") + .AddColumn("points", list_type, "2-D cartesian points"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Reload and add fields to nested structs + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update + ->AddColumn("locations", "alt", float32(), "altitude") // add to map value + .AddColumn("points", "z", int64(), "z coordinate"); // add to list element + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify map value has new field + ICEBERG_UNWRAP_OR_FAIL(auto locations_opt, result.schema->FindFieldByName("locations")); + ASSERT_TRUE(locations_opt.has_value()); + const auto& locations_field = locations_opt->get(); + ASSERT_EQ(locations_field.type()->type_id(), TypeId::kMap); + + const auto& map = static_cast(*locations_field.type()); + const auto& value_struct = static_cast(*map.value().type()); + ICEBERG_UNWRAP_OR_FAIL(auto alt_opt, value_struct.GetFieldByName("alt")); + ASSERT_TRUE(alt_opt.has_value()); + EXPECT_EQ(alt_opt->get().type(), float32()); + + // Verify list element has new field + ICEBERG_UNWRAP_OR_FAIL(auto points_opt, result.schema->FindFieldByName("points")); + ASSERT_TRUE(points_opt.has_value()); + const auto& points_field = points_opt->get(); + ASSERT_EQ(points_field.type()->type_id(), TypeId::kList); + + const auto& list = static_cast(*points_field.type()); + const auto& element_struct = static_cast(*list.element().type()); + ICEBERG_UNWRAP_OR_FAIL(auto z_opt, element_struct.GetFieldByName("z")); + ASSERT_TRUE(z_opt.has_value()); + EXPECT_EQ(z_opt->get().type(), int64()); +} + +// Test adding nested struct with ID reassignment +TEST_F(UpdateSchemaTest, AddNestedStructWithIdReassignment) { + // Create a struct with conflicting IDs (will be reassigned) + auto nested_struct = std::make_shared(std::vector{ + SchemaField(1, "lat", int32(), false), // ID 1 conflicts with existing schema + SchemaField(2, "long", int32(), true)}); // ID 2 conflicts with existing schema + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("location", nested_struct); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the struct was added with reassigned IDs + ICEBERG_UNWRAP_OR_FAIL(auto location_opt, result.schema->FindFieldByName("location")); + ASSERT_TRUE(location_opt.has_value()); + + const auto& location_field = location_opt->get(); + ASSERT_TRUE(location_field.type()->is_nested()); + + const auto& struct_type = static_cast(*location_field.type()); + ASSERT_EQ(struct_type.fields().size(), 2); + + // IDs should be reassigned to avoid conflicts + ICEBERG_UNWRAP_OR_FAIL(auto lat_opt, struct_type.GetFieldByName("lat")); + ICEBERG_UNWRAP_OR_FAIL(auto long_opt, struct_type.GetFieldByName("long")); + + ASSERT_TRUE(lat_opt.has_value()); + ASSERT_TRUE(long_opt.has_value()); + + // IDs should be > 1 (reassigned) + EXPECT_GT(lat_opt->get().field_id(), 1); + EXPECT_GT(long_opt->get().field_id(), 1); +} + +// Test adding nested map of structs with ID reassignment +TEST_F(UpdateSchemaTest, AddNestedMapOfStructs) { + auto key_struct = std::make_shared(std::vector{ + SchemaField(20, "address", string(), false), + SchemaField(21, "city", string(), false), SchemaField(22, "state", string(), false), + SchemaField(23, "zip", int32(), false)}); + + auto value_struct = std::make_shared(std::vector{ + SchemaField(9, "lat", int32(), false), SchemaField(8, "long", int32(), true)}); + + auto map_type = std::make_shared(SchemaField(1, "key", key_struct, false), + SchemaField(2, "value", value_struct, true)); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("locations", map_type); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the map was added with reassigned IDs + ICEBERG_UNWRAP_OR_FAIL(auto locations_opt, result.schema->FindFieldByName("locations")); + ASSERT_TRUE(locations_opt.has_value()); + + const auto& locations_field = locations_opt->get(); + ASSERT_EQ(locations_field.type()->type_id(), TypeId::kMap); + + const auto& map = static_cast(*locations_field.type()); + + // Verify key struct fields + const auto& key_struct_type = static_cast(*map.key().type()); + EXPECT_EQ(key_struct_type.fields().size(), 4); + + // Verify value struct fields + const auto& value_struct_type = static_cast(*map.value().type()); + EXPECT_EQ(value_struct_type.fields().size(), 2); + + ICEBERG_UNWRAP_OR_FAIL(auto lat_opt, value_struct_type.GetFieldByName("lat")); + ICEBERG_UNWRAP_OR_FAIL(auto long_opt, value_struct_type.GetFieldByName("long")); + + ASSERT_TRUE(lat_opt.has_value()); + ASSERT_TRUE(long_opt.has_value()); +} + +// Test adding nested list of structs with ID reassignment +TEST_F(UpdateSchemaTest, AddNestedListOfStructs) { + auto element_struct = std::make_shared(std::vector{ + SchemaField(9, "lat", int32(), false), SchemaField(8, "long", int32(), true)}); + + auto list_type = + std::make_shared(SchemaField(1, "element", element_struct, true)); + + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("locations", list_type); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the list was added with reassigned IDs + ICEBERG_UNWRAP_OR_FAIL(auto locations_opt, result.schema->FindFieldByName("locations")); + ASSERT_TRUE(locations_opt.has_value()); + + const auto& locations_field = locations_opt->get(); + ASSERT_EQ(locations_field.type()->type_id(), TypeId::kList); + + const auto& list = static_cast(*locations_field.type()); + const auto& element_struct_type = + static_cast(*list.element().type()); + + EXPECT_EQ(element_struct_type.fields().size(), 2); + + ICEBERG_UNWRAP_OR_FAIL(auto lat_opt, element_struct_type.GetFieldByName("lat")); + ICEBERG_UNWRAP_OR_FAIL(auto long_opt, element_struct_type.GetFieldByName("long")); + + ASSERT_TRUE(lat_opt.has_value()); + ASSERT_TRUE(long_opt.has_value()); +} + +// Test adding field with dots in name to nested struct +TEST_F(UpdateSchemaTest, AddFieldWithDotsInName) { + // First add a struct column + auto struct_type = std::make_shared( + std::vector{SchemaField(100, "field1", int32(), true)}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Add a field with dots in its name to the nested struct + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("struct_col", "field.with.dots", int64(), "Field with dots in name"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the field with dots was added + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("struct_col")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + const auto& nested_struct = static_cast(*struct_field.type()); + + ICEBERG_UNWRAP_OR_FAIL(auto dotted_field_opt, + nested_struct.GetFieldByName("field.with.dots")); + ASSERT_TRUE(dotted_field_opt.has_value()); + EXPECT_EQ(dotted_field_opt->get().name(), "field.with.dots"); + EXPECT_EQ(dotted_field_opt->get().type(), int64()); +} + +// Test adding field to map key should fail +TEST_F(UpdateSchemaTest, AddFieldToMapKeyFails) { + // Create a map with struct key + auto key_struct = std::make_shared( + std::vector{SchemaField(20, "address", string(), false)}); + + auto value_struct = std::make_shared( + std::vector{SchemaField(12, "lat", float32(), false)}); + + auto map_type = + std::make_shared(SchemaField(10, "key", key_struct, false), + SchemaField(11, "value", value_struct, false)); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("locations", map_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Try to add field to map key (should fail) + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("locations.key", "city", string(), "Should fail"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot add fields to map keys")); +} + +// Test deleting a column +TEST_F(UpdateSchemaTest, DeleteColumn) { + // First add a column + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("to_delete", string(), "A column to delete"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete the column + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("to_delete"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the column was deleted + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("to_delete")); + EXPECT_FALSE(field_opt.has_value()); +} + +// Test deleting a nested column +TEST_F(UpdateSchemaTest, DeleteNestedColumn) { + // First add a struct with nested fields + auto struct_type = std::make_shared( + std::vector{SchemaField(100, "field1", int32(), true), + SchemaField(101, "field2", string(), true)}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete one of the nested fields + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("struct_col.field1"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify field1 was deleted but field2 still exists + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("struct_col")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + const auto& nested_struct = static_cast(*struct_field.type()); + + ICEBERG_UNWRAP_OR_FAIL(auto field1_opt, nested_struct.GetFieldByName("field1")); + ICEBERG_UNWRAP_OR_FAIL(auto field2_opt, nested_struct.GetFieldByName("field2")); + + EXPECT_FALSE(field1_opt.has_value()); + EXPECT_TRUE(field2_opt.has_value()); +} + +// Test deleting missing column fails +TEST_F(UpdateSchemaTest, DeleteMissingColumnFails) { + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->DeleteColumn("non_existent"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot delete missing column")); +} + +// Test delete then add same column +TEST_F(UpdateSchemaTest, DeleteThenAdd) { + // First add a required column + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AllowIncompatibleChanges().AddRequiredColumn("col", int32(), + "Required column"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete then add with different type and optional + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("col").AddColumn("col", string(), "Now optional string"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the column was re-added with new properties + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, result.schema->FindFieldByName("col")); + ASSERT_TRUE(field_opt.has_value()); + + const auto& field = field_opt->get(); + EXPECT_EQ(field.type(), string()); + EXPECT_TRUE(field.optional()); +} + +// Test delete then add nested field +TEST_F(UpdateSchemaTest, DeleteThenAddNested) { + // First add a struct with a field + auto struct_type = std::make_shared( + std::vector{SchemaField(100, "field1", boolean(), false)}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete then re-add the nested field with different type + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("struct_col.field1") + .AddColumn("struct_col", "field1", int32(), "Re-added field"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the field was re-added + ICEBERG_UNWRAP_OR_FAIL(auto struct_field_opt, + result.schema->FindFieldByName("struct_col")); + ASSERT_TRUE(struct_field_opt.has_value()); + + const auto& struct_field = struct_field_opt->get(); + const auto& nested_struct = static_cast(*struct_field.type()); + + ICEBERG_UNWRAP_OR_FAIL(auto field1_opt, nested_struct.GetFieldByName("field1")); + ASSERT_TRUE(field1_opt.has_value()); + EXPECT_EQ(field1_opt->get().type(), int32()); +} + +// Test add-delete conflict +TEST_F(UpdateSchemaTest, AddDeleteConflict) { + // Try to delete a newly added column (should fail - column doesn't exist in schema) + ICEBERG_UNWRAP_OR_FAIL(auto update, table_->NewUpdateSchema()); + update->AddColumn("new_col", int32()).DeleteColumn("new_col"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot delete missing column")); +} + +// Test delete column that has additions fails +TEST_F(UpdateSchemaTest, DeleteColumnWithAdditionsFails) { + // First add a struct + auto struct_type = std::make_shared( + std::vector{SchemaField(100, "field1", int32(), true)}); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("struct_col", struct_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Try to add a field to the struct and delete it in the same update + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->AddColumn("struct_col", "field2", string()).DeleteColumn("struct_col"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot delete a column that has additions")); +} + +// Test delete map key fails +TEST_F(UpdateSchemaTest, DeleteMapKeyFails) { + // Create a map + auto map_type = std::make_shared(SchemaField(10, "key", string(), false), + SchemaField(11, "value", int32(), true)); + + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("map_col", map_type); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Try to delete the map key (should fail in Apply) + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->DeleteColumn("map_col.key"); + + auto result = update->Apply(); + EXPECT_THAT(result, IsError(ErrorKind::kValidationFailed)); + EXPECT_THAT(result, HasErrorMessage("Cannot delete map keys")); +} + +// Test case insensitive delete +TEST_F(UpdateSchemaTest, DeleteColumnCaseInsensitive) { + // First add a column + ICEBERG_UNWRAP_OR_FAIL(auto setup_update, table_->NewUpdateSchema()); + setup_update->AddColumn("MyColumn", string(), "A column with mixed case"); + EXPECT_THAT(setup_update->Commit(), IsOk()); + + // Delete using different case + ICEBERG_UNWRAP_OR_FAIL(auto reloaded, catalog_->LoadTable(table_ident_)); + ICEBERG_UNWRAP_OR_FAIL(auto update, reloaded->NewUpdateSchema()); + update->CaseSensitive(false).DeleteColumn("mycolumn"); + + ICEBERG_UNWRAP_OR_FAIL(auto result, update->Apply()); + + // Verify the column was deleted + ICEBERG_UNWRAP_OR_FAIL(auto field_opt, + result.schema->FindFieldByName("MyColumn", false)); + EXPECT_FALSE(field_opt.has_value()); +} + +} // namespace iceberg diff --git a/src/iceberg/type.cc b/src/iceberg/type.cc index ed10e127e..b4e876780 100644 --- a/src/iceberg/type.cc +++ b/src/iceberg/type.cc @@ -27,6 +27,7 @@ #include "iceberg/exception.h" #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep #include "iceberg/util/macros.h" #include "iceberg/util/string_util.h" @@ -151,7 +152,12 @@ ListType::ListType(int32_t field_id, std::shared_ptr type, bool optional) : element_(field_id, std::string(kElementName), std::move(type), optional) {} TypeId ListType::type_id() const { return kTypeId; } -std::string ListType::ToString() const { return std::format("list<{}>", element_); } + +const SchemaField& ListType::element() const { return element_; } + +std::string ListType::ToString() const { + return std::vformat("list<{}>", std::make_format_args(element_)); +} std::span ListType::fields() const { return {&element_, 1}; } Result> ListType::GetFieldById( @@ -207,7 +213,7 @@ const SchemaField& MapType::value() const { return fields_[1]; } TypeId MapType::type_id() const { return kTypeId; } std::string MapType::ToString() const { - return std::format("map<{}: {}>", key(), value()); + return std::vformat("map<{}: {}>", std::make_format_args(key(), value())); } std::span MapType::fields() const { return fields_; } diff --git a/src/iceberg/type.h b/src/iceberg/type.h index 7e17b78d4..1c50135dc 100644 --- a/src/iceberg/type.h +++ b/src/iceberg/type.h @@ -155,6 +155,7 @@ class ICEBERG_EXPORT ListType : public NestedType { ~ListType() override = default; TypeId type_id() const override; + const SchemaField& element() const; std::string ToString() const override; std::span fields() const override; diff --git a/src/iceberg/update/update_schema.cc b/src/iceberg/update/update_schema.cc index 14b962bd8..402cfb472 100644 --- a/src/iceberg/update/update_schema.cc +++ b/src/iceberg/update/update_schema.cc @@ -19,6 +19,7 @@ #include "iceberg/update/update_schema.h" +#include #include #include #include @@ -26,16 +27,204 @@ #include #include #include +#include #include "iceberg/schema.h" +#include "iceberg/schema_field.h" #include "iceberg/table_metadata.h" #include "iceberg/transaction.h" #include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/error_collector.h" #include "iceberg/util/macros.h" +#include "iceberg/util/type_util.h" +#include "iceberg/util/visit_type.h" namespace iceberg { +namespace { +constexpr int32_t kTableRootId = -1; + +/// \brief Visitor for applying schema changes recursively to nested types +class ApplyChangesVisitor { + public: + ApplyChangesVisitor( + const std::unordered_set& deletes, + const std::unordered_map>& updates, + const std::unordered_map>& parent_to_added_ids) + : deletes_(deletes), updates_(updates), parent_to_added_ids_(parent_to_added_ids) {} + + /// \brief Apply changes to a type using schema visitor pattern + Result> ApplyChanges(const std::shared_ptr& type, + int32_t parent_id) { + return VisitSchemaInline(*type, this, type, parent_id); + } + + /// \brief Apply changes to a struct type + Result> VisitStruct(const StructType& struct_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + std::vector new_fields; + + // Process existing fields + for (const auto& field : struct_type.fields()) { + // Recursively process the field's type first + ICEBERG_ASSIGN_OR_RAISE(auto field_type_result, + ApplyChanges(field.type(), field.field_id())); + + // Process field-level changes (deletes, updates, nested additions) + ICEBERG_ASSIGN_OR_RAISE(auto processed_field, + ProcessField(field, field_type_result)); + + if (processed_field.has_value()) { + new_fields.push_back(std::move(processed_field.value())); + } + } + + // Add new fields for this struct + auto adds_it = parent_to_added_ids_.find(parent_id); + if (adds_it != parent_to_added_ids_.end()) { + for (int32_t added_id : adds_it->second) { + auto added_field_it = updates_.find(added_id); + if (added_field_it != updates_.end()) { + new_fields.push_back(*added_field_it->second); + } + } + } + + return std::make_shared(std::move(new_fields)); + } + + /// \brief Apply changes to a list type + Result> VisitList(const ListType& list_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + const auto& element = list_type.element(); + + // Recursively process element type + ICEBERG_ASSIGN_OR_RAISE(auto element_type_result, + ApplyChanges(element.type(), element.field_id())); + + // Process element field (handles deletes, updates, nested additions) + ICEBERG_ASSIGN_OR_RAISE(auto processed_element, + ProcessField(element, element_type_result)); + + ICEBERG_CHECK(processed_element.has_value(), + "Cannot delete element field from list: {}", list_type.ToString()); + + const auto& new_element = processed_element.value(); + + // Return unchanged if element didn't change + if (element == new_element) { + return base_type; + } + + return std::make_shared(new_element); + } + + /// \brief Apply changes to a map type + Result> VisitMap(const MapType& map_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + const auto& key = map_type.key(); + const auto& value = map_type.value(); + + // Check for key modifications (not allowed in Iceberg) + int32_t key_id = key.field_id(); + ICEBERG_CHECK(!deletes_.contains(key_id), "Cannot delete map keys"); + ICEBERG_CHECK(!updates_.contains(key_id), "Cannot update map keys"); + ICEBERG_CHECK(!parent_to_added_ids_.contains(key_id), + "Cannot add fields to map keys"); + + // Recursively process key and value types + ICEBERG_ASSIGN_OR_RAISE(auto key_type_result, ApplyChanges(key.type(), key_id)); + ICEBERG_ASSIGN_OR_RAISE(auto value_type_result, + ApplyChanges(value.type(), value.field_id())); + + // Key type must not change + ICEBERG_CHECK(*key_type_result == *key.type(), "Cannot alter map keys"); + + // Process value field (handles deletes, updates, nested additions) + ICEBERG_ASSIGN_OR_RAISE(auto processed_value, ProcessField(value, value_type_result)); + + ICEBERG_CHECK(processed_value.has_value(), "Cannot delete value field from map: {}", + map_type.ToString()); + + const auto& new_value = processed_value.value(); + + // Return unchanged if nothing changed + if (key == map_type.key() && value == new_value) { + return base_type; + } + + return std::make_shared(key, new_value); + } + + /// \brief Handle primitive types - return unchanged + Result> VisitPrimitive(const PrimitiveType& primitive_type, + const std::shared_ptr& base_type, + int32_t parent_id) { + // Primitive types are returned as-is + return base_type; + } + + private: + /// \brief Process a field: handle deletes, updates, and nested additions + /// + /// It processes field-level operations after the field's type has been recursively + /// processed. + Result> ProcessField( + const SchemaField& field, const std::shared_ptr& field_type_result) { + int32_t field_id = field.field_id(); + + // 1. Handle deletes + if (deletes_.contains(field_id)) { + // Field is deleted + return std::nullopt; + } + + // 2. Start with the recursively processed type + std::shared_ptr result_type = field_type_result; + + // 3. Handle type updates (e.g., type widening) + // Note: We check the update against the ORIGINAL field type, not the recursively + // processed type, because we want to preserve nested changes from recursion + auto update_it = updates_.find(field_id); + if (update_it != updates_.end()) { + const auto& update_field = update_it->second; + // If the update specifies a type change, use the new type + // Otherwise keep the recursively processed type + if (update_field->type() != field.type()) { + result_type = update_field->type(); + } + } + + // Note: Nested field additions are handled in VisitStruct, not here + // to avoid duplication + + // 4. Build the result field + if (update_it != updates_.end()) { + // Use update field metadata but with the processed type + const auto& update_field = update_it->second; + return SchemaField(field_id, update_field->name(), std::move(result_type), + update_field->optional(), update_field->doc()); + } else if (result_type != field.type()) { + // Type changed but no field-level update + return SchemaField(field_id, field.name(), std::move(result_type), field.optional(), + field.doc()); + } else { + // No changes + return field; + } + } + + const std::unordered_set& deletes_; + const std::unordered_map>& updates_; + const std::unordered_map>& parent_to_added_ids_; +}; + +} // namespace + Result> UpdateSchema::Make( std::shared_ptr transaction) { ICEBERG_PRECHECK(transaction != nullptr, @@ -64,8 +253,10 @@ UpdateSchema::UpdateSchema(std::shared_ptr transaction) AddError(identifier_names_result.error()); return; } - identifier_field_names_ = identifier_names_result.value() | - std::ranges::to>(); + identifier_field_names_ = identifier_names_result.value(); + + // Initialize id_to_parent map from the schema + id_to_parent_ = IndexParents(*schema_); } UpdateSchema::~UpdateSchema() = default; @@ -132,16 +323,6 @@ UpdateSchema& UpdateSchema::UpdateColumnDoc(std::string_view name, return *this; } -UpdateSchema& UpdateSchema::AddColumnInternal(std::optional parent, - std::string_view name, bool is_optional, - std::shared_ptr type, - std::string_view doc) { - // TODO(Guotao Yu): Implement AddColumnInternal logic - // This is where the real work happens - finding parent, validating, etc. - AddError(NotImplemented("UpdateSchema::AddColumnInternal not implemented")); - return *this; -} - UpdateSchema& UpdateSchema::RenameColumn(std::string_view name, std::string_view new_name) { // TODO(Guotao Yu): Implement RenameColumn @@ -162,8 +343,23 @@ UpdateSchema& UpdateSchema::RequireColumn(std::string_view name) { } UpdateSchema& UpdateSchema::DeleteColumn(std::string_view name) { - // TODO(Guotao Yu): Implement DeleteColumn - AddError(NotImplemented("UpdateSchema::DeleteColumn not implemented")); + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto field_opt, FindField(name)); + ICEBERG_BUILDER_CHECK(field_opt.has_value(), "Cannot delete missing column: {}", name); + + const auto& field = field_opt->get(); + int32_t field_id = field.field_id(); + + // Check the field doesn't have additions + ICEBERG_BUILDER_CHECK(!parent_to_added_ids_.contains(field_id), + "Cannot delete a column that has additions: {}", name); + + // Check the field doesn't have updates + ICEBERG_BUILDER_CHECK(!updates_.contains(field_id), + "Cannot delete a column that has updates: {}", name); + + // Add to deletes set + deletes_.insert(field_id); + return *this; } @@ -195,13 +391,191 @@ UpdateSchema& UpdateSchema::UnionByNameWith(std::shared_ptr new_schema) UpdateSchema& UpdateSchema::SetIdentifierFields( const std::span& names) { - identifier_field_names_ = names | std::ranges::to>(); + identifier_field_names_ = names | std::ranges::to>(); return *this; } Result UpdateSchema::Apply() { - // TODO(Guotao Yu): Implement Apply - return NotImplemented("UpdateSchema::Apply not implemented"); + ICEBERG_RETURN_UNEXPECTED(CheckErrors()); + + // Validate existing identifier fields are not deleted + for (const auto& name : identifier_field_names_) { + ICEBERG_ASSIGN_OR_RAISE(auto field_opt, FindField(name)); + if (field_opt.has_value()) { + const auto& field = field_opt->get(); + auto field_id = field.field_id(); + + // Check the field itself is not deleted + ICEBERG_CHECK(!deletes_.contains(field_id), + "Cannot delete identifier field {}. To force deletion, also call " + "SetIdentifierFields to update identifier fields.", + name); + + // Check no parent of this field is deleted + auto parent_it = id_to_parent_.find(field_id); + while (parent_it != id_to_parent_.end()) { + int32_t parent_id = parent_it->second; + ICEBERG_ASSIGN_OR_RAISE(auto parent_field_opt, schema_->FindFieldById(parent_id)); + ICEBERG_CHECK( + !deletes_.contains(parent_id), + "Cannot delete field {} as it will delete nested identifier field {}", + parent_field_opt.has_value() ? std::string(parent_field_opt->get().name()) + : std::to_string(parent_id), + name); + parent_it = id_to_parent_.find(parent_id); + } + } + } + + // Apply schema changes using visitor pattern + // Create a temporary struct type from the schema to use with the visitor + auto schema_struct_type = std::make_shared( + schema_->fields() | std::ranges::to>()); + + // Apply changes recursively using the visitor + ApplyChangesVisitor visitor(deletes_, updates_, parent_to_added_ids_); + ICEBERG_ASSIGN_OR_RAISE(auto new_type, + visitor.ApplyChanges(schema_struct_type, kTableRootId)); + + // Cast result back to StructType and extract fields + auto new_struct_type = internal::checked_pointer_cast(new_type); + std::vector new_fields(new_struct_type->fields() | + std::ranges::to>()); + + // Convert identifier field names to IDs + auto temp_schema = std::make_shared(new_fields); + std::vector fresh_identifier_ids; + for (const auto& name : identifier_field_names_) { + ICEBERG_ASSIGN_OR_RAISE(auto field_opt, + temp_schema->FindFieldByName(name, case_sensitive_)); + ICEBERG_CHECK(field_opt.has_value(), + "Cannot add field {} as an identifier field: not found in current " + "schema or added columns", + name); + fresh_identifier_ids.push_back(field_opt->get().field_id()); + } + + // Create the new schema + ICEBERG_ASSIGN_OR_RAISE( + auto new_schema, + Schema::Make(std::move(new_fields), schema_->schema_id(), fresh_identifier_ids)); + + return ApplyResult{.schema = std::move(new_schema), + .new_last_column_id = last_column_id_}; +} + +UpdateSchema& UpdateSchema::AddColumnInternal(std::optional parent, + std::string_view name, bool is_optional, + std::shared_ptr type, + std::string_view doc) { + int32_t parent_id = kTableRootId; + std::string full_name; + + // Handle parent field + if (parent.has_value() && !parent->empty()) { + // Find parent field + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto parent_field_opt, FindField(*parent)); + ICEBERG_BUILDER_CHECK(parent_field_opt.has_value(), "Cannot find parent struct: {}", + *parent); + + const SchemaField& parent_field = parent_field_opt->get(); + const auto& parent_type = parent_field.type(); + + // Get the actual field to add to (handle map/list) + const SchemaField* target_field = &parent_field; + if (parent_type->is_nested()) { + const auto& nested = internal::checked_cast(*parent_type); + if (nested.type_id() == TypeId::kMap) { + // For maps, add to value field + const auto& map_type = internal::checked_cast(nested); + target_field = &map_type.value(); + } else if (nested.type_id() == TypeId::kList) { + // For lists, add to element field + const auto& list_type = internal::checked_cast(nested); + target_field = &list_type.element(); + } + } + + // Validate target is a struct + ICEBERG_BUILDER_CHECK(target_field->type()->is_nested() && + target_field->type()->type_id() == TypeId::kStruct, + "Cannot add to non-struct column: {}: {}", *parent, + target_field->type()->ToString()); + + parent_id = target_field->field_id(); + + // Check parent is not being deleted + ICEBERG_BUILDER_CHECK(!deletes_.contains(parent_id), + "Cannot add to a column that will be deleted: {}", *parent); + + // Check field doesn't already exist (unless it's being deleted) + std::string nested_name = std::format("{}.{}", *parent, name); + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto current_field_opt, FindField(nested_name)); + if (current_field_opt.has_value()) { + const auto& current_field = current_field_opt->get(); + ICEBERG_BUILDER_CHECK(deletes_.contains(current_field.field_id()), + "Cannot add column, name already exists: {}.{}", *parent, + name); + } + + // Build full name using canonical name of parent + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto parent_name_opt, + schema_->FindColumnNameById(parent_id)); + ICEBERG_BUILDER_CHECK(parent_name_opt.has_value(), + "Cannot find column name for parent id: {}", parent_id); + full_name = std::format("{}.{}", *parent_name_opt, name); + } else { + // Top-level field + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto current_field_opt, FindField(name)); + if (current_field_opt.has_value()) { + const auto& current_field = current_field_opt->get(); + ICEBERG_BUILDER_CHECK(deletes_.contains(current_field.field_id()), + "Cannot add column, name already exists: {}", name); + } + full_name = std::string(name); + } + + // V3 supports default values, but this implementation doesn't support them yet + // Check for incompatible change: adding required column without default + ICEBERG_BUILDER_CHECK( + is_optional || allow_incompatible_changes_, + "Incompatible change: cannot add required column without a default value: {}", + full_name); + + // Assign new column ID + int32_t new_id = AssignNewColumnId(); + + // Update tracking for moves + added_name_to_id_[full_name] = new_id; + if (parent_id != kTableRootId) { + id_to_parent_[new_id] = parent_id; + } + + // Assign fresh IDs to nested types + AssignFreshIdVisitor id_assigner([this]() { return AssignNewColumnId(); }); + auto type_with_fresh_ids = id_assigner.Visit(type); + + // Create new field + auto new_field = std::make_shared(new_id, std::string(name), + std::move(type_with_fresh_ids), + is_optional, std::string(doc)); + + // Record the update + updates_[new_id] = std::move(new_field); + parent_to_added_ids_[parent_id].push_back(new_id); + + return *this; +} + +int32_t UpdateSchema::AssignNewColumnId() { + int32_t next = last_column_id_ + 1; + last_column_id_ = next; + return next; +} + +Result>> UpdateSchema::FindField( + std::string_view name) const { + return schema_->FindFieldByName(name, case_sensitive_); } } // namespace iceberg diff --git a/src/iceberg/update/update_schema.h b/src/iceberg/update/update_schema.h index bed2bfeb2..d68c291d8 100644 --- a/src/iceberg/update/update_schema.h +++ b/src/iceberg/update/update_schema.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "iceberg/iceberg_export.h" @@ -345,12 +346,31 @@ class ICEBERG_EXPORT UpdateSchema : public PendingUpdate { std::string_view name, bool is_optional, std::shared_ptr type, std::string_view doc); + /// \brief Assign a new column ID and increment the counter. + int32_t AssignNewColumnId(); + + /// \brief Find a field by name using case-sensitive or case-insensitive search. + Result>> FindField( + std::string_view name) const; + // Internal state std::shared_ptr schema_; int32_t last_column_id_; bool allow_incompatible_changes_{false}; bool case_sensitive_{true}; - std::unordered_set identifier_field_names_; + std::vector identifier_field_names_; + + // Tracking changes + // field ID -> parent field ID + std::unordered_map id_to_parent_; + // field IDs to delete + std::unordered_set deletes_; + // field ID -> updated field + std::unordered_map> updates_; + // parent ID -> added child IDs + std::unordered_map> parent_to_added_ids_; + // full name -> field ID for added fields + std::unordered_map added_name_to_id_; }; } // namespace iceberg diff --git a/src/iceberg/util/visit_type.h b/src/iceberg/util/visit_type.h index ce734e8f0..d81c36356 100644 --- a/src/iceberg/util/visit_type.h +++ b/src/iceberg/util/visit_type.h @@ -124,4 +124,40 @@ inline Status VisitTypeIdInline(TypeId id, VISITOR* visitor, ARGS&&... args) { #undef TYPE_ID_VISIT_INLINE +/// \brief Visit a type using a schema visitor pattern +/// +/// This function provides a simplified visitor interface that groups Iceberg types into +/// four categories based on their structural properties: +/// +/// - **Struct types**: Complex types with named fields (StructType) +/// - **List types**: Sequential container types (ListType) +/// - **Map types**: Key-value container types (MapType) +/// - **Primitive types**: All leaf types without nested structure (14 primitive types) +/// +/// This grouping is useful for algorithms that need to distinguish between container +/// types and leaf types, but don't require separate handling for each primitive type +/// variant (e.g., Int vs Long vs String). +/// +/// \tparam VISITOR Visitor class that must implement four Visit methods: +/// - `VisitStruct(const StructType&, ARGS...)` for struct types +/// - `VisitList(const ListType&, ARGS...)` for list types +/// - `VisitMap(const MapType&, ARGS...)` for map types +/// - `VisitPrimitive(const PrimitiveType&, ARGS...)` for all primitive types +/// \tparam ARGS Additional argument types forwarded to Visit methods +/// \param type The type to visit +/// \param visitor Pointer to the visitor instance +/// \param args Additional arguments forwarded to the Visit methods +/// \return The return value from the invoked Visit method +template +inline auto VisitSchemaInline(const Type& type, VISITOR* visitor, ARGS&&... args) { +#define SCHEMA_VISIT_ACTION(TYPE_CLASS) \ + return visitor->Visit##TYPE_CLASS( \ + internal::checked_cast(type), \ + std::forward(args)...); + + switch (type.type_id()) { ICEBERG_GENERATE_SCHEMA_VISITOR_CASES(SCHEMA_VISIT_ACTION) } + +#undef SCHEMA_VISIT_ACTION +} + } // namespace iceberg diff --git a/src/iceberg/util/visitor_generate.h b/src/iceberg/util/visitor_generate.h index 2ea8282cb..b4c6dc877 100644 --- a/src/iceberg/util/visitor_generate.h +++ b/src/iceberg/util/visitor_generate.h @@ -40,4 +40,21 @@ namespace iceberg { ACTION(List); \ ACTION(Map); +/// \brief Generate switch-case for schema visitor pattern +/// +/// This macro generates switch cases that dispatch to visitor methods based on type: +/// - Struct types -> calls ACTION with Struct +/// - List types -> calls ACTION with List +/// - Map types -> calls ACTION with Map +/// - All primitive types (default) -> calls ACTION with Primitive +#define ICEBERG_GENERATE_SCHEMA_VISITOR_CASES(ACTION) \ + case ::iceberg::TypeId::kStruct: \ + ACTION(Struct) \ + case ::iceberg::TypeId::kList: \ + ACTION(List) \ + case ::iceberg::TypeId::kMap: \ + ACTION(Map) \ + default: \ + ACTION(Primitive) + } // namespace iceberg From 585ec9061b949d226147fccc388b61d9d5a7c3ac Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Sun, 4 Jan 2026 22:27:07 +0800 Subject: [PATCH 2/5] fix ut failure --- src/iceberg/json_internal.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index a37bfd89b..c0f33d375 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -227,7 +227,9 @@ nlohmann::json ToJson(const SchemaField& field) { json[kName] = field.name(); json[kRequired] = !field.optional(); json[kType] = ToJson(*field.type()); - json[kDoc] = field.doc(); + if (!field.doc().empty()) { + json[kDoc] = field.doc(); + } return json; } From f79e7b34f1684989c37a01dc3ab4c5e5c251a274 Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Mon, 5 Jan 2026 00:16:42 +0800 Subject: [PATCH 3/5] fix cpp-linter --- src/iceberg/test/update_schema_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/iceberg/test/update_schema_test.cc b/src/iceberg/test/update_schema_test.cc index 97d3b15b0..ce3c77ccd 100644 --- a/src/iceberg/test/update_schema_test.cc +++ b/src/iceberg/test/update_schema_test.cc @@ -315,8 +315,6 @@ TEST_F(UpdateSchemaTest, CommitSuccess) { const auto& field = field_opt->get(); EXPECT_EQ(field.name(), "committed_col"); - auto type_id = field.type()->type_id(); - EXPECT_EQ(*field.type(), *int64()); EXPECT_TRUE(field.optional()); EXPECT_EQ(field.doc(), "A committed column"); From e5b1be3e6b48bcdc7ddfdcd2215f21cc4ab7abc2 Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Mon, 5 Jan 2026 09:49:07 +0800 Subject: [PATCH 4/5] fix meson build error --- src/iceberg/test/meson.build | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index 8f7821110..378182819 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -93,15 +93,6 @@ iceberg_tests = { ), }, 'roaring_test': {'sources': files('roaring_test.cc')}, - 'table_update_test': { - 'sources': files( - 'transaction_test.cc', - 'update_partition_spec_test.cc', - 'update_properties_test.cc', - 'update_schema_test.cc', - 'update_sort_order_test.cc', - ), - }, } if get_option('rest').enabled() From d026845e04e783c379fa61d0db58059de41d7d35 Mon Sep 17 00:00:00 2001 From: Guotao Yu Date: Tue, 6 Jan 2026 18:04:39 +0800 Subject: [PATCH 5/5] resolve some comments --- src/iceberg/update/update_schema.cc | 104 +++++++++++++--------------- src/iceberg/util/visit_type.h | 8 ++- src/iceberg/util/visitor_generate.h | 16 ++--- 3 files changed, 62 insertions(+), 66 deletions(-) diff --git a/src/iceberg/update/update_schema.cc b/src/iceberg/update/update_schema.cc index 402cfb472..0e81c4ad7 100644 --- a/src/iceberg/update/update_schema.cc +++ b/src/iceberg/update/update_schema.cc @@ -57,7 +57,7 @@ class ApplyChangesVisitor { /// \brief Apply changes to a type using schema visitor pattern Result> ApplyChanges(const std::shared_ptr& type, int32_t parent_id) { - return VisitSchemaInline(*type, this, type, parent_id); + return VisitTypeCategory(*type, this, type, parent_id); } /// \brief Apply changes to a struct type @@ -65,6 +65,7 @@ class ApplyChangesVisitor { const std::shared_ptr& base_type, int32_t parent_id) { std::vector new_fields; + bool has_changes = false; // Process existing fields for (const auto& field : struct_type.fields()) { @@ -77,13 +78,23 @@ class ApplyChangesVisitor { ProcessField(field, field_type_result)); if (processed_field.has_value()) { - new_fields.push_back(std::move(processed_field.value())); + const auto& new_field = processed_field.value(); + new_fields.push_back(new_field); + + // Check if this field changed + if (new_field != field) { + has_changes = true; + } + } else { + // Field was deleted + has_changes = true; } } // Add new fields for this struct auto adds_it = parent_to_added_ids_.find(parent_id); - if (adds_it != parent_to_added_ids_.end()) { + if (adds_it != parent_to_added_ids_.end() && !adds_it->second.empty()) { + has_changes = true; for (int32_t added_id : adds_it->second) { auto added_field_it = updates_.find(added_id); if (added_field_it != updates_.end()) { @@ -92,6 +103,11 @@ class ApplyChangesVisitor { } } + // Return original type if nothing changed + if (!has_changes) { + return base_type; + } + return std::make_shared(std::move(new_fields)); } @@ -253,7 +269,7 @@ UpdateSchema::UpdateSchema(std::shared_ptr transaction) AddError(identifier_names_result.error()); return; } - identifier_field_names_ = identifier_names_result.value(); + identifier_field_names_ = std::move(identifier_names_result.value()); // Initialize id_to_parent map from the schema id_to_parent_ = IndexParents(*schema_); @@ -349,11 +365,8 @@ UpdateSchema& UpdateSchema::DeleteColumn(std::string_view name) { const auto& field = field_opt->get(); int32_t field_id = field.field_id(); - // Check the field doesn't have additions ICEBERG_BUILDER_CHECK(!parent_to_added_ids_.contains(field_id), "Cannot delete a column that has additions: {}", name); - - // Check the field doesn't have updates ICEBERG_BUILDER_CHECK(!updates_.contains(field_id), "Cannot delete a column that has updates: {}", name); @@ -405,7 +418,6 @@ Result UpdateSchema::Apply() { const auto& field = field_opt->get(); auto field_id = field.field_id(); - // Check the field itself is not deleted ICEBERG_CHECK(!deletes_.contains(field_id), "Cannot delete identifier field {}. To force deletion, also call " "SetIdentifierFields to update identifier fields.", @@ -415,35 +427,24 @@ Result UpdateSchema::Apply() { auto parent_it = id_to_parent_.find(field_id); while (parent_it != id_to_parent_.end()) { int32_t parent_id = parent_it->second; - ICEBERG_ASSIGN_OR_RAISE(auto parent_field_opt, schema_->FindFieldById(parent_id)); ICEBERG_CHECK( !deletes_.contains(parent_id), - "Cannot delete field {} as it will delete nested identifier field {}", - parent_field_opt.has_value() ? std::string(parent_field_opt->get().name()) - : std::to_string(parent_id), - name); + "Cannot delete field with id {} as it will delete nested identifier field {}", + parent_id, name); parent_it = id_to_parent_.find(parent_id); } } } - // Apply schema changes using visitor pattern - // Create a temporary struct type from the schema to use with the visitor - auto schema_struct_type = std::make_shared( - schema_->fields() | std::ranges::to>()); - // Apply changes recursively using the visitor ApplyChangesVisitor visitor(deletes_, updates_, parent_to_added_ids_); - ICEBERG_ASSIGN_OR_RAISE(auto new_type, - visitor.ApplyChanges(schema_struct_type, kTableRootId)); + ICEBERG_ASSIGN_OR_RAISE(auto new_type, visitor.ApplyChanges(schema_, kTableRootId)); // Cast result back to StructType and extract fields auto new_struct_type = internal::checked_pointer_cast(new_type); - std::vector new_fields(new_struct_type->fields() | - std::ranges::to>()); // Convert identifier field names to IDs - auto temp_schema = std::make_shared(new_fields); + auto temp_schema = new_struct_type->ToSchema(); std::vector fresh_identifier_ids; for (const auto& name : identifier_field_names_) { ICEBERG_ASSIGN_OR_RAISE(auto field_opt, @@ -456,6 +457,7 @@ Result UpdateSchema::Apply() { } // Create the new schema + auto new_fields = temp_schema->fields() | std::ranges::to>(); ICEBERG_ASSIGN_OR_RAISE( auto new_schema, Schema::Make(std::move(new_fields), schema_->schema_id(), fresh_identifier_ids)); @@ -464,6 +466,7 @@ Result UpdateSchema::Apply() { .new_last_column_id = last_column_id_}; } +// TODO(Guotao Yu): v3 default value is not yet supported UpdateSchema& UpdateSchema::AddColumnInternal(std::optional parent, std::string_view name, bool is_optional, std::shared_ptr type, @@ -472,7 +475,8 @@ UpdateSchema& UpdateSchema::AddColumnInternal(std::optional pa std::string full_name; // Handle parent field - if (parent.has_value() && !parent->empty()) { + if (parent.has_value()) { + ICEBERG_BUILDER_CHECK(!parent->empty(), "Parent name cannot be empty"); // Find parent field ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto parent_field_opt, FindField(*parent)); ICEBERG_BUILDER_CHECK(parent_field_opt.has_value(), "Cannot find parent struct: {}", @@ -483,22 +487,19 @@ UpdateSchema& UpdateSchema::AddColumnInternal(std::optional pa // Get the actual field to add to (handle map/list) const SchemaField* target_field = &parent_field; - if (parent_type->is_nested()) { - const auto& nested = internal::checked_cast(*parent_type); - if (nested.type_id() == TypeId::kMap) { - // For maps, add to value field - const auto& map_type = internal::checked_cast(nested); - target_field = &map_type.value(); - } else if (nested.type_id() == TypeId::kList) { - // For lists, add to element field - const auto& list_type = internal::checked_cast(nested); - target_field = &list_type.element(); - } + + if (parent_type->type_id() == TypeId::kMap) { + // For maps, add to value field + const auto& map_type = internal::checked_cast(*parent_type); + target_field = &map_type.value(); + } else if (parent_type->type_id() == TypeId::kList) { + // For lists, add to element field + const auto& list_type = internal::checked_cast(*parent_type); + target_field = &list_type.element(); } // Validate target is a struct - ICEBERG_BUILDER_CHECK(target_field->type()->is_nested() && - target_field->type()->type_id() == TypeId::kStruct, + ICEBERG_BUILDER_CHECK(target_field->type()->type_id() == TypeId::kStruct, "Cannot add to non-struct column: {}: {}", *parent, target_field->type()->ToString()); @@ -510,28 +511,25 @@ UpdateSchema& UpdateSchema::AddColumnInternal(std::optional pa // Check field doesn't already exist (unless it's being deleted) std::string nested_name = std::format("{}.{}", *parent, name); - ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto current_field_opt, FindField(nested_name)); - if (current_field_opt.has_value()) { - const auto& current_field = current_field_opt->get(); - ICEBERG_BUILDER_CHECK(deletes_.contains(current_field.field_id()), - "Cannot add column, name already exists: {}.{}", *parent, - name); - } + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto current_field, FindField(nested_name)); + ICEBERG_BUILDER_CHECK( + !current_field.has_value() || deletes_.contains(current_field->get().field_id()), + "Cannot add column, name already exists: {}.{}", *parent, name); // Build full name using canonical name of parent ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto parent_name_opt, schema_->FindColumnNameById(parent_id)); ICEBERG_BUILDER_CHECK(parent_name_opt.has_value(), "Cannot find column name for parent id: {}", parent_id); + full_name = std::format("{}.{}", *parent_name_opt, name); } else { // Top-level field - ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto current_field_opt, FindField(name)); - if (current_field_opt.has_value()) { - const auto& current_field = current_field_opt->get(); - ICEBERG_BUILDER_CHECK(deletes_.contains(current_field.field_id()), - "Cannot add column, name already exists: {}", name); - } + ICEBERG_BUILDER_ASSIGN_OR_RETURN(auto current_field, FindField(name)); + ICEBERG_BUILDER_CHECK( + !current_field.has_value() || deletes_.contains(current_field->get().field_id()), + "Cannot add column, name already exists: {}", name); + full_name = std::string(name); } @@ -567,11 +565,7 @@ UpdateSchema& UpdateSchema::AddColumnInternal(std::optional pa return *this; } -int32_t UpdateSchema::AssignNewColumnId() { - int32_t next = last_column_id_ + 1; - last_column_id_ = next; - return next; -} +int32_t UpdateSchema::AssignNewColumnId() { return ++last_column_id_; } Result>> UpdateSchema::FindField( std::string_view name) const { diff --git a/src/iceberg/util/visit_type.h b/src/iceberg/util/visit_type.h index d81c36356..bf52d2e9a 100644 --- a/src/iceberg/util/visit_type.h +++ b/src/iceberg/util/visit_type.h @@ -124,7 +124,7 @@ inline Status VisitTypeIdInline(TypeId id, VISITOR* visitor, ARGS&&... args) { #undef TYPE_ID_VISIT_INLINE -/// \brief Visit a type using a schema visitor pattern +/// \brief Visit a type using a categorical visitor pattern /// /// This function provides a simplified visitor interface that groups Iceberg types into /// four categories based on their structural properties: @@ -149,13 +149,15 @@ inline Status VisitTypeIdInline(TypeId id, VISITOR* visitor, ARGS&&... args) { /// \param args Additional arguments forwarded to the Visit methods /// \return The return value from the invoked Visit method template -inline auto VisitSchemaInline(const Type& type, VISITOR* visitor, ARGS&&... args) { +inline auto VisitTypeCategory(const Type& type, VISITOR* visitor, ARGS&&... args) { #define SCHEMA_VISIT_ACTION(TYPE_CLASS) \ return visitor->Visit##TYPE_CLASS( \ internal::checked_cast(type), \ std::forward(args)...); - switch (type.type_id()) { ICEBERG_GENERATE_SCHEMA_VISITOR_CASES(SCHEMA_VISIT_ACTION) } + switch (type.type_id()) { + ICEBERG_TYPE_SWITCH_WITH_PRIMITIVE_DEFAULT(SCHEMA_VISIT_ACTION) + } #undef SCHEMA_VISIT_ACTION } diff --git a/src/iceberg/util/visitor_generate.h b/src/iceberg/util/visitor_generate.h index b4c6dc877..053371d41 100644 --- a/src/iceberg/util/visitor_generate.h +++ b/src/iceberg/util/visitor_generate.h @@ -47,14 +47,14 @@ namespace iceberg { /// - List types -> calls ACTION with List /// - Map types -> calls ACTION with Map /// - All primitive types (default) -> calls ACTION with Primitive -#define ICEBERG_GENERATE_SCHEMA_VISITOR_CASES(ACTION) \ - case ::iceberg::TypeId::kStruct: \ - ACTION(Struct) \ - case ::iceberg::TypeId::kList: \ - ACTION(List) \ - case ::iceberg::TypeId::kMap: \ - ACTION(Map) \ - default: \ +#define ICEBERG_TYPE_SWITCH_WITH_PRIMITIVE_DEFAULT(ACTION) \ + case ::iceberg::TypeId::kStruct: \ + ACTION(Struct) \ + case ::iceberg::TypeId::kList: \ + ACTION(List) \ + case ::iceberg::TypeId::kMap: \ + ACTION(Map) \ + default: \ ACTION(Primitive) } // namespace iceberg