115 commits
ade0600
[core] Modify all row lineage to row tracking in codes (#6262)
JingsongLi Sep 16, 2025
6ffe696
[core][flink] Remove withMemoryPool in TableWriteImpl (#6265)
tsreaper Sep 16, 2025
b572f81
[doc] Fix Python code syntax errors and typo in PVFS documentation (#…
plusplusjiajia Sep 17, 2025
21328c6
[hotfix] Fix comments in MergeFileSplitRead and style
JingsongLi Sep 17, 2025
95c540a
[append] Fix error of compaction append table with deletion vectors (…
yuzelin Sep 17, 2025
4f2c4f4
[core] Primary key types should not be changed (#6264)
tsreaper Sep 17, 2025
d050f44
[spark] Avoid unnecessary repeated resolve MergeIntoTable (#6275)
Zouxxyy Sep 17, 2025
d1767f5
[iceberg] Enhance Iceberg timestamp type compatibility with extended …
plusplusjiajia Sep 18, 2025
429827e
[core] Separate indexIncrement into dataIncrement and compactIncreme…
Zouxxyy Sep 19, 2025
3b1f759
[core] Fix the commit kind when performing row-level changes on non-p…
Zouxxyy Sep 19, 2025
f3955cd
[hotfix] Minor refactor for FileStoreCommitImpl
JingsongLi Sep 19, 2025
abe882a
[spark] Resolve function after all args have been resolved (#6292)
Zouxxyy Sep 22, 2025
db8610d
[rest] Add tableType parameter support for listing tables (#6295)
plusplusjiajia Sep 22, 2025
16f0cea
[cdc] Fix cannot be cast to float due to precision loss (#6291)
lizc9 Sep 22, 2025
cee4a54
[lance] Remove shade for paimon-lance (#6310)
JingsongLi Sep 23, 2025
7b49031
[core] nested-update supports limit the input (#6309)
yuzelin Sep 23, 2025
af25ba1
[core] Cross partition can work with fixed bucket and postpone bucket…
JingsongLi Sep 23, 2025
b7f49f8
[spark] System function max_pt should be used as 'sys.max_pt' (#6312)
JingsongLi Sep 23, 2025
909a165
[python] Support reading data by splitting according to rows (#6274)
discivigour Sep 23, 2025
0a5004c
[python] Support ignore_if_exists param for database and table (#6314)
discivigour Sep 24, 2025
501d1db
[arrow] ArrowBatchConverter support reset the reused VectorSchemaRoot…
yuzelin Sep 25, 2025
388fc68
[python] Modify package version to be compatible with python version …
discivigour Sep 25, 2025
f684400
[core] Apply 'file-operation-thread-num' to commit (#6339)
JingsongLi Sep 26, 2025
86de753
[spark] Support version-specific default configuration values (#6334)
kerwin-zk Sep 26, 2025
30efebe
[python] Fix OSSParam to access DLF (#6332)
discivigour Sep 26, 2025
2df6a06
[spark] Improve error msg for creating a function on an existing tmp …
Zouxxyy Sep 28, 2025
565b8aa
[core] Fix comment of paged api (#6348)
XiaoHongbo-Hope Sep 28, 2025
c4bf236
[core] Support incremental clustering for append unaware table (#6338)
LsomeYeah Sep 29, 2025
5caa6f6
[core] Add dv conflict detection during commit (#6303)
Zouxxyy Oct 9, 2025
0df7a71
[spark] Fix sort compact with partition filter (#6371)
Zouxxyy Oct 9, 2025
2e11817
[spark] Fix group by partial partition of a multi partition table (#6…
Zouxxyy Oct 10, 2025
673471f
[spark] Improve the exception msg for unsupported type (#6379)
Zouxxyy Oct 11, 2025
2591ce2
[hotfix] Fix typo in MergeIntoPaimonTable (#6381)
Zouxxyy Oct 11, 2025
867629e
[spark] Ensure compatibility of resolveFilter in lower version spark3…
Zouxxyy Oct 13, 2025
f325c44
[pvfs] Fix file status and input stream for PVFS (#6397)
timmyyao Oct 14, 2025
b1e7fd1
[flink] support performing incremental clustering by flink (#6395)
LsomeYeah Oct 14, 2025
0d1a61e
[parquet] Bump parquet version to 1.15.2 (#6363)
csringhofer Oct 14, 2025
ca48db6
[hotfix] Fix compile error in CompactAction
JingsongLi Oct 14, 2025
e89434d
[hotfix] Disable unstable test CloneActionITCase
JingsongLi Oct 14, 2025
d0c1abd
[spark] Fix the IOManager not work in spark reader (#6401)
Aitozi Oct 14, 2025
689b3ef
[docs] fix a doc error about incremental clustering (#6402)
LsomeYeah Oct 15, 2025
ef998c2
[hotfix][flink] fix the partition error for IncrementalClusterSplitSo…
LsomeYeah Oct 16, 2025
b1cc1e5
[core] Fix that FilesTable cannot output level0 files when dv enabled…
yuzelin Oct 16, 2025
018d61b
[oss] add fs.oss.sld.enabled to support oss private link (#6413)
shyjsarah Oct 16, 2025
05ad2a0
[core] Fix that cannot read binlog table with projection (#6417)
yuzelin Oct 16, 2025
a214e71
[python] support blob type and blob write and read (#6390)
jerry-024 Oct 14, 2025
0e05b7e
[Python] Enable field merge read in row-tracking table (#6399)
discivigour Oct 15, 2025
8bda95e
[Python] Introduce incremental-between read by timestamp (#6391)
discivigour Oct 17, 2025
fe68d21
[core]Python: fix blob write when blob_as_descriptor is true (#6404)
jerry-024 Oct 17, 2025
462c420
[python] Support blob read && write (#6420)
leaves12138 Oct 17, 2025
82fe60a
[python] Blob type more test for descriptor (#6422)
leaves12138 Oct 17, 2025
e036dec
[python] Make FileStoreWrite.max_seq_numbers lazied (#6418)
JingsongLi Oct 17, 2025
3d1b30f
[python] Filter manifest files by partition predicate in scan (#6419)
JingsongLi Oct 17, 2025
fa4671c
[python] Filter manifest entry by advance to reduce memory (#6428)
JingsongLi Oct 20, 2025
d45886b
[Python] optimize codes related to push_down_utils (#6430)
chenghuichen Oct 20, 2025
59c6ae0
[flink] disable clustering during writing if incremental clustering e…
LsomeYeah Oct 20, 2025
7112827
[python] Drop stats for manifest entries reading (#6429)
JingsongLi Oct 20, 2025
139f65c
[Python] clean code for pypaimon (#6433)
chenghuichen Oct 20, 2025
11360cf
[python] Refactor dicts to static fields to improve performance (#6436)
JingsongLi Oct 20, 2025
94644ac
[flink] Produce real random id in SourceSplitGenerator (#6441)
JingsongLi Oct 21, 2025
10a71b1
[Python] SimpleStats supports BinaryRow (#6444)
discivigour Oct 21, 2025
445b217
[Python] Refactor BinaryRow to reuse keys and key fields (#6445)
JingsongLi Oct 21, 2025
8ef5214
[Python] filter_manifest_entry should not evolution primary keys
JingsongLi Oct 21, 2025
80ac5e2
[Python] Remove useless TODO in SimpleStats
JingsongLi Oct 21, 2025
cf0515d
[core] Disable data evolution manifest filter for now (#6443)
leaves12138 Oct 21, 2025
a7500e4
[flink] Incremental Clustering support specify partitions (#6449)
LsomeYeah Oct 22, 2025
4c1408e
[Python] parallel read manifest entries (#6451)
chenghuichen Oct 22, 2025
08ff9ba
[Python] max_workers at least 8 for manifest_file_manager.read_entrie…
JingsongLi Oct 22, 2025
bfbefe8
[Python] Fix bug comparing string and int in row_key_extractor (#6448)
universe-hcy Oct 22, 2025
80c0876
[rest] Add more hint message when commit failed because of NoSuchReso…
qingwei727 Oct 22, 2025
cddedd0
[Python] Support schema evolution read for changing column position (…
discivigour Oct 23, 2025
53fbc2d
[python] Introduce schema cache in SchemaManager
JingsongLi Oct 23, 2025
d09c4f1
[python] Split read should discard predicate for other fields
JingsongLi Oct 23, 2025
f58f0f5
[python] Use _try_to_pad_batch_by_schema in TableRead
JingsongLi Oct 23, 2025
8152dda
[core] disable dv mode for incremental clustering table (#6461)
LsomeYeah Oct 23, 2025
df2c5ac
[Python] Add basic tests for schema evolution read (#6463)
discivigour Oct 23, 2025
1d3e5e5
[Python] Blob read supports with_shard (#6465)
discivigour Oct 24, 2025
0a657b3
[doc] Supplementary the document of python REST API (#6466)
discivigour Oct 26, 2025
25d2bdd
[doc] Refactor names in python-api
JingsongLi Oct 26, 2025
edda9f0
[arrow] Fix java.lang.IllegalArgumentException in ArrowFormatCWriter.…
jichen20210919 Oct 26, 2025
4f9bfd8
[Python] Move __version__ to setup.py (#6491)
discivigour Oct 29, 2025
3c213dc
[doc] fix MarkPartitionDoneProcedure doc. (#6473)
plusplusjiajia Oct 27, 2025
d3a7451
[doc] add view doc (#6469)
jerry-024 Oct 27, 2025
f170713
[doc] Fix typo in flink/sql-ddl.md (#6476)
LuciferYang Oct 27, 2025
b856f28
[typo] fix typo in distinct (#6475)
tclxjunjie2-zhao Oct 27, 2025
5c90f44
[doc] Update batch partition mark done doc (#6478)
Zouxxyy Oct 28, 2025
de1f9c2
[core] support automatically clustering historical partition (#6472)
LsomeYeah Oct 28, 2025
718f124
[core] Refactor HistoryPartitionCluster to load less history partitions
JingsongLi Oct 28, 2025
5625e47
[spark] Fix write non-pk dv table with external paths (#6487)
Zouxxyy Oct 29, 2025
d495257
[core] Fix endian spec for BloomFilter index reader (#6493)
liyubin117 Oct 29, 2025
3e2e8b1
[doc] Update config.toml
JingsongLi Nov 10, 2025
26dfce0
[doc] Fix error link in views page
JingsongLi Nov 10, 2025
75b4486
[core] Support non null column with write type (#6513)
leaves12138 Nov 3, 2025
3d09deb
[hotfix] Add more informat to check partition spec in InternalRowPart…
JingsongLi Nov 3, 2025
81e24ab
[hotfix] Print partition spec and type when error in InternalRowParti…
JingsongLi Nov 3, 2025
f97750d
[Python] Keep the variable names of Identifier consistent with Java (…
universe-hcy Nov 4, 2025
134e1b6
[Python] Suppport multi prepare commit in the same TableWrite (#6526)
discivigour Nov 5, 2025
47c521d
[Python] Rename to BATCH_COMMIT_IDENTIFIER in snapshot.py
JingsongLi Nov 5, 2025
e6ea490
[python] support custom source split target size and split open file …
XiaoHongbo-Hope Nov 5, 2025
e233be1
[core] Fix that cannot get partition info if all files are in level-0…
zhoulii Nov 5, 2025
048c0d4
[python] add test case for reading blob by to_iterator (#6536)
XiaoHongbo-Hope Nov 6, 2025
e9f4b2f
[doc] fix postpone.default-bucket-num default value (#6554)
wombatu-kun Nov 7, 2025
e4ecfac
[Python] Remove the use of reference types in DATA_FILE_META_SCHEMA (…
zjw1111 Nov 7, 2025
4b16ee2
[common] Remove token expire time (#6544)
leaves12138 Nov 7, 2025
787210f
[python] Fix AtomicType.to_dict() inconsistency with java (#6548)
universe-hcy Nov 10, 2025
ae57450
[python] Fix File Source type in data file meta (#6571)
discivigour Nov 10, 2025
ce4ebcd
[Python] License added in test (#6572)
discivigour Nov 10, 2025
a47a28e
[core] support incremental clustering in dv mode (#6559)
LsomeYeah Nov 10, 2025
5360c03
[core] Refactor deletion vectors support for incremental cluster
JingsongLi Nov 10, 2025
1390842
[Python] Add Mixed read and write test between Java and Python. (#6579)
discivigour Nov 11, 2025
2b9eb20
[spark] Fix write multiple cols with key dynamic table (#6585)
Zouxxyy Nov 11, 2025
fff9bfe
[core] Fix spillToBinary in KeyValueBuffer (#6586)
JingsongLi Nov 12, 2025
85fa68b
[core] Optimize IndexFileMetaSerializer#rowArrayDataToDvMetas (#6589)
tsreaper Nov 12, 2025
cafcf4e
[arrow] Fix that complex writers didn't reset inner writer state (#6591)
yuzelin Nov 12, 2025
c9a50e2
Add comprehensive test cases for LanceFileFormat
Dec 25, 2025
28 changes: 26 additions & 2 deletions .github/workflows/paimon-python-checks.yml
100644 → 100755
@@ -32,6 +32,9 @@ on:

env:
PYTHON_VERSIONS: "['3.6.15', '3.10']"
JDK_VERSION: 8
MAVEN_OPTS: -Dmaven.wagon.httpconnectionManager.ttlSeconds=30 -Dmaven.wagon.http.retryHandler.requestSentEnabled=true


concurrency:
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }}
@@ -49,6 +52,17 @@ jobs:
- name: Checkout code
uses: actions/checkout@v2

- name: Set up JDK ${{ env.JDK_VERSION }}
uses: actions/setup-java@v4
with:
java-version: ${{ env.JDK_VERSION }}
distribution: 'temurin'

- name: Set up Maven
uses: stCarolas/setup-maven@v4.5
with:
maven-version: 3.8.8

- name: Install system dependencies
shell: bash
run: |
@@ -58,19 +72,29 @@
curl \
&& rm -rf /var/lib/apt/lists/*

- name: Verify Java and Maven installation
run: |
java -version
mvn -version

- name: Verify Python version
run: python --version

- name: Build Java
run: |
echo "Start compiling modules"
mvn -T 2C -B clean install -DskipTests

- name: Install Python dependencies
shell: bash
run: |
if [[ "${{ matrix.python-version }}" == "3.6.15" ]]; then
python -m pip install --upgrade pip==21.3.1
python --version
python -m pip install -q readerwriterlock==1.0.9 'fsspec==2021.10.1' 'cachetools==4.2.4' 'ossfs==2021.8.0' pyarrow==6.0.1 pandas==1.1.5 'polars==0.9.12' 'fastavro==1.4.7' zstandard==0.19.0 dataclasses==0.8.0 flake8 pytest py4j==0.10.9.9 requests 2>&1 >/dev/null
python -m pip install -q readerwriterlock==1.0.9 'fsspec==2021.10.1' 'cachetools==4.2.4' 'ossfs==2021.8.0' pyarrow==6.0.1 pandas==1.1.5 'polars==0.9.12' 'fastavro==1.4.7' zstandard==0.19.0 dataclasses==0.8.0 flake8 pytest py4j==0.10.9.9 requests parameterized==0.8.1 2>&1 >/dev/null
else
python -m pip install --upgrade pip
python -m pip install -q readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests 2>&1 >/dev/null
python -m pip install -q readerwriterlock==1.0.9 fsspec==2024.3.1 cachetools==5.3.3 ossfs==2023.12.0 ray==2.48.0 fastavro==1.11.1 pyarrow==16.0.0 zstandard==0.24.0 polars==1.32.0 duckdb==1.3.2 numpy==1.24.3 pandas==2.0.3 flake8==4.0.1 pytest~=7.0 py4j==0.10.9.9 requests parameterized==0.9.0 2>&1 >/dev/null
fi
- name: Run lint-python.sh
shell: bash
16 changes: 8 additions & 8 deletions docs/config.toml
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

baseURL = '//paimon.apache.org/docs/master'
baseURL = '//paimon.apache.org/docs/1.3'
languageCode = 'en-us'
title = 'Apache Paimon'
enableGitInfo = false
@@ -24,7 +24,7 @@ pygmentsUseClasses = true
[params]
# Flag whether this is a stable version or not.
# Used for the quickstart page.
IsStable = false
IsStable = true

# Flag to indicate whether an outdated warning should be shown.
ShowOutDatedWarning = false
@@ -34,14 +34,14 @@ pygmentsUseClasses = true
# we change the version for the complete docs when forking of a release branch
# etc.
# The full version string as referenced in Maven (e.g. 1.2.1)
Version = "1.3-SNAPSHOT"
Version = "1.3.0"

# For stable releases, leave the bugfix version out (e.g. 1.2). For snapshot
# release this should be the same as the regular version
VersionTitle = "1.3-SNAPSHOT"
VersionTitle = "1.3"

# The branch for this version of Apache Paimon
Branch = "master"
Branch = "1.3"

# The most recent supported Apache Flink version
FlinkVersion = "1.20"
@@ -67,14 +67,14 @@ pygmentsUseClasses = true
["JavaDocs", "//paimon.apache.org/docs/master/api/java/"],
]

StableDocs = "https://paimon.apache.org/docs/1.0"
StableDocs = "https://paimon.apache.org/docs/1.3"

PreviousDocs = [
["master", "https://paimon.apache.org/docs/master"],
["stable", "https://paimon.apache.org/docs/1.2"],
["stable", "https://paimon.apache.org/docs/1.3"],
["1.3", "https://paimon.apache.org/docs/1.3"],
["1.2", "https://paimon.apache.org/docs/1.2"],
["1.1", "https://paimon.apache.org/docs/1.1"],
["1.0", "https://paimon.apache.org/docs/1.0"],
]

BookSection = '/'
175 changes: 175 additions & 0 deletions docs/content/append-table/incremental-clustering.md
@@ -0,0 +1,175 @@
---
title: "Incremental Clustering"
weight: 4
type: docs
aliases:
- /append-table/incremental-clustering.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Incremental Clustering

Paimon currently supports ordering append tables using an SFC (Space-Filling Curve); see [sort compact]({{< ref "maintenance/dedicated-compaction#sort-compact" >}}) for more info.
The resulting data layout typically delivers better performance for queries that target clustering keys.
However, with the current SortCompaction, even when neither the data nor the clustering keys have changed,
each run still rewrites the entire dataset, which is extremely costly.

To address this, Paimon introduced a more flexible, incremental clustering mechanism: Incremental Clustering.
On each run, it selects only a specific subset of files to cluster, avoiding a full rewrite. This enables low-cost,
sort-based optimization of the data layout and improves query performance. In addition, with Incremental Clustering,
you can adjust clustering keys without rewriting existing data: the layout evolves dynamically as clustering runs
and gradually converges to an optimal state, significantly reducing the decision-making complexity around data layout.


Incremental Clustering supports:
- Incremental clustering, minimizing write amplification as much as possible.
- Small-file compaction; rewrites respect the target file size.
- Changing clustering keys; newly ingested data is clustered according to the latest clustering keys.
- A full mode; when selected, the entire dataset is reclustered.

**Only unaware-bucket append tables support Incremental Clustering.**

## Enable Incremental Clustering

To enable Incremental Clustering, the following configuration needs to be set for the table:
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Option</th>
<th class="text-left" style="width: 10%">Value</th>
<th class="text-left" style="width: 5%">Required</th>
<th class="text-left" style="width: 10%">Type</th>
<th class="text-left" style="width: 55%">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td><h5>clustering.incremental</h5></td>
<td>true</td>
<td style="word-wrap: break-word;">Yes</td>
<td>Boolean</td>
<td>Must be set to true to enable incremental clustering. Default is false.</td>
</tr>
<tr>
<td><h5>clustering.columns</h5></td>
<td>'clustering-columns'</td>
<td style="word-wrap: break-word;">Yes</td>
<td>String</td>
<td>The clustering columns, in the format 'columnName1,columnName2'. It is not recommended to use partition keys as clustering keys.</td>
</tr>
<tr>
<td><h5>clustering.strategy</h5></td>
<td>'zorder' or 'hilbert' or 'order'</td>
<td style="word-wrap: break-word;">No</td>
<td>String</td>
<td>The ordering algorithm used for clustering. If not set, it will be decided based on the number of clustering columns: 'order' is used for 1 column, 'zorder' for fewer than 5 columns, and 'hilbert' for 5 or more columns.</td>
</tr>
</tbody>

</table>
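
For example, here is a minimal Spark SQL sketch of enabling Incremental Clustering when creating a table (the table name and columns are placeholders, not taken from an existing example):

```sql
-- Illustration only: enable incremental clustering and cluster by c1,c2.
CREATE TABLE t (
    c1 INT,
    c2 STRING,
    c3 STRING
) TBLPROPERTIES (
    'clustering.incremental' = 'true',
    'clustering.columns' = 'c1,c2'
);
```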

Once Incremental Clustering is enabled for a table, you can run it periodically in batch mode
to continuously optimize the table's data layout and deliver better query performance.

**Note**: Since common compaction also rewrites files, it may disrupt the ordered data layout built by Incremental Clustering.
Therefore, when Incremental Clustering is enabled, the table no longer supports write-time compaction or dedicated compaction;
clustering and small-file merging must be performed exclusively via Incremental Clustering runs.

## Run Incremental Clustering
{{< hint info >}}

Incremental Clustering can only be run in batch mode.

{{< /hint >}}

To run an Incremental Clustering job, follow these instructions.

You don't need to specify any clustering-related parameters when running Incremental Clustering;
these options are already defined as table options. If you need to change clustering settings, update the corresponding table options, for example as shown in the sketch below.
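
A hedged Spark SQL sketch of changing the clustering keys between runs (the table name `T` and the column names are placeholders; newly ingested data will then be clustered by the new keys):

```sql
-- Illustration only: switch the clustering columns of table T.
-- Existing data is not rewritten; the layout converges over subsequent clustering runs.
ALTER TABLE T SET TBLPROPERTIES (
    'clustering.columns' = 'c2,c3'
);
```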

{{< tabs "incremental-clustering" >}}

{{< tab "Spark SQL" >}}

Run the following SQL:

```sql
-- Set the write parallelism; if it is too large, a large number of small files may be generated.
SET spark.sql.shuffle.partitions=10;

-- run incremental clustering
CALL sys.compact(table => 'T')

-- run incremental clustering in full mode; this will recluster all data
CALL sys.compact(table => 'T', compact_strategy => 'full')
```
{{< /tab >}}

{{< tab "Flink Action" >}}

Run the following command to submit an incremental clustering job for the table.

```bash
<FLINK_HOME>/bin/flink run \
/path/to/paimon-flink-action-{{< version >}}.jar \
compact \
--warehouse <warehouse-path> \
--database <database-name> \
--table <table-name> \
[--compact_strategy <minor / full>] \
[--table_conf <table_conf>] \
[--catalog_conf <paimon-catalog-conf> [--catalog_conf <paimon-catalog-conf> ...]]
```

Example: run incremental clustering

```bash
<FLINK_HOME>/bin/flink run \
/path/to/paimon-flink-action-{{< version >}}.jar \
compact \
--warehouse s3:///path/to/warehouse \
--database test_db \
--table test_table \
--table_conf sink.parallelism=2 \
--compact_strategy minor \
--catalog_conf s3.endpoint=https://****.com \
--catalog_conf s3.access-key=***** \
--catalog_conf s3.secret-key=*****
```
* `--compact_strategy` determines how to pick the files to be clustered; the default is `minor`.
  * `full`: all files will be selected for clustering.
  * `minor`: pick the set of files that need to be clustered based on specified conditions.

Note: write parallelism is set by `sink.parallelism`; if it is too large, a large number of small files may be generated.

You can use `-D execution.runtime-mode=batch` or `-yD execution.runtime-mode=batch` (for the ON-YARN scenario) to use batch mode.
{{< /tab >}}

{{< /tabs >}}

## Implementation

To balance write amplification and sorting effectiveness, Paimon leverages the LSM Tree notion of levels to stratify data files
and uses the Universal Compaction strategy to select files for clustering.
- Newly written data lands in level-0; files in level-0 are unclustered.
- All files in level-i are produced by sorting within the same sorting set.
- By analogy with Universal Compaction: in level-0, each file is a sorted run; in level-i, all files together constitute a single sorted run. During clustering, the sorted run is the basic unit of work.

By introducing more levels, we can control the amount of data processed in each clustering run.
Data at higher levels is more stably clustered and less likely to be rewritten, thereby mitigating write amplification while maintaining good sorting effectiveness.
8 changes: 4 additions & 4 deletions docs/content/append-table/row-tracking.md
@@ -26,9 +26,9 @@ under the License.

# Row tracking

Row tracking allows Paimon to track row-level lineage in a Paimon append table. Once enabled on a Paimon table, two more hidden columns will be added to the table schema:
- `_ROW_ID`: BIGINT, this is a unique identifier for each row in the table. It is used to track the lineage of the row and can be used to identify the row in case of update, merge into or delete.
- `_SEQUENCE_NUMBER`: BIGINT, this is field indicates which `version` of this record is. It actually is the snapshot-id of the snapshot that this row belongs to. It is used to track the lineage of the row version.
Row tracking allows Paimon to track row-level changes in a Paimon append table. Once enabled on a Paimon table, two more hidden columns will be added to the table schema:
- `_ROW_ID`: BIGINT, a unique identifier for each row in the table. It is used to track updates to the row and can be used to identify the row in case of an update, merge into, or delete.
- `_SEQUENCE_NUMBER`: BIGINT, this field indicates which `version` of this record this is. It is actually the snapshot id of the snapshot that this row belongs to. It is used to track updates of the row version.

Hidden columns follow these rules:
- Whenever we read from one table with row tracking enabled, the `_ROW_ID` and `_SEQUENCE_NUMBER` will be `NOT NULL`.
@@ -57,7 +57,7 @@ CREATE TABLE t (id INT, data STRING) TBLPROPERTIES ('row-tracking.enabled' = 'tr
INSERT INTO t VALUES (11, 'a'), (22, 'b')
```

You can select the row lineage meta column with the following sql in spark:
You can select the row tracking meta columns with the following SQL in Spark:
```sql
SELECT id, data, _ROW_ID, _SEQUENCE_NUMBER FROM t;
```
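
As a follow-up sketch (an illustration only: it reuses the table `t` above and assumes `UPDATE` is issued from Spark), you can re-read the meta columns after a row-level change; per the description above, the `_ROW_ID` of the updated row should remain stable while `_SEQUENCE_NUMBER` reflects the new snapshot:

```sql
-- Illustration only: update one row, then inspect the meta columns again.
UPDATE t SET data = 'a2' WHERE id = 11;

SELECT id, data, _ROW_ID, _SEQUENCE_NUMBER FROM t;
```
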
2 changes: 1 addition & 1 deletion docs/content/concepts/functions.md
@@ -86,4 +86,4 @@ This statement deletes the existing `parse_str` function from the `mydb` databas

## Functions in Spark

see [SQL Functions]({{< ref "spark/sql-functions#user-defined-function" >}})
see [SQL Functions]({{< ref "spark/sql-functions#user-defined-function" >}})
10 changes: 9 additions & 1 deletion docs/content/concepts/rest/dlf.md
@@ -3,8 +3,9 @@ title: "DLF Token"
weight: 3
type: docs
aliases:
- /concepts/rest/dlf.html
- /concepts/rest/dlf.html
---

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
@@ -51,6 +52,13 @@ WITH (
);
```

- `uri`: The URI of the DLF REST Catalog server.
- `warehouse`: The DLF catalog name.
- `token.provider`: The token provider.
- `dlf.access-key-id`: The Access Key ID required to access the DLF service, usually the AccessKey of your RAM user.
- `dlf.access-key-secret`: The Access Key Secret required to access the DLF service.

You can grant specific permissions to a RAM user and use the RAM user's access key for long-term access to your DLF
resources. Compared to using the Alibaba Cloud account access key, accessing DLF resources with a RAM user access key
is more secure.
20 changes: 10 additions & 10 deletions docs/content/concepts/rest/pvfs.md
@@ -128,17 +128,17 @@ Example: execute hadoop shell to list the virtual path

## Python SDK

Python SDK provide fsspec style API, can be easily integrated to Python ecesystem.
The Python SDK provides an fsspec-style API and can be easily integrated into the Python ecosystem.

For example, Python code can do:

```python
import pypaimon

options = {
"uri": 'key',
'token.provider' = 'bear'
'token' = '<token>'
'uri': 'key',
'token.provider': 'bear',
'token': '<token>'
}
fs = pypaimon.PaimonVirtualFileSystem(options)
fs.ls("pvfs://catalog_name/database_name/table_name")
@@ -151,9 +151,9 @@ import pypaimon
import pyarrow.parquet as pq

options = {
"uri": 'key',
'token.provider' = 'bear'
'token' = '<token>'
'uri': 'key',
'token.provider': 'bear',
'token': '<token>'
}
fs = pypaimon.PaimonVirtualFileSystem(options)
path = 'pvfs://catalog_name/database_name/table_name/a.parquet'
@@ -169,9 +169,9 @@ import pypaimon
import ray

options = {
"uri": 'key',
'token.provider' = 'bear'
'token' = '<token>'
'uri': 'key',
'token.provider': 'bear',
'token': '<token>'
}
fs = pypaimon.PaimonVirtualFileSystem(options)
