From b7fd1d58eaccfc7abc4564d3304b66ef28aad7a7 Mon Sep 17 00:00:00 2001 From: nmarasoiu Date: Mon, 29 Sep 2025 04:26:02 +0100 Subject: [PATCH] poc kv rocks rowCache optional --- .../storage/ldb/KeyValueStorageRocksDB.java | 47 +++++- .../storage/ldb/RocksDBRowCacheTest.java | 153 ++++++++++++++++++ conf/bk_server.conf | 9 ++ conf/bk_server_rocksdb_rowcache_example.conf | 39 +++++ 4 files changed, 242 insertions(+), 6 deletions(-) create mode 100644 bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/storage/ldb/RocksDBRowCacheTest.java create mode 100644 conf/bk_server_rocksdb_rowcache_example.conf diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorageRocksDB.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorageRocksDB.java index 1caea81be2c..a34b9694d1e 100644 --- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorageRocksDB.java +++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorageRocksDB.java @@ -84,7 +84,8 @@ public class KeyValueStorageRocksDB implements KeyValueStorage { private final WriteOptions optionSync; private final WriteOptions optionDontSync; - private Cache cache; + private Cache blockCache; + private Cache rowCache; private final ReadOptions optionCache; private final ReadOptions optionDontCache; @@ -101,6 +102,7 @@ public class KeyValueStorageRocksDB implements KeyValueStorage { private static final String ROCKSDB_BLOCK_SIZE = "dbStorage_rocksDB_blockSize"; private static final String ROCKSDB_BLOOM_FILTERS_BITS_PER_KEY = "dbStorage_rocksDB_bloomFilterBitsPerKey"; private static final String ROCKSDB_BLOCK_CACHE_SIZE = "dbStorage_rocksDB_blockCacheSize"; + private static final String ROCKSDB_ROW_CACHE_SIZE = "dbStorage_rocksDB_rowCacheSize"; private static final String ROCKSDB_NUM_LEVELS = "dbStorage_rocksDB_numLevels"; private static final String ROCKSDB_NUM_FILES_IN_LEVEL0 = 
"dbStorage_rocksDB_numFilesInLevel0"; private static final String ROCKSDB_MAX_SIZE_IN_LEVEL1_MB = "dbStorage_rocksDB_maxSizeInLevel1MB"; @@ -164,6 +166,22 @@ private RocksDB initializeRocksDBWithConfFile(String basePath, String subPath, D .setIgnoreUnknownOptions(false) .setEnv(Env.getDefault())) { OptionsUtil.loadOptionsFromFile(cfgOpts, dbFilePath, dbOptions, cfDescs); + + // Configure row cache if enabled and this is for EntryLocation + if (dbConfigType == DbConfigType.EntryLocation) { + long rowCacheSizeBytes = DbLedgerStorage.getLongVariableOrDefault(conf, ROCKSDB_ROW_CACHE_SIZE, 0); + if (rowCacheSizeBytes > 0) { + this.rowCache = new LRUCache(rowCacheSizeBytes); + dbOptions.setRowCache(rowCache); + log.info("Enabled RocksDB row cache with size: {} bytes (config file mode)", rowCacheSizeBytes); + } else { + this.rowCache = null; + log.info("RocksDB row cache is disabled (config file mode)"); + } + } else { + this.rowCache = null; + } + // Configure file path String logPath = conf.getString(ROCKSDB_LOG_PATH, ""); if (!logPath.isEmpty()) { @@ -199,6 +217,9 @@ private RocksDB initializeRocksDBWithBookieConf(String basePath, String subPath, long blockCacheSize = DbLedgerStorage.getLongVariableOrDefault(conf, ROCKSDB_BLOCK_CACHE_SIZE, defaultRocksDBBlockCacheSizeBytes); + /* Set default RocksDB row-cache size to 0 (disabled by default) */ + long rowCacheSizeBytes = DbLedgerStorage.getLongVariableOrDefault(conf, ROCKSDB_ROW_CACHE_SIZE, 0); + long writeBufferSizeMB = conf.getInt(ROCKSDB_WRITE_BUFFER_SIZE_MB, 64); long sstSizeMB = conf.getInt(ROCKSDB_SST_SIZE_MB, 64); int numLevels = conf.getInt(ROCKSDB_NUM_LEVELS, -1); @@ -225,10 +246,10 @@ private RocksDB initializeRocksDBWithBookieConf(String basePath, String subPath, options.setTargetFileSizeBase(sstSizeMB * 1024 * 1024); options.setDeleteObsoleteFilesPeriodMicros(TimeUnit.HOURS.toMicros(1)); - this.cache = new LRUCache(blockCacheSize); + this.blockCache = new LRUCache(blockCacheSize); BlockBasedTableConfig 
tableOptions = new BlockBasedTableConfig(); tableOptions.setBlockSize(blockSize); - tableOptions.setBlockCache(cache); + tableOptions.setBlockCache(blockCache); tableOptions.setFormatVersion(formatVersion); tableOptions.setChecksumType(checksumType); if (bloomFilterBitsPerKey > 0) { @@ -240,8 +261,19 @@ private RocksDB initializeRocksDBWithBookieConf(String basePath, String subPath, options.setLevelCompactionDynamicLevelBytes(true); options.setTableFormatConfig(tableOptions); + + // Configure row cache if enabled + if (rowCacheSizeBytes > 0) { + this.rowCache = new LRUCache(rowCacheSizeBytes); + options.setRowCache(rowCache); + log.info("Enabled RocksDB row cache with size: {} bytes", rowCacheSizeBytes); + } else { + this.rowCache = null; + log.info("RocksDB row cache is disabled"); + } } else { - this.cache = null; + this.blockCache = null; + this.rowCache = null; BlockBasedTableConfig tableOptions = new BlockBasedTableConfig(); tableOptions.setFormatVersion(formatVersion); tableOptions.setChecksumType(checksumType); @@ -301,8 +333,11 @@ public void close() throws IOException { } finally { closedLock.writeLock().unlock(); } - if (cache != null) { - cache.close(); + if (blockCache != null) { + blockCache.close(); + } + if (rowCache != null) { + rowCache.close(); } if (options != null) { options.close(); diff --git a/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/storage/ldb/RocksDBRowCacheTest.java b/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/storage/ldb/RocksDBRowCacheTest.java new file mode 100644 index 00000000000..0a11c1fe06a --- /dev/null +++ b/bookkeeper-server/src/test/java/org/apache/bookkeeper/bookie/storage/ldb/RocksDBRowCacheTest.java @@ -0,0 +1,153 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ +package org.apache.bookkeeper.bookie.storage.ldb; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +import java.io.File; +import org.apache.bookkeeper.conf.ServerConfiguration; +import org.apache.commons.io.FileUtils; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Test RocksDB row cache configuration. + */ +public class RocksDBRowCacheTest { + + private KeyValueStorageRocksDB storage; + private File tempDir; + private ServerConfiguration conf; + + @Before + public void setup() throws Exception { + tempDir = File.createTempFile("test", ""); + tempDir.delete(); + tempDir.mkdir(); + + conf = new ServerConfiguration(); + } + + @After + public void tearDown() throws Exception { + if (storage != null) { + storage.close(); + } + if (tempDir != null) { + FileUtils.deleteDirectory(tempDir); + } + } + + @Test + public void testRowCacheEnabled() throws Exception { + // Set row cache size to 64MB + conf.setProperty("dbStorage_rocksDB_rowCacheSize", 64 * 1024 * 1024L); + + storage = new KeyValueStorageRocksDB( + tempDir.getAbsolutePath(), + "test-db", + KeyValueStorageFactory.DbConfigType.EntryLocation, + conf + ); + + // Write some test data + byte[] key1 = "test-key-1".getBytes(); + byte[] value1 = "test-value-1".getBytes(); + byte[] key2 = "test-key-2".getBytes(); + byte[] value2 = 
"test-value-2".getBytes(); + + storage.put(key1, value1); + storage.put(key2, value2); + storage.sync(); + + // Read back and verify + byte[] readValue1 = storage.get(key1); + byte[] readValue2 = storage.get(key2); + + assertNotNull(readValue1); + assertNotNull(readValue2); + assertEquals("test-value-1", new String(readValue1)); + assertEquals("test-value-2", new String(readValue2)); + + // Multiple reads should benefit from row cache + for (int i = 0; i < 100; i++) { + byte[] cachedRead = storage.get(key1); + assertEquals("test-value-1", new String(cachedRead)); + } + } + + @Test + public void testRowCacheDisabledByDefault() throws Exception { + // Don't set row cache size - should be disabled by default + storage = new KeyValueStorageRocksDB( + tempDir.getAbsolutePath(), + "test-db", + KeyValueStorageFactory.DbConfigType.EntryLocation, + conf + ); + + // Should still work without row cache + byte[] key = "test-key".getBytes(); + byte[] value = "test-value".getBytes(); + + storage.put(key, value); + storage.sync(); + + byte[] readValue = storage.get(key); + assertNotNull(readValue); + assertEquals("test-value", new String(readValue)); + } + + @Test + public void testRowCacheWithLargeData() throws Exception { + // Set row cache size to 16MB + conf.setProperty("dbStorage_rocksDB_rowCacheSize", 16 * 1024 * 1024L); + + storage = new KeyValueStorageRocksDB( + tempDir.getAbsolutePath(), + "test-db", + KeyValueStorageFactory.DbConfigType.EntryLocation, + conf + ); + + // Write larger values that would benefit from row cache + int numKeys = 1000; + byte[] largeValue = new byte[1024]; // 1KB per value + + for (int i = 0; i < numKeys; i++) { + String key = String.format("key-%06d", i); + storage.put(key.getBytes(), largeValue); + } + storage.sync(); + + // Read hot keys multiple times - should be cached + for (int round = 0; round < 10; round++) { + for (int i = 0; i < 100; i++) { // Read first 100 keys repeatedly + String key = String.format("key-%06d", i); + byte[] value 
= storage.get(key.getBytes()); + assertNotNull(value); + assertEquals(1024, value.length); + } + } + } +} diff --git a/conf/bk_server.conf b/conf/bk_server.conf index d680e750cd7..5641af48936 100644 --- a/conf/bk_server.conf +++ b/conf/bk_server.conf @@ -790,6 +790,15 @@ gcEntryLogMetadataCacheEnabled=false # database which can reach ~2GB in some cases # Default is to use 10% / numberOfLedgers of the direct memory size # dbStorage_rocksDB_blockCacheSize= + +# Size of RocksDB row-cache, in bytes. The row cache caches actual key-value pairs +# (unlike block cache which caches compressed/uncompressed blocks). +# This can significantly improve read performance for frequently accessed keys +# by avoiding SST file lookups, reducing read latency from milliseconds to microseconds. +# Default is 0 (disabled). A typical value would be 64-512MB (e.g. 268435456 for 256MB) depending on available memory. +# Note: Row cache uses additional memory on top of block cache. +# dbStorage_rocksDB_rowCacheSize= + # Other RocksDB specific tunables for the entry location database # dbStorage_rocksDB_writeBufferSizeMB=64 # dbStorage_rocksDB_sstSizeInMB=64 diff --git a/conf/bk_server_rocksdb_rowcache_example.conf b/conf/bk_server_rocksdb_rowcache_example.conf new file mode 100644 index 00000000000..9cc4d299485 --- /dev/null +++ b/conf/bk_server_rocksdb_rowcache_example.conf @@ -0,0 +1,39 @@ +# Example configuration for enabling RocksDB row cache in BookKeeper +# This configuration demonstrates how to enable row cache for improved read performance + +# Use DbLedgerStorage (required for RocksDB features) +ledgerStorageClass=org.apache.bookkeeper.bookie.storage.ldb.DbLedgerStorage + +# RocksDB Block Cache Configuration +# Block cache caches compressed/uncompressed blocks +# Default: 10% of direct memory / number of ledger directories (example below: 512MB, in bytes) +# dbStorage_rocksDB_blockCacheSize=536870912 + +# RocksDB Row Cache Configuration (NEW) +# Row cache stores actual key-value pairs for faster lookups +# This avoids SST file reads for 
frequently accessed keys +# Default: 0 (disabled); the value is in bytes +# Recommended values: 64MB to 512MB depending on available memory (268435456 = 256MB) +dbStorage_rocksDB_rowCacheSize=268435456 + +# Memory allocation example for a bookie with 8GB of direct memory: +# - Write cache: 25% of direct memory (~2GB) +# - Read-ahead cache: 25% of direct memory (~2GB) +# - Block cache: 10% of direct memory (~800MB) +# - Row cache: 256MB (configured above) +# - Total memory used by these caches: ~5GB + +# Other RocksDB tuning parameters +dbStorage_rocksDB_writeBufferSizeMB=64 +dbStorage_rocksDB_sstSizeInMB=64 +dbStorage_rocksDB_blockSize=65536 +dbStorage_rocksDB_bloomFilterBitsPerKey=10 + +# Enable compression for better storage efficiency +dbStorage_rocksDB_lz4CompressionEnabled=true + +# Note: Row cache is most effective for: +# - Hot keys that are accessed frequently +# - Small to medium-sized values +# - Read-heavy workloads +# - Low-latency requirements (sub-millisecond reads) \ No newline at end of file