Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ public class KeyValueStorageRocksDB implements KeyValueStorage {

private final WriteOptions optionSync;
private final WriteOptions optionDontSync;
private Cache cache;
private Cache blockCache;
private Cache rowCache;

private final ReadOptions optionCache;
private final ReadOptions optionDontCache;
Expand All @@ -101,6 +102,7 @@ public class KeyValueStorageRocksDB implements KeyValueStorage {
private static final String ROCKSDB_BLOCK_SIZE = "dbStorage_rocksDB_blockSize";
private static final String ROCKSDB_BLOOM_FILTERS_BITS_PER_KEY = "dbStorage_rocksDB_bloomFilterBitsPerKey";
private static final String ROCKSDB_BLOCK_CACHE_SIZE = "dbStorage_rocksDB_blockCacheSize";
private static final String ROCKSDB_ROW_CACHE_SIZE = "dbStorage_rocksDB_rowCacheSize";
private static final String ROCKSDB_NUM_LEVELS = "dbStorage_rocksDB_numLevels";
private static final String ROCKSDB_NUM_FILES_IN_LEVEL0 = "dbStorage_rocksDB_numFilesInLevel0";
private static final String ROCKSDB_MAX_SIZE_IN_LEVEL1_MB = "dbStorage_rocksDB_maxSizeInLevel1MB";
Expand Down Expand Up @@ -164,6 +166,22 @@ private RocksDB initializeRocksDBWithConfFile(String basePath, String subPath, D
.setIgnoreUnknownOptions(false)
.setEnv(Env.getDefault())) {
OptionsUtil.loadOptionsFromFile(cfgOpts, dbFilePath, dbOptions, cfDescs);

// Configure row cache if enabled and this is for EntryLocation
if (dbConfigType == DbConfigType.EntryLocation) {
long rowCacheSizeBytes = DbLedgerStorage.getLongVariableOrDefault(conf, ROCKSDB_ROW_CACHE_SIZE, 0);
if (rowCacheSizeBytes > 0) {
this.rowCache = new LRUCache(rowCacheSizeBytes);
dbOptions.setRowCache(rowCache);
log.info("Enabled RocksDB row cache with size: {} bytes (config file mode)", rowCacheSizeBytes);
} else {
this.rowCache = null;
log.info("RocksDB row cache is disabled (config file mode)");
}
} else {
this.rowCache = null;
}

// Configure file path
String logPath = conf.getString(ROCKSDB_LOG_PATH, "");
if (!logPath.isEmpty()) {
Expand Down Expand Up @@ -199,6 +217,9 @@ private RocksDB initializeRocksDBWithBookieConf(String basePath, String subPath,
long blockCacheSize = DbLedgerStorage.getLongVariableOrDefault(conf, ROCKSDB_BLOCK_CACHE_SIZE,
defaultRocksDBBlockCacheSizeBytes);

/* Set default RocksDB row-cache size to 0 (disabled by default) */
long rowCacheSizeBytes = DbLedgerStorage.getLongVariableOrDefault(conf, ROCKSDB_ROW_CACHE_SIZE, 0);

long writeBufferSizeMB = conf.getInt(ROCKSDB_WRITE_BUFFER_SIZE_MB, 64);
long sstSizeMB = conf.getInt(ROCKSDB_SST_SIZE_MB, 64);
int numLevels = conf.getInt(ROCKSDB_NUM_LEVELS, -1);
Expand All @@ -225,10 +246,10 @@ private RocksDB initializeRocksDBWithBookieConf(String basePath, String subPath,
options.setTargetFileSizeBase(sstSizeMB * 1024 * 1024);
options.setDeleteObsoleteFilesPeriodMicros(TimeUnit.HOURS.toMicros(1));

this.cache = new LRUCache(blockCacheSize);
this.blockCache = new LRUCache(blockCacheSize);
BlockBasedTableConfig tableOptions = new BlockBasedTableConfig();
tableOptions.setBlockSize(blockSize);
tableOptions.setBlockCache(cache);
tableOptions.setBlockCache(blockCache);
tableOptions.setFormatVersion(formatVersion);
tableOptions.setChecksumType(checksumType);
if (bloomFilterBitsPerKey > 0) {
Expand All @@ -240,8 +261,19 @@ private RocksDB initializeRocksDBWithBookieConf(String basePath, String subPath,
options.setLevelCompactionDynamicLevelBytes(true);

options.setTableFormatConfig(tableOptions);

// Configure row cache if enabled
if (rowCacheSizeBytes > 0) {
this.rowCache = new LRUCache(rowCacheSizeBytes);
options.setRowCache(rowCache);
log.info("Enabled RocksDB row cache with size: {} bytes", rowCacheSizeBytes);
} else {
this.rowCache = null;
log.info("RocksDB row cache is disabled");
}
} else {
this.cache = null;
this.blockCache = null;
this.rowCache = null;
BlockBasedTableConfig tableOptions = new BlockBasedTableConfig();
tableOptions.setFormatVersion(formatVersion);
tableOptions.setChecksumType(checksumType);
Expand Down Expand Up @@ -301,8 +333,11 @@ public void close() throws IOException {
} finally {
closedLock.writeLock().unlock();
}
if (cache != null) {
cache.close();
if (blockCache != null) {
blockCache.close();
}
if (rowCache != null) {
rowCache.close();
}
if (options != null) {
options.close();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
package org.apache.bookkeeper.bookie.storage.ldb;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import org.apache.bookkeeper.conf.ServerConfiguration;
import org.apache.commons.io.FileUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
* Test RocksDB row cache configuration.
*/
public class RocksDBRowCacheTest {

    private KeyValueStorageRocksDB storage;
    private File tempDir;
    private ServerConfiguration conf;

    @Before
    public void setup() throws Exception {
        // Files.createTempDirectory creates the directory atomically; the
        // createTempFile()/delete()/mkdir() dance is racy and silently ignores
        // mkdir() failures.
        tempDir = Files.createTempDirectory("rocksdb-rowcache-test").toFile();

        conf = new ServerConfiguration();
    }

    @After
    public void tearDown() throws Exception {
        if (storage != null) {
            storage.close();
            // Prevent a double-close if a later test fails before reassigning.
            storage = null;
        }
        if (tempDir != null) {
            FileUtils.deleteDirectory(tempDir);
        }
    }

    /**
     * Verify that reads and writes work correctly with the row cache enabled
     * and that repeated reads of a hot key keep returning the same value.
     */
    @Test
    public void testRowCacheEnabled() throws Exception {
        // Set row cache size to 64MB
        conf.setProperty("dbStorage_rocksDB_rowCacheSize", 64 * 1024 * 1024L);

        storage = new KeyValueStorageRocksDB(
                tempDir.getAbsolutePath(),
                "test-db",
                KeyValueStorageFactory.DbConfigType.EntryLocation,
                conf
        );

        // Write some test data. Use an explicit charset: getBytes()/new String()
        // without one depend on the platform default and are not portable.
        byte[] key1 = "test-key-1".getBytes(StandardCharsets.UTF_8);
        byte[] value1 = "test-value-1".getBytes(StandardCharsets.UTF_8);
        byte[] key2 = "test-key-2".getBytes(StandardCharsets.UTF_8);
        byte[] value2 = "test-value-2".getBytes(StandardCharsets.UTF_8);

        storage.put(key1, value1);
        storage.put(key2, value2);
        storage.sync();

        // Read back and verify
        byte[] readValue1 = storage.get(key1);
        byte[] readValue2 = storage.get(key2);

        assertNotNull(readValue1);
        assertNotNull(readValue2);
        assertEquals("test-value-1", new String(readValue1, StandardCharsets.UTF_8));
        assertEquals("test-value-2", new String(readValue2, StandardCharsets.UTF_8));

        // Multiple reads should benefit from the row cache; correctness-wise
        // they must always return the originally written value.
        for (int i = 0; i < 100; i++) {
            byte[] cachedRead = storage.get(key1);
            assertNotNull(cachedRead);
            assertEquals("test-value-1", new String(cachedRead, StandardCharsets.UTF_8));
        }
    }

    /**
     * Verify that the storage works when the row cache size is left unset,
     * i.e. the row cache defaults to disabled.
     */
    @Test
    public void testRowCacheDisabledByDefault() throws Exception {
        // Don't set row cache size - should be disabled by default
        storage = new KeyValueStorageRocksDB(
                tempDir.getAbsolutePath(),
                "test-db",
                KeyValueStorageFactory.DbConfigType.EntryLocation,
                conf
        );

        // Should still work without row cache
        byte[] key = "test-key".getBytes(StandardCharsets.UTF_8);
        byte[] value = "test-value".getBytes(StandardCharsets.UTF_8);

        storage.put(key, value);
        storage.sync();

        byte[] readValue = storage.get(key);
        assertNotNull(readValue);
        assertEquals("test-value", new String(readValue, StandardCharsets.UTF_8));
    }

    /**
     * Verify reads stay correct with a larger data set where hot keys are
     * read repeatedly (the access pattern the row cache is meant to serve).
     */
    @Test
    public void testRowCacheWithLargeData() throws Exception {
        // Set row cache size to 16MB
        conf.setProperty("dbStorage_rocksDB_rowCacheSize", 16 * 1024 * 1024L);

        storage = new KeyValueStorageRocksDB(
                tempDir.getAbsolutePath(),
                "test-db",
                KeyValueStorageFactory.DbConfigType.EntryLocation,
                conf
        );

        // Write larger values that would benefit from row cache
        int numKeys = 1000;
        byte[] largeValue = new byte[1024]; // 1KB per value

        for (int i = 0; i < numKeys; i++) {
            String key = String.format("key-%06d", i);
            storage.put(key.getBytes(StandardCharsets.UTF_8), largeValue);
        }
        storage.sync();

        // Read hot keys multiple times - should be cached
        for (int round = 0; round < 10; round++) {
            for (int i = 0; i < 100; i++) { // Read first 100 keys repeatedly
                String key = String.format("key-%06d", i);
                byte[] value = storage.get(key.getBytes(StandardCharsets.UTF_8));
                assertNotNull(value);
                assertEquals(1024, value.length);
            }
        }
    }
}
9 changes: 9 additions & 0 deletions conf/bk_server.conf
Original file line number Diff line number Diff line change
Expand Up @@ -790,6 +790,15 @@ gcEntryLogMetadataCacheEnabled=false
# database which can reach ~2GB in some cases
# Default is to use 10% / numberOfLedgers of the direct memory size
# dbStorage_rocksDB_blockCacheSize=

# Size of RocksDB row-cache. The row cache caches actual key-value pairs
# (unlike block cache which caches compressed/uncompressed blocks).
# This can significantly improve read performance for frequently accessed keys
# by avoiding SST file lookups, reducing read latency from milliseconds to microseconds.
# Default is 0 (disabled). A typical value would be 64-512MB depending on available memory.
# Note: Row cache uses additional memory on top of block cache.
# dbStorage_rocksDB_rowCacheSize=

# Other RocksDB specific tunables for the entry location database
# dbStorage_rocksDB_writeBufferSizeMB=64
# dbStorage_rocksDB_sstSizeInMB=64
Expand Down
39 changes: 39 additions & 0 deletions conf/bk_server_rocksdb_rowcache_example.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Example configuration for enabling RocksDB row cache in BookKeeper
# This configuration demonstrates how to enable row cache for improved read performance

# Use DbLedgerStorage (required for RocksDB features)
ledgerStorageClass=org.apache.bookkeeper.bookie.storage.ldb.DbLedgerStorage

# RocksDB Block Cache Configuration
# Block cache caches compressed/uncompressed blocks
# Default: 10% of direct memory / number of ledger directories
# 512MB (note: no inline comments after the value - properties files do not support them)
# dbStorage_rocksDB_blockCacheSize=536870912

# RocksDB Row Cache Configuration (NEW)
# Row cache stores actual key-value pairs for faster lookups
# This avoids SST file reads for frequently accessed keys
# Default: 0 (disabled)
# Recommended values: 64MB to 512MB depending on available memory
# 256MB (comment kept on its own line: inline "#" comments after a value are
# not stripped by the properties parser and would break parsing of the number)
dbStorage_rocksDB_rowCacheSize=268435456

# Memory allocation example for a server with 8GB RAM:
# - Write cache: 25% of direct memory (~2GB)
# - Read-ahead cache: 25% of direct memory (~2GB)
# - Block cache: 10% of direct memory (~800MB)
# - Row cache: 256MB (configured above)
# - Total RocksDB memory usage: ~5GB

# Other RocksDB tuning parameters
dbStorage_rocksDB_writeBufferSizeMB=64
dbStorage_rocksDB_sstSizeInMB=64
dbStorage_rocksDB_blockSize=65536
dbStorage_rocksDB_bloomFilterBitsPerKey=10

# Enable compression for better storage efficiency
dbStorage_rocksDB_lz4CompressionEnabled=true

# Note: Row cache is most effective for:
# - Hot keys that are accessed frequently
# - Small to medium-sized values
# - Read-heavy workloads
# - Low-latency requirements (sub-millisecond reads)