From e8fa5bfe3202cc60341b01fd5416227542dfb46b Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Mon, 22 Dec 2025 10:06:21 -0500 Subject: [PATCH] Incremental table read error handling Why these changes are being introduced: There are a couple of primary ways in which a read method can fail based on the table requested: 1. The table name is invalid and will never work. 2. The table name is valid, but does not yet exist in the DuckDB context. For the first, we want to raise an error immediately. For the second, there is a bit more nuance depending on the table requested. How this addresses that need: For TIMDEXDataset.read_* methods, we operate from the assumption that `records` and `current_records` should always be available and so we raise an error indicating a metadata rebuild is required. But for TIMDEXEmbeddings, and potentially other data sources as they are added, we may legitimately not have data yet. As such, we'll want to log warnings and suggest a refresh, but just return an empty set. Side effects of this change: * Applications like TIM, which now attempt embeddings reading for their `reindex-source` CLI command, will no longer encounter an error if embeddings don't yet exist. * In the rare edge cases of a brand new dataset, we have better error raising and logging. 
Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-306 --- tests/test_embeddings.py | 39 +++++++++++++++++++++----------- tests/test_read.py | 27 +++++++++++++++++++++- timdex_dataset_api/__init__.py | 2 +- timdex_dataset_api/dataset.py | 29 +++++++++++++++++------- timdex_dataset_api/embeddings.py | 13 +++++++++++ 5 files changed, 87 insertions(+), 23 deletions(-) diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py index fef7fc5..62364d5 100644 --- a/tests/test_embeddings.py +++ b/tests/test_embeddings.py @@ -10,7 +10,6 @@ import pytest from tests.utils import generate_sample_embeddings_for_run -from timdex_dataset_api import TIMDEXDataset from timdex_dataset_api.embeddings import ( METADATA_SELECT_FILTER_COLUMNS, TIMDEX_DATASET_EMBEDDINGS_SCHEMA, @@ -302,9 +301,7 @@ def test_current_embeddings_view_single_run(timdex_dataset_for_embeddings_views) # write embeddings for run "apple-1" td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="apple-1")) - - # NOTE: at time of test creation, this manual reload is required - td = TIMDEXDataset(td.location) + td.refresh() # query current_embeddings for apple source using read_dataframe result = td.embeddings.read_dataframe(table="current_embeddings", source="apple") @@ -320,9 +317,7 @@ def test_current_embeddings_view_multiple_runs(timdex_dataset_for_embeddings_vie # write embeddings for runs "orange-1" and "orange-2" td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="orange-1")) td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="orange-2")) - - # NOTE: at time of test creation, this manual reload is required - td = TIMDEXDataset(td.location) + td.refresh() # query current_embeddings for orange source using read_dataframe result = td.embeddings.read_dataframe(table="current_embeddings", source="orange") @@ -363,9 +358,7 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings( td, run_id="lemon-2", 
embedding_timestamp="2025-08-03T00:00:00+00:00" ) ) - - # NOTE: at time of test creation, this manual reload is required - td = TIMDEXDataset(td.location) + td.refresh() # check all embeddings for lemon-2 to verify both writes exist all_lemon_2 = td.embeddings.read_dataframe(table="embeddings", run_id="lemon-2") @@ -416,9 +409,7 @@ def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_v td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00" ) ) - - # NOTE: at time of test creation, this manual reload is required - td = TIMDEXDataset(td.location) + td.refresh() # query all embeddings for lemon source result = td.embeddings.read_dataframe(table="embeddings", source="lemon") @@ -435,3 +426,25 @@ def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_v lemon_2_embeddings = result[result["run_id"] == "lemon-2"] assert len(lemon_2_embeddings) == 10 # 5 from each write assert (lemon_2_embeddings["run_date"] == date(2025, 8, 2)).all() + + +def test_embeddings_read_batches_iter_returns_empty_when_embeddings_missing( + timdex_dataset_empty, caplog +): + result = list(timdex_dataset_empty.embeddings.read_batches_iter()) + assert result == [] + assert ( + "Table 'embeddings' not found in DuckDB context. Embeddings may not yet exist " + "or TIMDEXDataset.refresh() may be required." 
in caplog.text + ) + + +def test_embeddings_read_batches_iter_returns_empty_for_invalid_table( + timdex_embeddings_with_runs, caplog +): + """read_batches_iter returns empty iterator for nonexistent table name.""" + with pytest.raises( + ValueError, + match="Invalid table: 'nonexistent'", + ): + list(timdex_embeddings_with_runs.read_batches_iter(table="nonexistent")) diff --git a/tests/test_read.py b/tests/test_read.py index 9fb8c0c..9941522 100644 --- a/tests/test_read.py +++ b/tests/test_read.py @@ -1,5 +1,5 @@ # ruff: noqa: D205, D209, PLR2004 - +import re import pandas as pd import pyarrow as pa @@ -280,3 +280,28 @@ def test_read_batches_iter_limit_returns_n_rows(timdex_dataset_multi_source): batches = timdex_dataset_multi_source.read_batches_iter(limit=10) table = pa.Table.from_batches(batches) assert len(table) == 10 + + +def test_read_batches_iter_returns_empty_when_metadata_missing( + timdex_dataset_empty, caplog +): + with pytest.raises( + ValueError, + match=re.escape( + "Table 'records' not found in DuckDB context. If this is a new dataset, " + "either records do not yet exist or a " + "TIMDEXDataset.metadata.rebuild_dataset_metadata() may be required." 
+ ), + ): + list(timdex_dataset_empty.read_batches_iter()) + + +def test_read_batches_iter_returns_empty_for_invalid_table( + timdex_dataset_multi_source, caplog +): + """read_batches_iter returns empty iterator for nonexistent table name.""" + with pytest.raises( + ValueError, + match="Invalid table: 'nonexistent'", + ): + list(timdex_dataset_multi_source.read_batches_iter(table="nonexistent")) diff --git a/timdex_dataset_api/__init__.py b/timdex_dataset_api/__init__.py index fb35f27..de45ca2 100644 --- a/timdex_dataset_api/__init__.py +++ b/timdex_dataset_api/__init__.py @@ -5,7 +5,7 @@ from timdex_dataset_api.metadata import TIMDEXDatasetMetadata from timdex_dataset_api.record import DatasetRecord -__version__ = "3.9.0" +__version__ = "3.10.0" __all__ = [ "DatasetEmbedding", diff --git a/timdex_dataset_api/dataset.py b/timdex_dataset_api/dataset.py index 71f17df..016e7cd 100644 --- a/timdex_dataset_api/dataset.py +++ b/timdex_dataset_api/dataset.py @@ -443,17 +443,30 @@ def read_batches_iter( """ start_time = time.perf_counter() + # ensure valid table + if table not in ["records", "current_records"]: + raise ValueError(f"Invalid table: '{table}'") + + # ensure table exists + try: + self.get_sa_table("metadata", table) + except ValueError as exc: + raise ValueError( + f"Table '{table}' not found in DuckDB context. If this is a new " + "dataset, either records do not yet exist or a " + "TIMDEXDataset.metadata.rebuild_dataset_metadata() may be required." 
+ ) from exc + temp_table_name = "read_meta_chunk" total_yield_count = 0 - for i, meta_chunk_df in enumerate( - self._iter_meta_chunks( - table, - limit=limit, - where=where, - **filters, - ) - ): + meta_chunks = self._iter_meta_chunks( + table, + limit=limit, + where=where, + **filters, + ) + for i, meta_chunk_df in enumerate(meta_chunks): batch_time = time.perf_counter() batch_yield_count = len(meta_chunk_df) total_yield_count += batch_yield_count diff --git a/timdex_dataset_api/embeddings.py b/timdex_dataset_api/embeddings.py index 92a1465..4da3177 100644 --- a/timdex_dataset_api/embeddings.py +++ b/timdex_dataset_api/embeddings.py @@ -358,6 +358,19 @@ def read_batches_iter( """ start_time = time.perf_counter() + if table not in ["embeddings", "current_embeddings", "current_run_embeddings"]: + raise ValueError(f"Invalid table: '{table}'") + + # ensure table exists + try: + self.timdex_dataset.get_sa_table("data", table) + except ValueError: + logger.warning( + f"Table '{table}' not found in DuckDB context. Embeddings may not yet " + "exist or TIMDEXDataset.refresh() may be required." + ) + return + data_query = self._build_query( table, columns,