From 2a837c91a60ee3dde03354ae215c36afbf3de0a3 Mon Sep 17 00:00:00 2001
From: Elise Hinman <ehinman@usgs.gov>
Date: Fri, 19 Dec 2025 17:24:17 -0600
Subject: [PATCH 1/7] add reference tables function

---
 dataretrieval/waterdata/__init__.py |  2 +
 dataretrieval/waterdata/api.py      | 60 ++++++++++++++++++++++++++++-
 dataretrieval/waterdata/types.py    | 21 ++++++++++
 3 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py
index 39b758f7..30659580 100644
--- a/dataretrieval/waterdata/__init__.py
+++ b/dataretrieval/waterdata/__init__.py
@@ -19,6 +19,7 @@
     get_latest_continuous,
     get_latest_daily,
     get_monitoring_locations,
+    get_reference_table,
     get_samples,
     get_time_series_metadata,
 )
@@ -37,6 +38,7 @@
     "get_latest_continuous",
     "get_latest_daily",
     "get_monitoring_locations",
+    "get_reference_table",
     "get_samples",
     "get_time_series_metadata",
     "_check_profiles",
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index 63f7b819..26f0b45e 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -16,11 +16,17 @@
 from dataretrieval.utils import BaseMetadata, to_str
 from dataretrieval.waterdata.types import (
     CODE_SERVICES,
+    METADATA_COLLECTIONS,
     PROFILE_LOOKUP,
     PROFILES,
     SERVICES,
 )
-from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data
+from dataretrieval.waterdata.utils import (
+    SAMPLES_URL,
+    get_ogc_data,
+    _construct_api_requests,
+    _walk_pages
+)
 
 # Set up logger for this module
 logger = logging.getLogger(__name__)
@@ -1388,6 +1394,72 @@ def get_field_measurements(
 
     return get_ogc_data(args, output_id, service)
 
+def get_reference_table(
+        collection: METADATA_COLLECTIONS,
+        limit: Optional[int] = None,
+        ) -> Tuple[pd.DataFrame, BaseMetadata]:
+    """Get metadata reference tables for the USGS Water Data API.
+
+    Reference tables provide the range of allowable values for parameter
+    arguments in the waterdata module.
+
+    Parameters
+    ----------
+    collection : string
+        One of the following options: "agency-codes", "altitude-datums",
+        "aquifer-codes", "aquifer-types", "coordinate-accuracy-codes",
+        "coordinate-datum-codes", "coordinate-method-codes", "counties",
+        "hydrologic-unit-codes", "medium-codes", "national-aquifer-codes",
+        "parameter-codes", "reliability-codes", "site-types", "states",
+        "statistic-codes", "topographic-codes", "time-zone-codes"
+    limit : numeric, optional
+        The optional limit parameter is used to control the subset of the
+        selected features that should be returned in each page. The maximum
+        allowable limit is 50000. It may be beneficial to set this number lower
+        if your internet connection is spotty. The default (None) will set the
+        limit to the maximum allowable limit for the service.
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        The requested reference table. The "id" column is renamed after
+        the collection: a trailing "s" is dropped, hyphens become
+        underscores and "_id" is appended (e.g. "agency_code_id").
+    md : :obj:`dataretrieval.utils.BaseMetadata`
+        Metadata object created from the API response.
+
+    Raises
+    ------
+    ValueError
+        If ``collection`` is not one of the options listed above.
+    """
+    valid_collections = get_args(METADATA_COLLECTIONS)
+    if collection not in valid_collections:
+        raise ValueError(
+            f"Invalid collection: '{collection}'. "
+            f"Valid options are: {valid_collections}."
+        )
+
+    req = _construct_api_requests(
+        service=collection,
+        limit=limit,
+        skip_geometry=True,
+    )
+    # Run API request and iterate through pages if needed
+    return_list, response = _walk_pages(
+        geopd=False, req=req
+    )
+
+    # Give ID column a more meaningful name
+    if collection.endswith("s"):
+        return_list = return_list.rename(columns={"id": f"{collection[:-1].replace("-", "_")}_id"})
+    else:
+        return_list = return_list.rename(columns={"id": f"{collection.replace("-", "_")}_id"})
+
+    # Create metadata object from response
+    metadata = BaseMetadata(response)
+    return return_list, metadata
+
 
 def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame:
     """Return codes from a Samples code service.
diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py
index 65e73394..f5e1496b 100644
--- a/dataretrieval/waterdata/types.py
+++ b/dataretrieval/waterdata/types.py
@@ -11,6 +11,27 @@
     "states",
 ]
 
+METADATA_COLLECTIONS = Literal[
+    "agency-codes",
+    "altitude-datums",
+    "aquifer-codes",
+    "aquifer-types",
+    "coordinate-accuracy-codes",
+    "coordinate-datum-codes",
+    "coordinate-method-codes",
+    "counties",
+    "hydrologic-unit-codes",
+    "medium-codes",
+    "national-aquifer-codes",
+    "parameter-codes",
+    "reliability-codes",
+    "site-types",
+    "states",
+    "statistic-codes",
+    "topographic-codes",
+    "time-zone-codes",
+]
+
 SERVICES = Literal[
     "activities",
     "locations",

From 5b42f0d17f248ccbb4ed99915ba710ee128a131e Mon Sep 17 00:00:00 2001
From: Elise Hinman <ehinman@usgs.gov>
Date: Mon, 22 Dec 2025 10:00:23 -0600
Subject: [PATCH 2/7] add unit tests for ref tables and change warning to info

---
 dataretrieval/waterdata/utils.py |  2 +-
 tests/waterdata_test.py          | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 46d58b62..4ad8f1c8 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -547,7 +547,7 @@ def _walk_pages(
     logger.info("Requesting: %s", req.url)
 
     if not geopd:
-        logger.warning(
+        logger.info(
             "Geopandas not installed. Geometries will be flattened into pandas DataFrames."
         )
 
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 096a50ae..353e41fa 100755
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -16,6 +16,7 @@
     get_latest_daily,
     get_field_measurements,
     get_time_series_metadata,
+    get_reference_table
 )
 
 def mock_request(requests_mock, request_url, file_path):
@@ -227,4 +228,14 @@ def test_get_time_series_metadata():
     assert hasattr(md, 'url')
     assert hasattr(md, 'query_time')
 
+def test_get_reference_table():
+    df, md = get_reference_table("agency-codes")
+    assert "agency_code_id" in df.columns
+    assert df.shape[0] > 0
+    assert hasattr(md, 'url')
+    assert hasattr(md, 'query_time')
+
+def test_get_reference_table_wrong_name():
+    with pytest.raises(ValueError):
+        get_reference_table("agency-cod")
 

From cf0f4d734762451ccdae28d98d3267373f7686dc Mon Sep 17 00:00:00 2001
From: Elise Hinman <ehinman@usgs.gov>
Date: Mon, 22 Dec 2025 11:05:13 -0600
Subject: [PATCH 3/7] change column ordering logic a little in case someone
 requests id

---
 dataretrieval/waterdata/utils.py | 37 ++++++++++++++++----------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 4ad8f1c8..0d0703a5 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -648,35 +648,34 @@ def _arrange_cols(
     pd.DataFrame or gpd.GeoDataFrame
         The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id.
     """
+
+    # Rename id column to output_id
+    df = df.rename(columns={"id": output_id})
+
+    # If properties are provided, filter to only those columns
     if properties and not all(pd.isna(properties)):
-        if "id" not in properties:
-            # If user refers to service-specific output id in properties,
-            # then rename the "id" column to the output_id (id column is
-            # automatically included).
-            if output_id in properties:
-                df = df.rename(columns={"id": output_id})
-            # If output id is not in properties, but user requests the plural
-            # of the output_id (e.g. "monitoring_locations_id"), then rename
-            # "id" to plural. This is pretty niche.
-            else:
-                plural = output_id.replace("_id", "s_id")
-                if plural in properties:
-                    df = df.rename(columns={"id": plural})
+        # id is technically a valid column from the service, but these
+        # functions make the name more specific. So, if someone requests
+        # 'id', give them the output_id column
+        # Substitute into a new list so the caller's `properties` is untouched
+        properties = [output_id if col == 'id' else col for col in properties]
         df = df.loc[:, [col for col in properties if col in df.columns]]
-    else:
-        df = df.rename(columns={"id": output_id})
-    
+
     # Move meaningless-to-user, extra id columns to the end
     # of the dataframe, if they exist
-    extra_id_cols = set(df.columns).intersection({
+    extra_id_col = set(df.columns).intersection({
         "latest_continuous_id",
         "latest_daily_id",
         "daily_id",
         "continuous_id",
         "field_measurement_id"
         })
-    if extra_id_cols:
-        id_col_order = [col for col in df.columns if col not in extra_id_cols] + list(extra_id_cols)
+
+    # If the arbitrary id column is returned (either due to properties
+    # being none or NaN), then move it to the end of the dataframe, but
+    # if part of properties, keep in requested order
+    if extra_id_col and properties is None or all(pd.isna(properties)):
+        id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col)
         df = df.loc[:, id_col_order]
     
     return df

From eb2b5d3270bc6f2db65c54a65b320d56dc72dc4a Mon Sep 17 00:00:00 2001
From: Elise Hinman <ehinman@usgs.gov>
Date: Mon, 22 Dec 2025 12:49:47 -0600
Subject: [PATCH 4/7] fix flake8

---
 dataretrieval/waterdata/api.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index 26f0b45e..3d714c6c 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -1438,9 +1438,13 @@ def get_reference_table(
 
     # Give ID column a more meaningful name
     if collection.endswith("s"):
-        return_list = return_list.rename(columns={"id": f"{collection[:-1].replace("-", "_")}_id"})
+        return_list = return_list.rename(
+            columns={"id": f"{collection[:-1].replace("-", "_")}_id"}
+            )
     else:
-        return_list = return_list.rename(columns={"id": f"{collection.replace("-", "_")}_id"})
+        return_list = return_list.rename(
+            columns={"id": f"{collection.replace("-", "_")}_id"}
+            )
 
     # Create metadata object from response
     metadata = BaseMetadata(response)

From 2131316629744242863c0d13648a494c78f8e4da Mon Sep 17 00:00:00 2001
From: Elise Hinman <ehinman@usgs.gov>
Date: Mon, 22 Dec 2025 13:33:47 -0600
Subject: [PATCH 5/7] fix f-string quotes and condition precedence

---
 dataretrieval/waterdata/api.py   | 4 ++--
 dataretrieval/waterdata/utils.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index 3d714c6c..59cc5a17 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -1439,11 +1439,11 @@ def get_reference_table(
     # Give ID column a more meaningful name
     if collection.endswith("s"):
         return_list = return_list.rename(
-            columns={"id": f"{collection[:-1].replace("-", "_")}_id"}
+            columns={"id": f"{collection[:-1].replace('-', '_')}_id"}
             )
     else:
         return_list = return_list.rename(
-            columns={"id": f"{collection.replace("-", "_")}_id"}
+            columns={"id": f"{collection.replace('-', '_')}_id"}
             )
 
     # Create metadata object from response
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 0d0703a5..b143528b 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -674,7 +674,7 @@ def _arrange_cols(
     # If the arbitrary id column is returned (either due to properties
     # being none or NaN), then move it to the end of the dataframe, but
     # if part of properties, keep in requested order
-    if extra_id_col and properties is None or all(pd.isna(properties)):
+    if extra_id_col and (properties is None or all(pd.isna(properties))):
         id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col)
         df = df.loc[:, id_col_order]
     

From 171aefcda6419fc245541a6351ebd2f11df1088b Mon Sep 17 00:00:00 2001
From: Elise Hinman <ehinman@usgs.gov>
Date: Mon, 22 Dec 2025 13:54:04 -0600
Subject: [PATCH 6/7] add some more rigorous column ordering tests

---
 tests/waterdata_test.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 353e41fa..12c1662e 100755
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -140,11 +140,20 @@ def test_get_daily_properties():
         time="2025-01-01/..",
         properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"]
     )
-    assert "daily_id" in df.columns
-    assert "geometry" in df.columns
+    assert "daily_id" == df.columns[0]
+    assert "geometry" == df.columns[-1]
     assert df.shape[1] == 6
     assert df.parameter_code.unique().tolist() == ["00060"]
 
+def test_get_daily_properties_id():
+    df,_ = get_daily(
+        monitoring_location_id="USGS-05427718",
+        parameter_code="00060",
+        time="2025-01-01/..",
+        properties=["monitoring_location_id", "id", "parameter_code", "time", "value", "geometry"]
+    )
+    assert "daily_id" == df.columns[1]
+
 def test_get_daily_no_geometry():
     df,_ = get_daily(
         monitoring_location_id="USGS-05427718",
@@ -188,7 +197,7 @@ def test_get_latest_continuous():
         monitoring_location_id=["USGS-05427718", "USGS-05427719"],
         parameter_code=["00060", "00065"]
     )
-    assert "latest_continuous_id" in df.columns
+    assert "latest_continuous_id" == df.columns[-1]
     assert df.shape[0] <= 4
     assert df.statistic_id.unique().tolist() == ["00011"]
     assert hasattr(md, 'url')

From 9236209cfe28e46f89d8d320dfeca4b5e98f2abe Mon Sep 17 00:00:00 2001
From: Elise Hinman <ehinman@usgs.gov>
Date: Tue, 23 Dec 2025 12:41:44 -0600
Subject: [PATCH 7/7] add back in geometry when skip_geometry is false but
 properties do not contain geometry

---
 dataretrieval/waterdata/utils.py | 4 ++++
 tests/waterdata_test.py          | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index b143528b..582491ae 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -653,7 +653,11 @@ def _arrange_cols(
     df = df.rename(columns={"id": output_id})
 
     # If properties are provided, filter to only those columns
+    # plus geometry if skip_geometry is False
     if properties and not all(pd.isna(properties)):
+        # Keep geometry if present; copy so the caller's list is not mutated
+        if 'geometry' in df.columns and 'geometry' not in properties:
+            properties = [*properties, 'geometry']
         # id is technically a valid column from the service, but these
         # functions make the name more specific. So, if someone requests
         # 'id', give them the output_id column
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 12c1662e..abdd823b 100755
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -214,6 +214,15 @@ def test_get_latest_daily():
     assert hasattr(md, 'url')
     assert hasattr(md, 'query_time')
 
+def test_get_latest_daily_properties_geometry():
+    df, md = get_latest_daily(
+        monitoring_location_id=["USGS-05427718", "USGS-05427719"],
+        parameter_code=["00060", "00065"],
+        properties=['monitoring_location_id', 'parameter_code', 'time', 'value', 'unit_of_measure']
+    )
+    assert "geometry" in df.columns
+    assert df.shape[1] == 6
+
 def test_get_field_measurements():
     df, md = get_field_measurements(
         monitoring_location_id="USGS-05427718",