From 2a837c91a60ee3dde03354ae215c36afbf3de0a3 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Dec 2025 17:24:17 -0600 Subject: [PATCH 1/7] add reference tables function --- dataretrieval/waterdata/__init__.py | 2 + dataretrieval/waterdata/api.py | 60 ++++++++++++++++++++++++++++- dataretrieval/waterdata/types.py | 21 ++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py index 39b758f7..30659580 100644 --- a/dataretrieval/waterdata/__init__.py +++ b/dataretrieval/waterdata/__init__.py @@ -19,6 +19,7 @@ get_latest_continuous, get_latest_daily, get_monitoring_locations, + get_reference_table, get_samples, get_time_series_metadata, ) @@ -37,6 +38,7 @@ "get_latest_continuous", "get_latest_daily", "get_monitoring_locations", + "get_reference_table", "get_samples", "get_time_series_metadata", "_check_profiles", diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 63f7b819..26f0b45e 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -16,11 +16,17 @@ from dataretrieval.utils import BaseMetadata, to_str from dataretrieval.waterdata.types import ( CODE_SERVICES, + METADATA_COLLECTIONS, PROFILE_LOOKUP, PROFILES, SERVICES, ) -from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data +from dataretrieval.waterdata.utils import ( + SAMPLES_URL, + get_ogc_data, + _construct_api_requests, + _walk_pages +) # Set up logger for this module logger = logging.getLogger(__name__) @@ -1388,6 +1394,58 @@ def get_field_measurements( return get_ogc_data(args, output_id, service) +def get_reference_table( + collection: str, + limit: Optional[int] = None, + ) -> Tuple[pd.DataFrame, BaseMetadata]: + """Get metadata reference tables for the USGS Water Data API. + + Reference tables provide the range of allowable values for parameter + arguments in the waterdata module. + + Parameters + ---------- + collection : string + One of the following options: "agency-codes", "altitude-datums", + "aquifer-codes", "aquifer-types", "coordinate-accuracy-codes", + "coordinate-datum-codes", "coordinate-method-codes", "counties", + "hydrologic-unit-codes", "medium-codes", "national-aquifer-codes", + "parameter-codes", "reliability-codes", "site-types", "states", + "statistic-codes", "topographic-codes", "time-zone-codes" + limit : numeric, optional + The optional limit parameter is used to control the subset of the + selected features that should be returned in each page. The maximum + allowable limit is 50000. It may be beneficial to set this number lower + if your internet connection is spotty. The default (None) will set the + limit to the maximum allowable limit for the service. + """ + valid_code_services = get_args(METADATA_COLLECTIONS) + if collection not in valid_code_services: + raise ValueError( + f"Invalid code service: '{collection}'. " + f"Valid options are: {valid_code_services}." + ) + + req = _construct_api_requests( + service=collection, + limit=limit, + skip_geometry=True, + ) + # Run API request and iterate through pages if needed + return_list, response = _walk_pages( + geopd=False, req=req + ) + + # Give ID column a more meaningful name + if collection.endswith("s"): + return_list = return_list.rename(columns={"id": f"{collection[:-1].replace("-", "_")}_id"}) + else: + return_list = return_list.rename(columns={"id": f"{collection.replace("-", "_")}_id"}) + + # Create metadata object from response + metadata = BaseMetadata(response) + return return_list, metadata + def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame: """Return codes from a Samples code service. diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py index 65e73394..f5e1496b 100644 --- a/dataretrieval/waterdata/types.py +++ b/dataretrieval/waterdata/types.py @@ -11,6 +11,27 @@ "states", ] +METADATA_COLLECTIONS = Literal[ + "agency-codes", + "altitude-datums", + "aquifer-codes", + "aquifer-types", + "coordinate-accuracy-codes", + "coordinate-datum-codes", + "coordinate-method-codes", + "counties", + "hydrologic-unit-codes", + "medium-codes", + "national-aquifer-codes", + "parameter-codes", + "reliability-codes", + "site-types", + "states", + "statistic-codes", + "topographic-codes", + "time-zone-codes", +] + SERVICES = Literal[ "activities", "locations", From 5b42f0d17f248ccbb4ed99915ba710ee128a131e Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Dec 2025 10:00:23 -0600 Subject: [PATCH 2/7] add unit tests for ref tables and change warning to info --- dataretrieval/waterdata/utils.py | 2 +- tests/waterdata_test.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 46d58b62..4ad8f1c8 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -547,7 +547,7 @@ def _walk_pages( logger.info("Requesting: %s", req.url) if not geopd: - logger.warning( + logger.info( "Geopandas not installed. Geometries will be flattened into pandas DataFrames." ) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 096a50ae..353e41fa 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -16,6 +16,7 @@ get_latest_daily, get_field_measurements, get_time_series_metadata, + get_reference_table ) def mock_request(requests_mock, request_url, file_path): @@ -227,4 +228,14 @@ def test_get_time_series_metadata(): assert hasattr(md, 'url') assert hasattr(md, 'query_time') +def test_get_reference_table(): + df, md = get_reference_table("agency-codes") + assert "agency_code_id" in df.columns + assert df.shape[0] > 0 + assert hasattr(md, 'url') + assert hasattr(md, 'query_time') + +def test_get_reference_table_wrong_name(): + with pytest.raises(ValueError): + get_reference_table("agency-cod") From cf0f4d734762451ccdae28d98d3267373f7686dc Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Dec 2025 11:05:13 -0600 Subject: [PATCH 3/7] change column ordering logic a little in case someone requests id --- dataretrieval/waterdata/utils.py | 37 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 4ad8f1c8..0d0703a5 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -648,35 +648,34 @@ def _arrange_cols( pd.DataFrame or gpd.GeoDataFrame The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. """ + + # Rename id column to output_id + df = df.rename(columns={"id": output_id}) + + # If properties are provided, filter to only those columns if properties and not all(pd.isna(properties)): - if "id" not in properties: - # If user refers to service-specific output id in properties, - # then rename the "id" column to the output_id (id column is - # automatically included). - if output_id in properties: - df = df.rename(columns={"id": output_id}) - # If output id is not in properties, but user requests the plural - # of the output_id (e.g. "monitoring_locations_id"), then rename - # "id" to plural. This is pretty niche. - else: - plural = output_id.replace("_id", "s_id") - if plural in properties: - df = df.rename(columns={"id": plural}) + # id is technically a valid column from the service, but these + # functions make the name more specific. So, if someone requests + # 'id', give them the output_id column + if 'id' in properties: + properties[properties.index('id')] = output_id df = df.loc[:, [col for col in properties if col in df.columns]] - else: - df = df.rename(columns={"id": output_id}) - + # Move meaningless-to-user, extra id columns to the end # of the dataframe, if they exist - extra_id_cols = set(df.columns).intersection({ + extra_id_col = set(df.columns).intersection({ "latest_continuous_id", "latest_daily_id", "daily_id", "continuous_id", "field_measurement_id" }) - if extra_id_cols: - id_col_order = [col for col in df.columns if col not in extra_id_cols] + list(extra_id_cols) + + # If the arbitrary id column is returned (either due to properties + # being none or NaN), then move it to the end of the dataframe, but + # if part of properties, keep in requested order + if extra_id_col and properties is None or all(pd.isna(properties)): + id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col) df = df.loc[:, id_col_order] return df From eb2b5d3270bc6f2db65c54a65b320d56dc72dc4a Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Dec 2025 12:49:47 -0600 Subject: [PATCH 4/7] fix flake8 --- dataretrieval/waterdata/api.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 26f0b45e..3d714c6c 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -1438,9 +1438,13 @@ def get_reference_table( # Give ID column a more meaningful name if collection.endswith("s"): - return_list = return_list.rename(columns={"id": f"{collection[:-1].replace("-", "_")}_id"}) + return_list = return_list.rename( + columns={"id": f"{collection[:-1].replace("-", "_")}_id"} + ) else: - return_list = return_list.rename(columns={"id": f"{collection.replace("-", "_")}_id"}) + return_list = return_list.rename( + columns={"id": f"{collection.replace("-", "_")}_id"} + ) # Create metadata object from response metadata = BaseMetadata(response) From 2131316629744242863c0d13648a494c78f8e4da Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Dec 2025 13:33:47 -0600 Subject: [PATCH 5/7] wrong rocks --- dataretrieval/waterdata/api.py | 4 ++-- dataretrieval/waterdata/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index 3d714c6c..59cc5a17 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -1439,11 +1439,11 @@ def get_reference_table( # Give ID column a more meaningful name if collection.endswith("s"): return_list = return_list.rename( - columns={"id": f"{collection[:-1].replace("-", "_")}_id"} + columns={"id": f"{collection[:-1].replace('-', '_')}_id"} ) else: return_list = return_list.rename( - columns={"id": f"{collection.replace("-", "_")}_id"} + columns={"id": f"{collection.replace('-', '_')}_id"} ) # Create metadata object from response diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 0d0703a5..b143528b 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -674,7 +674,7 @@ def _arrange_cols( # If the arbitrary id column is returned (either due to properties # being none or NaN), then move it to the end of the dataframe, but # if part of properties, keep in requested order - if extra_id_col and properties is None or all(pd.isna(properties)): + if extra_id_col and (properties is None or all(pd.isna(properties))): id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col) df = df.loc[:, id_col_order] From 171aefcda6419fc245541a6351ebd2f11df1088b Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Dec 2025 13:54:04 -0600 Subject: [PATCH 6/7] add some more rigorous column ordering tests --- tests/waterdata_test.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 353e41fa..12c1662e 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -140,11 +140,20 @@ def test_get_daily_properties(): time="2025-01-01/..", properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"] ) - assert "daily_id" in df.columns - assert "geometry" in df.columns + assert "daily_id" == df.columns[0] + assert "geometry" == df.columns[-1] assert df.shape[1] == 6 assert df.parameter_code.unique().tolist() == ["00060"] +def test_get_daily_properties_id(): + df,_ = get_daily( + monitoring_location_id="USGS-05427718", + parameter_code="00060", + time="2025-01-01/..", + properties=["monitoring_location_id", "id", "parameter_code", "time", "value", "geometry"] + ) + assert "daily_id" == df.columns[1] + def test_get_daily_no_geometry(): df,_ = get_daily( monitoring_location_id="USGS-05427718", @@ -188,7 +197,7 @@ def test_get_latest_continuous(): monitoring_location_id=["USGS-05427718", "USGS-05427719"], parameter_code=["00060", "00065"] ) - assert "latest_continuous_id" in df.columns + assert "latest_continuous_id" == df.columns[-1] assert df.shape[0] <= 4 assert df.statistic_id.unique().tolist() == ["00011"] assert hasattr(md, 'url') From 9236209cfe28e46f89d8d320dfeca4b5e98f2abe Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Tue, 23 Dec 2025 12:41:44 -0600 Subject: [PATCH 7/7] add back in geometry when skip_geometry is false but properties do not contain geometry --- dataretrieval/waterdata/utils.py | 4 ++++ tests/waterdata_test.py | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index b143528b..582491ae 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -653,7 +653,11 @@ def _arrange_cols( df = df.rename(columns={"id": output_id}) # If properties are provided, filter to only those columns + # plus geometry if skip_geometry is False if properties and not all(pd.isna(properties)): + # Make sure geometry stays in the dataframe if skip_geometry is False + if 'geometry' in df.columns and 'geometry' not in properties: + properties.append('geometry') # id is technically a valid column from the service, but these # functions make the name more specific. So, if someone requests # 'id', give them the output_id column diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py index 12c1662e..abdd823b 100755 --- a/tests/waterdata_test.py +++ b/tests/waterdata_test.py @@ -214,6 +214,15 @@ def test_get_latest_daily(): assert hasattr(md, 'url') assert hasattr(md, 'query_time') +def test_get_latest_daily_properties_geometry(): + df, md = get_latest_daily( + monitoring_location_id=["USGS-05427718", "USGS-05427719"], + parameter_code=["00060", "00065"], + properties=['monitoring_location_id', 'parameter_code', 'time', 'value', 'unit_of_measure'] + ) + assert "geometry" in df.columns + assert df.shape[1] == 6 + def test_get_field_measurements(): df, md = get_field_measurements( monitoring_location_id="USGS-05427718",