diff --git a/dataretrieval/waterdata/__init__.py b/dataretrieval/waterdata/__init__.py
index 39b758f..3065958 100644
--- a/dataretrieval/waterdata/__init__.py
+++ b/dataretrieval/waterdata/__init__.py
@@ -19,6 +19,7 @@
     get_latest_continuous,
     get_latest_daily,
     get_monitoring_locations,
+    get_reference_table,
     get_samples,
     get_time_series_metadata,
 )
@@ -37,6 +38,7 @@
     "get_latest_continuous",
     "get_latest_daily",
     "get_monitoring_locations",
+    "get_reference_table",
     "get_samples",
     "get_time_series_metadata",
     "_check_profiles",
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
index 63f7b81..59cc5a1 100644
--- a/dataretrieval/waterdata/api.py
+++ b/dataretrieval/waterdata/api.py
@@ -16,11 +16,17 @@
 from dataretrieval.utils import BaseMetadata, to_str
 from dataretrieval.waterdata.types import (
     CODE_SERVICES,
+    METADATA_COLLECTIONS,
     PROFILE_LOOKUP,
     PROFILES,
     SERVICES,
 )
-from dataretrieval.waterdata.utils import SAMPLES_URL, get_ogc_data
+from dataretrieval.waterdata.utils import (
+    SAMPLES_URL,
+    get_ogc_data,
+    _construct_api_requests,
+    _walk_pages,
+)
 
 # Set up logger for this module
 logger = logging.getLogger(__name__)
@@ -1388,6 +1394,79 @@ def get_field_measurements(
 
     return get_ogc_data(args, output_id, service)
 
+
+def get_reference_table(
+    collection: str,
+    limit: Optional[int] = None,
+) -> Tuple[pd.DataFrame, BaseMetadata]:
+    """Get metadata reference tables for the USGS Water Data API.
+
+    Reference tables provide the range of allowable values for parameter
+    arguments in the waterdata module.
+
+    Parameters
+    ----------
+    collection : string
+        One of the following options: "agency-codes", "altitude-datums",
+        "aquifer-codes", "aquifer-types", "coordinate-accuracy-codes",
+        "coordinate-datum-codes", "coordinate-method-codes", "counties",
+        "hydrologic-unit-codes", "medium-codes", "national-aquifer-codes",
+        "parameter-codes", "reliability-codes", "site-types", "states",
+        "statistic-codes", "time-zone-codes", "topographic-codes"
+    limit : numeric, optional
+        The optional limit parameter is used to control the subset of the
+        selected features that should be returned in each page. The maximum
+        allowable limit is 50000. It may be beneficial to set this number lower
+        if your internet connection is spotty. The default (None) will set the
+        limit to the maximum allowable limit for the service.
+
+    Returns
+    -------
+    df : pandas.DataFrame
+        The requested reference table, with the generic "id" column renamed
+        to a collection-specific name (e.g. "agency_code_id").
+    md : dataretrieval.utils.BaseMetadata
+        Custom metadata object describing the API response.
+
+    Raises
+    ------
+    ValueError
+        If ``collection`` is not a recognized reference table name.
+    """
+    valid_collections = get_args(METADATA_COLLECTIONS)
+    if collection not in valid_collections:
+        raise ValueError(
+            f"Invalid collection: '{collection}'. "
+            f"Valid options are: {valid_collections}."
+        )
+
+    # Reference tables carry no geometry, so always skip it
+    req = _construct_api_requests(
+        service=collection,
+        limit=limit,
+        skip_geometry=True,
+    )
+    # Run API request and iterate through pages if needed
+    return_list, response = _walk_pages(
+        geopd=False, req=req
+    )
+
+    # Give ID column a more meaningful name by singularizing the collection
+    # (e.g. "counties" -> "county_id", "agency-codes" -> "agency_code_id")
+    if collection.endswith("ies"):
+        singular = collection[:-3] + "y"
+    elif collection.endswith("s"):
+        singular = collection[:-1]
+    else:
+        singular = collection
+    return_list = return_list.rename(
+        columns={"id": f"{singular.replace('-', '_')}_id"}
+    )
+
+    # Create metadata object from response
+    metadata = BaseMetadata(response)
+    return return_list, metadata
+
 def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame:
     """Return codes from a Samples code service.
 
diff --git a/dataretrieval/waterdata/types.py b/dataretrieval/waterdata/types.py
index 65e7339..f5e1496 100644
--- a/dataretrieval/waterdata/types.py
+++ b/dataretrieval/waterdata/types.py
@@ -11,6 +11,27 @@
     "states",
 ]
 
+METADATA_COLLECTIONS = Literal[
+    "agency-codes",
+    "altitude-datums",
+    "aquifer-codes",
+    "aquifer-types",
+    "coordinate-accuracy-codes",
+    "coordinate-datum-codes",
+    "coordinate-method-codes",
+    "counties",
+    "hydrologic-unit-codes",
+    "medium-codes",
+    "national-aquifer-codes",
+    "parameter-codes",
+    "reliability-codes",
+    "site-types",
+    "states",
+    "statistic-codes",
+    "time-zone-codes",
+    "topographic-codes",
+]
+
 SERVICES = Literal[
     "activities",
     "locations",
diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py
index 46d58b6..582491a 100644
--- a/dataretrieval/waterdata/utils.py
+++ b/dataretrieval/waterdata/utils.py
@@ -547,7 +547,7 @@
     logger.info("Requesting: %s", req.url)
 
     if not geopd:
-        logger.warning(
+        logger.info(
             "Geopandas not installed. Geometries will be flattened into pandas DataFrames."
         )
 
@@ -648,35 +648,40 @@ def _arrange_cols(
     Returns
     -------
     pd.DataFrame or gpd.GeoDataFrame
         The DataFrame with columns rearranged and/or renamed according to
        the specified properties and output_id.
     """
+
+    # Rename id column to output_id
+    df = df.rename(columns={"id": output_id})
+
+    # If properties are provided, filter to only those columns
+    # plus geometry if skip_geometry is False
     if properties and not all(pd.isna(properties)):
-        if "id" not in properties:
-            # If user refers to service-specific output id in properties,
-            # then rename the "id" column to the output_id (id column is
-            # automatically included).
-            if output_id in properties:
-                df = df.rename(columns={"id": output_id})
-            # If output id is not in properties, but user requests the plural
-            # of the output_id (e.g. "monitoring_locations_id"), then rename
-            # "id" to plural. This is pretty niche.
-            else:
-                plural = output_id.replace("_id", "s_id")
-                if plural in properties:
-                    df = df.rename(columns={"id": plural})
+        # Copy so the caller's properties list is not mutated below
+        properties = list(properties)
+        # Make sure geometry stays in the dataframe if skip_geometry is False
+        if 'geometry' in df.columns and 'geometry' not in properties:
+            properties.append('geometry')
+        # id is technically a valid column from the service, but these
+        # functions make the name more specific. So, if someone requests
+        # 'id', give them the output_id column
+        if 'id' in properties:
+            properties[properties.index('id')] = output_id
         df = df.loc[:, [col for col in properties if col in df.columns]]
-    else:
-        df = df.rename(columns={"id": output_id})
-
+
     # Move meaningless-to-user, extra id columns to the end
     # of the dataframe, if they exist
-    extra_id_cols = set(df.columns).intersection({
+    extra_id_col = set(df.columns).intersection({
         "latest_continuous_id", "latest_daily_id", "daily_id",
         "continuous_id", "field_measurement_id"
     })
-    if extra_id_cols:
-        id_col_order = [col for col in df.columns if col not in extra_id_cols] + list(extra_id_cols)
+
+    # If the arbitrary id column is returned (either due to properties
+    # being none or NaN), then move it to the end of the dataframe, but
+    # if part of properties, keep in requested order
+    if extra_id_col and (properties is None or all(pd.isna(properties))):
+        id_col_order = [col for col in df.columns if col not in extra_id_col] + list(extra_id_col)
         df = df.loc[:, id_col_order]
 
     return df
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 096a50a..abdd823 100755
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -16,6 +16,7 @@
     get_latest_daily,
     get_field_measurements,
     get_time_series_metadata,
+    get_reference_table,
 )
 
 def mock_request(requests_mock, request_url, file_path):
@@ -139,11 +140,20 @@
         time="2025-01-01/..",
         properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"]
     )
-    assert "daily_id" in df.columns
-    assert "geometry" in df.columns
+    assert "daily_id" == df.columns[0]
+    assert "geometry" == df.columns[-1]
     assert df.shape[1] == 6
     assert df.parameter_code.unique().tolist() == ["00060"]
 
+def test_get_daily_properties_id():
+    df,_ = get_daily(
+        monitoring_location_id="USGS-05427718",
+        parameter_code="00060",
+        time="2025-01-01/..",
+        properties=["monitoring_location_id", "id", "parameter_code", "time", "value", "geometry"]
+    )
+    assert "daily_id" == df.columns[1]
+
 def test_get_daily_no_geometry():
     df,_ = get_daily(
         monitoring_location_id="USGS-05427718",
@@ -187,7 +197,7 @@
         monitoring_location_id=["USGS-05427718", "USGS-05427719"],
         parameter_code=["00060", "00065"]
     )
-    assert "latest_continuous_id" in df.columns
+    assert "latest_continuous_id" == df.columns[-1]
     assert df.shape[0] <= 4
     assert df.statistic_id.unique().tolist() == ["00011"]
     assert hasattr(md, 'url')
@@ -204,6 +214,15 @@
     assert hasattr(md, 'url')
     assert hasattr(md, 'query_time')
 
+def test_get_latest_daily_properties_geometry():
+    df, md = get_latest_daily(
+        monitoring_location_id=["USGS-05427718", "USGS-05427719"],
+        parameter_code=["00060", "00065"],
+        properties=['monitoring_location_id', 'parameter_code', 'time', 'value', 'unit_of_measure']
+    )
+    assert "geometry" in df.columns
+    assert df.shape[1] == 6
+
 def test_get_field_measurements():
     df, md = get_field_measurements(
         monitoring_location_id="USGS-05427718",
@@ -227,4 +246,14 @@
     assert hasattr(md, 'url')
     assert hasattr(md, 'query_time')
 
+def test_get_reference_table():
+    df, md = get_reference_table("agency-codes")
+    assert "agency_code_id" in df.columns
+    assert df.shape[0] > 0
+    assert hasattr(md, 'url')
+    assert hasattr(md, 'query_time')
+
+def test_get_reference_table_wrong_name():
+    with pytest.raises(ValueError):
+        get_reference_table("agency-cod")
 