diff --git a/docker-compose.yaml b/docker-compose.yaml index ee33e7f..a1891f7 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -20,7 +20,6 @@ services: env_file: - .env volumes: - - ./climate-python:/jupyter/climate-python - - ./mimi-api:/jupyter/mimi-api + - ./src/beaker_climate:/jupyter/beaker_climate working_dir: /jupyter command: ["beaker", "dev", "watch", "--ip", "0.0.0.0"] diff --git a/pyproject.toml b/pyproject.toml index 7a11cb3..46fe3e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "h5netcdf~=1.3.0", "netcdf4~=1.6.5", "cftime~=1.6.3", - "esgf-pyclient~=0.3.1", + "dask", ] [project.urls] diff --git a/src/beaker_climate/climate_python/__init__.py b/src/beaker_climate/beaker_climate/__init__.py similarity index 100% rename from src/beaker_climate/climate_python/__init__.py rename to src/beaker_climate/beaker_climate/__init__.py diff --git a/src/beaker_climate/beaker_climate/agent.py b/src/beaker_climate/beaker_climate/agent.py new file mode 100644 index 0000000..1408227 --- /dev/null +++ b/src/beaker_climate/beaker_climate/agent.py @@ -0,0 +1,205 @@ +import json +import logging +import re +from typing import Optional +import codecs + +import pandas +import matplotlib.pyplot as plt +import xarray as xr + +from archytas.react import Undefined +from archytas.tool_utils import AgentRef, LoopControllerRef, ReactContextRef, tool + +from beaker_kernel.lib import BeakerAgent +from beaker_kernel.lib.context import BaseContext + +from pathlib import Path + +logger = logging.getLogger(__name__) + +from time import sleep +from .search.esgf_search import ESGFProvider + +class ClimateDataUtilityAgent(BeakerAgent): + """ + You are assisting us in modifying geo-temporal datasets. + + The main things you are going to do are regridding spatial datasets, temporally rescaling datasets, and clipping the extent of geo-temporal datasets. + + If you don't have the details necessary to use a tool, you should use the ask_user tool to ask the user for them. + """ + def __init__(self, context: BaseContext = None, tools: list = None, **kwargs): + self.logger = logger + super().__init__(context, tools, **kwargs) + + documentation_path=Path(__file__).parent / "api_documentation" / "climate_search.md" + initial_context_msg_added = False + while not initial_context_msg_added: + with open(documentation_path, 'r') as f: + try: + self.add_context(f'''\ + The Earth System Grid Federation (ESGF) is a global collaboration that manages and distributes climate and environmental science data. + It serves as the primary platform for accessing CMIP (Coupled Model Intercomparison Project) data and other climate model outputs. + The federation provides a distributed database and delivery system for climate science data, particularly model outputs and observational data. + Through ESGF, users can search, discover and access climate datasets from major modeling centers and research institutions worldwide. + The system supports authentication, search capabilities, and data transfer protocols optimized for large scientific datasets. + + If datasets are loaded, use xarray with the OpenDAP URL. + If the user asks to download a dataset, ask them if they are sure they want to download it. + + Additionally, any data downloaded should be downloaded to the './data/' directory. + Please ensure the code makes sure this location exists, and all downloaded data is saved to this location. + + Provided below is the comprehensive documentation of the climate-search tools that you have access to. 
+                    ALWAYS reference this when using the climate-search tools.
+                    ```
+                    {f.read()}
+                    ```
+''')
+                    initial_context_msg_added = True
+                except Exception as e:
+                    sleep(0.5)
+        self.esgf = ESGFProvider(self.oneshot)
+
+
+    @tool()
+    async def search(self, query: str, agent: AgentRef, loop: LoopControllerRef, react_context: ReactContextRef) -> dict:
+        """
+        This tool searches ESGF for datasets.
+        Save the UNMODIFIED JSON output to a variable in the user's notebook.
+
+        Args:
+            query (str): The user's query to pass to the climate search tool.
+        Returns:
+            dict: Unmodified ESGF JSON output, to be saved to a variable in the notebook.
+        """
+        try:
+            return await self.esgf.tool_search(query)
+        except Exception as e:
+            self.add_context(f"The tool failed with this error: {str(e)}. I need to inform the user about this immediately before deciding what to do next. I need to tell the user the exact error with zero summarization.")
+            return {}
+
+
+    @tool()
+    async def fetch(self, dataset_id: str, agent: AgentRef, loop: LoopControllerRef, react_context: ReactContextRef) -> dict:
+        """
+        This tool fetches download and OpenDAP URLs for a dataset.
+
+        Args:
+            dataset_id (str): The dataset ID, as returned in the `id` metadata field by the search tool.
+        Returns:
+            dict: ESGF fetch results
+        """
+        try:
+            return self.esgf.tool_fetch(dataset_id)
+        except Exception as e:
+            self.add_context(f"The tool failed with this error: {str(e)}. I should inform the user immediately with the full text of the error.")
+            return {}
+
+    @tool()
+    async def regrid_dataset(
+        self,
+        dataset: str,
+        target_resolution: tuple,
+        agent: AgentRef,
+        loop: LoopControllerRef,
+        aggregation: Optional[str] = "interp_or_mean",
+    ) -> str:
+        """
+        This tool should be used to show the user code to regrid a netcdf dataset with detectable geo-resolution.
+
+        If a user asks to regrid a dataset, use this tool to return them code to regrid the dataset.
+
+        If you are given a netcdf dataset, use this tool instead of any other regridding tool.
+
+        If you are asked about what is needed to regrid a dataset, please provide information about the arguments of this tool.
+
+        Args:
+            dataset (str): The name of the dataset instantiated in the jupyter notebook.
+            target_resolution (tuple): The target resolution to regrid to, e.g. (0.5, 0.5). This is in degrees longitude and latitude.
+            aggregation (Optional): The aggregation function to be used in the regridding. The options are as follows:
+                'conserve'
+                'min'
+                'max'
+                'mean'
+                'median'
+                'mode'
+                'interp_or_mean'
+                'nearest_or_mode'
+
+        Returns:
+            str: A JSON payload describing a notebook code cell that contains the regridding code.
+        """
+
+        loop.set_state(loop.STOP_SUCCESS)
+        code = agent.context.get_code(
+            "flowcast_regridding",
+            {
+                "dataset": dataset,
+                "target_resolution": target_resolution,
+                "aggregation": aggregation,
+            },
+        )
+
+        result = json.dumps(
+            {
+                "action": "code_cell",
+                "language": "python3",
+                "content": code.strip(),
+            }
+        )
+
+        return result
+
+    @tool()
+    async def get_netcdf_plot(
+        self,
+        dataset_variable_name: str,
+        agent: AgentRef,
+        loop: LoopControllerRef,
+        plot_variable_name: Optional[str] = None,
+        lat_col: Optional[str] = "lat",
+        lon_col: Optional[str] = "lon",
+        time_slice_index: Optional[int] = 1,
+    ) -> str:
+        """
+        This function should be used to get a plot of a netcdf dataset.
+
+        This function should also be used to preview any netcdf dataset.
+
+        If the user asks to plot or preview a dataset, use this tool to return plotting code to them.
+
+        You should also ask if the user wants to specify the optional arguments by telling them what each argument does.
+
+        Args:
+            dataset_variable_name (str): The name of the dataset instantiated in the jupyter notebook.
+            plot_variable_name (Optional): The name of the variable to plot. Defaults to None.
+                If None is provided, the first variable in the dataset will be plotted.
+            lat_col (Optional): The name of the latitude column. Defaults to 'lat'.
+            lon_col (Optional): The name of the longitude column. Defaults to 'lon'.
+            time_slice_index (Optional): The index of the time slice to visualize. Defaults to 1.
+
+        Returns:
+            str: The code used to plot the netcdf.
+        """
+
+        code = agent.context.get_code(
+            "get_netcdf_plot",
+            {
+                "dataset": dataset_variable_name,
+                "plot_variable_name": plot_variable_name,
+                "lat_col": lat_col,
+                "lon_col": lon_col,
+                "time_slice_index": time_slice_index,
+            },
+        )
+
+        result = await agent.context.evaluate(
+            code,
+            parent_header={},
+        )
+
+        output = result.get("return")
+
+        return output
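The `regrid_dataset` tool above only emits code generated from the `flowcast_regridding` template, which this diff renames but does not show. As a rough illustration of what such generated code does, here is a plain-xarray sketch; the file names are placeholders and the mapping of linear interpolation to the `'interp_or_mean'` default is an assumption, not the template's actual contents:

```python
import numpy as np
import xarray as xr

ds = xr.open_dataset("./data/example.nc")  # hypothetical input file
target_lat = np.arange(-90, 90.01, 0.5)    # 0.5 degree latitude grid
target_lon = np.arange(0, 360, 0.5)        # 0.5 degree longitude grid
# linear interpolation roughly corresponds to upsampling with 'interp_or_mean';
# xarray's interp requires scipy to be installed
regridded = ds.interp(lat=target_lat, lon=target_lon)
regridded.to_netcdf("./data/example_0p5deg.nc")
```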
diff --git a/src/beaker_climate/beaker_climate/api_documentation/climate_search.md b/src/beaker_climate/beaker_climate/api_documentation/climate_search.md
new file mode 100644
index 0000000..31265de
--- /dev/null
+++ b/src/beaker_climate/beaker_climate/api_documentation/climate_search.md
@@ -0,0 +1,87 @@
+# climate-data
+
+On first context launch, caching data for search will be created - this may take around a minute.
+
+## Structure
+
+This set of tools follows a very specific workflow.
+* Search for datasets with the `search` tool.
+* Pass the **dataset ID**, and no other fields, to the `fetch` tool to download.
+* Fetch will give URLs to download from; if you are downloading a dataset for the user, use the HTTP set of URLs returned by the `fetch` tool.
+
+Do not use the tools in ways other than this.
+
+The `search` tool will return a JSON payload. Inside the response body, `"results"` is a list containing dataset metadata bundles.
+
+Each entry in `"results"` has a `"metadata"` field containing an `"id"` field. **The `"id"` field is what should be passed to the `fetch` tool to download a file.**
+
+### CMIP6 (ESGF)
+
+By default, climate-data searches all available mirrors for reliability. IDs with a mirror attached (`CMIP6.CMIP.NCAR.CESM2.historical.r11i1p1f1.CFday.ua.gn.v20190514|esgf-data.ucar.edu`) should be considered **interchangeable** with mirrorless versions (`CMIP6.CMIP.NCAR.CESM2.historical.r11i1p1f1.CFday.ua.gn.v20190514`). The mirrorless form is preferred.
+
+#### Search Tool
+
+Required Parameters:
+* `query`: Natural language string with search terms to retrieve datasets for.
+
+Example: `/search/esgf?query=historical eastward wind 100 km cesm2 r11i1p1f1 cfday`
+
+Output:
+```json
+{
+    "results": [
+        {
+            "metadata": {
+                "id": "CMIP6.CMIP.NCAR.CESM2.historical.r11i1p1f1.CFday.ua.gn.v20190514|aims3.llnl.gov",
+                "version": "20190514"...
+            }
+        }, ...
+    ]
+}
+```
+
+`results` is a list of datasets, sorted by relevance.
+
+Each dataset contains a `metadata` field.
+
+`metadata` contains all of the stored metadata for the dataset, provided by ESGF, such as experiment name, title, variables, geospatial coordinates, time, frequency, resolution, and more.
+
+The filesize in bytes of the dataset is in the `size` field of the metadata. Listing metadata attributes about datasets to the user is very useful: convert sizes to human-readable values such as MB or GB, and when asked to describe a dataset, mention coordinates, frequency, and resolution as important details.
+
+**If the user asks for information, mention filesize in human readable units, frequency, resolution, and variable. Summarize the metadata, DO NOT print it to stdout.**
+
+The `metadata` field contains an `id` field that is used for subsequent processing and lookups, containing the full dataset ID with revision and node information, such as: `CMIP6.CMIP.NCAR.CESM2.historical.r11i1p1f1.CFday.ua.gn.v20190514|esgf-data.ucar.edu`
+
+#### Fetch Tool
+
+Required Parameters:
+* `dataset_id`: ID of the dataset provided by search in full format.
+
+Example:
+`/fetch/esgf?dataset_id=CMIP6.CMIP.NCAR.CESM2.historical.r11i1p1f1.CFday.ua.gn.v20190514|esgf-data.ucar.edu`
+
+Output:
+```json
+{
+    "dataset": "CMIP6.CMIP....",
+    "urls": [
+        {
+            "http": [
+                "http://esgf-data.node.example/http/part1...",
+                "http://esgf-data.node.example/http/part2..."
+            ],
+            "opendap": [
+                "http://esgf-data.node.example/opendap/part1...",
+                "http://esgf-data.node.example/opendap/part2..."
+            ]
+        },
+    ],
+    "metadata": {}
+}
+```
+
+The `urls` field returns a list of dicts mapping **protocol** to **a list of URLs** that together comprise the download for each dataset. These files may be large, so a download may be one single URL or multipart, with multiple URLs.
+
+HTTP urls are provided for plain downloads.
+
+OpenDAP supports `xarray.open_mfdataset()` for lazy network and disk usage.
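+
+#### Example: Lazy Loading via OpenDAP
+
+A minimal, illustrative sketch of consuming `fetch` output with xarray (the variable names are placeholders; `open_mfdataset` uses dask, which this change adds as a dependency):
+
+```python
+import xarray as xr
+
+fetched = ...  # JSON output of the fetch tool, saved to a notebook variable
+opendap_urls = fetched["urls"][0]["opendap"]
+# opens all parts as one dataset without downloading the underlying files
+ds = xr.open_mfdataset(opendap_urls, combine="by_coords")
+```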
diff --git a/src/beaker_climate/beaker_climate/context.py b/src/beaker_climate/beaker_climate/context.py
new file mode 100644
index 0000000..d468e40
--- /dev/null
+++ b/src/beaker_climate/beaker_climate/context.py
@@ -0,0 +1,18 @@
+import logging
+from typing import TYPE_CHECKING, Any, Dict
+
+from beaker_kernel.lib import BeakerContext
+
+from .agent import ClimateDataUtilityAgent
+
+if TYPE_CHECKING:
+    from beaker_kernel.lib import BeakerContext
+
+logger = logging.getLogger(__name__)
+
+class ClimateDataUtilityContext(BeakerContext):
+    compatible_subkernels = ["python3"]
+    SLUG = "beaker_climate"
+
+    def __init__(self, beaker_kernel: "BeakerKernel", config: Dict[str, Any]) -> None:
+        super().__init__(beaker_kernel, ClimateDataUtilityAgent, config)
diff --git a/src/beaker_climate/climate_python/procedures/python3/flowcast_regridding.py b/src/beaker_climate/beaker_climate/procedures/python3/flowcast_regridding.py
similarity index 100%
rename from src/beaker_climate/climate_python/procedures/python3/flowcast_regridding.py
rename to src/beaker_climate/beaker_climate/procedures/python3/flowcast_regridding.py
diff --git a/src/beaker_climate/climate_python/procedures/python3/get_netcdf_plot.py b/src/beaker_climate/beaker_climate/procedures/python3/get_netcdf_plot.py
similarity index 100%
rename from src/beaker_climate/climate_python/procedures/python3/get_netcdf_plot.py
rename to src/beaker_climate/beaker_climate/procedures/python3/get_netcdf_plot.py
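For orientation, the provider added below ultimately issues plain REST calls against an ESGF index node's `/esg-search/search` endpoint. A minimal sketch of the equivalent raw request, mirroring the parameters used in `run_esgf_dataset_query` (the query value here is an example):

```python
import requests

params = {
    "query": "ua AND CESM2 AND historical",  # a plain Lucene expression
    "project": "CMIP6",
    "fields": "*",
    "latest": "true",
    "limit": 20,
    "format": "application/solr+json",
    "distrib": "true",
}
r = requests.get("https://esgf-node.llnl.gov/esg-search/search", params=params)
docs = r.json()["response"]["docs"]  # one metadata dict per matching dataset
```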
diff --git a/src/beaker_climate/beaker_climate/search/esgf_search.py b/src/beaker_climate/beaker_climate/search/esgf_search.py
new file mode 100644
index 0000000..f943c4d
--- /dev/null
+++ b/src/beaker_climate/beaker_climate/search/esgf_search.py
@@ -0,0 +1,388 @@
+import itertools
+import json
+import os
+import re
+from dataclasses import dataclass
+from typing import Any
+from urllib.parse import urlencode
+
+import dask
+import requests
+
+DatasetSearchResults = list[dict[str, Any]]
+AccessURLs = list[dict[str, list[str]]]  # one entry per mirror: {method: [urls]}
+
+
+DEFAULT_ESGF_FALLBACKS = [
+    "https://esgf-node.ornl.gov/esg-search",
+    "https://ds.nccs.nasa.gov/esg-search",
+    "https://dpesgf03.nccs.nasa.gov/esg-search",
+    "https://esg-dn1.nsc.liu.se/esg-search",
+    "https://esg-dn2.nsc.liu.se/esg-search",
+    "https://esg-dn3.nsc.liu.se/esg-search",
+    "https://cmip.bcc.cma.cn/esg-search",
+    "http://cmip.fio.org.cn/esg-search",
+    "http://cordexesg.dmi.dk/esg-search",
+    "http://data.meteo.unican.es/esg-search",
+    "http://esg-cccr.tropmet.res.in/esg-search",
+]
+
+@dataclass
+class Settings:
+    esgf_url: str = os.environ.get("ESGF_URL", "https://esgf-node.llnl.gov/esg-search")
+    esgf_fallbacks: str = os.environ.get("ESGF_FALLBACKS", ",".join(DEFAULT_ESGF_FALLBACKS))
+    default_facets: str = "project,experiment_family"
+    entries_per_page: int = 20
+
+default_settings = Settings()
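+
+# Illustrative: both the primary node and the fallback list can be overridden
+# through the environment before this module is imported, e.g.
+#   ESGF_URL="https://esgf-node.ornl.gov/esg-search"
+#   ESGF_FALLBACKS="https://esgf-data.dkrz.de/esg-search,https://esgf-index1.ceda.ac.uk/esg-search"
+# (the override values above are examples; any ESGF index node exposing
+# /esg-search should work)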
+
+
+def generate_natural_language_system_prompt(facets: dict[str, list[str]]) -> str:
+    return f"""\
+You are an assistant trying to help a user determine which variables, sources, experiments, resolutions,
+variants, institutions, and frequencies from ESGF's CMIP6 are being referenced in their natural language query.
+
+Here is a list of variable_descriptions: {facets['variable_long_name']}
+Here is a list of variables: {facets['variable_id']}
+Here is a list of source_ids: {facets['source_id']}
+Here is a list of experiment_ids: {facets['experiment_id']}
+Here is a list of nominal_resolutions: {facets['nominal_resolution']}
+Here is a list of institution_ids: {facets['institution_id']}
+Here is a list of variant_labels: {facets['variant_label']}
+Here is a list of frequencies: {facets['frequency']}
+
+You should respond by building a dictionary that has the following keys:
+    [variable_descriptions, variable, source_id, experiment_id, nominal_resolution, institution_id, variant_label, frequency]
+
+Please select up to three variable_descriptions from the variable_descriptions list that most closely match the user's query and assign those variable_descriptions to the variable_descriptions key.
+If none clearly and obviously match, assign an empty string ''.
+
+Please select up to three variables from the variables list that most closely match the user's query and assign those variables to the variable key.
+If none clearly and obviously match, assign an empty string ''.
+
+Please select one and ONLY ONE source_id from the source_ids list that most closely matches the user's query and assign ONLY that source_id to the source_id key.
+If none clearly and obviously match, assign an empty string ''.
+
+Please select one and ONLY ONE experiment_id from the experiment_ids list that most closely matches the user's query and assign ONLY that experiment_id to the experiment_id key.
+If none clearly and obviously match, assign an empty string ''.
+
+Please select one and ONLY ONE nominal_resolution from the nominal_resolutions list that most closely matches the user's query and assign ONLY that nominal_resolution to the nominal_resolution key.
+If none clearly and obviously match, assign an empty string ''.
+
+Please select one and ONLY ONE institution_id from the institution_ids list that most closely matches the user's query and assign ONLY that institution_id to the institution_id key.
+If none clearly and obviously match, assign an empty string ''.
+
+Please select one and ONLY ONE variant_label from the variant_labels list that most closely matches the user's query and assign ONLY that variant_label to the variant_label key.
+If none clearly and obviously match, assign an empty string ''.
+
+Please select one and ONLY ONE frequency from the frequencies list that most closely matches the user's query and assign ONLY that frequency to the frequency key.
+If none clearly and obviously match, assign an empty string ''.
+
+Ensure that your response is properly formatted JSON please.
+
+DO NOT INCLUDE CODE TAGS OR A SPECIFIER FOR LANGUAGE.
+YOUR OUTPUT MUST BE A VALID, PARSEABLE JSON OBJECT.
+
+Also, when you are selecting variable, source_id, experiment_id, nominal_resolution, institution_id, variant_label, and frequency, make sure to select
+the most simple and obvious choice -- no fancy footwork here please.
+"""
+
+
+SEARCH_FACETS = [
+    "experiment_title",
+    "cf_standard_name",
+    "variable_long_name",
+    "variable_id",
+    "table_id",
+    "source_type",
+    "source_id",
+    "activity_id",
+    "nominal_resolution",
+    "frequency",
+    "realm",
+    "institution_id",
+    "variant_label",
+    "experiment_id",
+    "grid_label",
+]
+
+
+class ESGFProvider():
+    def __init__(self, agent_fn):
+        print("initializing esgf search provider")
+        self.agent_fn = agent_fn
+        self.search_mirrors = [
+            default_settings.esgf_url,
+            *default_settings.esgf_fallbacks.split(","),
+        ]
+        self.current_mirror_index = 0
+        self.retries = 0
+        self.max_retries = len(self.search_mirrors)
+        self.with_all_available_mirrors(self.get_facet_possibilities)
+
+    async def tool_search(self, query: str):
+        return await self.search(query, 1, False)
+
+    def tool_fetch(self, dataset_id: str):
+        urls = self.get_all_access_paths_by_id(dataset_id)
+        metadata = self.get_metadata_for_dataset(dataset_id)
+        return {"dataset": dataset_id, "urls": urls, "metadata": metadata}
+
+    def increment_mirror(self):
+        self.current_mirror_index += 1
+        self.current_mirror_index = self.current_mirror_index % len(self.search_mirrors)
+
+    def with_all_available_mirrors(self, func, *args, **kwargs) -> Any:
+        self.retries = 0
+        return_value = None
+        while self.retries < self.max_retries:
+            try:
+                return_value = func(*args, **kwargs)
+                break
+            except Exception as e:
+                print(
+                    f"failed to run: retry {self.retries}, mirror: {self.search_mirrors[self.current_mirror_index]} with error '{str(e)}'",
+                    flush=True,
+                )
+                self.increment_mirror()
+                self.retries += 1
+                if self.retries >= self.max_retries:
+                    raise Exception(f"failed after {self.retries} retries: {e}")
+        return return_value
+
+    def get_esgf_url_with_current_mirror(self) -> str:
+        mirror = self.search_mirrors[self.current_mirror_index]
+        return f"{mirror}/search"
+
+    def get_facet_possibilities(self):
+        query = {
+            "project": "CMIP6",
+            "facets": ",".join(SEARCH_FACETS),
+            "limit": "0",
+            "format": "application/solr+json",
+        }
+        base_url = self.get_esgf_url_with_current_mirror()
+        response = requests.get(base_url, params=query)
+        if response.status_code >= 300:
+            msg = f"failed to fetch available facets: {response.status_code}, {response.content}"
+            raise Exception(msg)
+        facets = response.json()
+        self.facet_possibilities = facets["facet_counts"]["facet_fields"]
+        for facet, terms in self.facet_possibilities.items():
+            # solr returns [term, count, term, count, ...]; keep only the terms
+            self.facet_possibilities[facet] = terms[0::2]
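+
+    # Illustrative: any call that hits the current mirror can be wrapped with
+    # the failover helper, e.g.
+    #   provider.with_all_available_mirrors(provider.get_facet_possibilities)
+    # On an exception the provider rotates to the next mirror and retries,
+    # raising only after every configured mirror has been tried once.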
+
+    async def search(self, query: str, page: int, keywords: bool) -> dict[str, Any]:
+        """
+        converts a natural language query to a list of ESGF dataset
+        metadata dictionaries by running a Lucene query against the given
+        ESGF node in settings.
+
+        keywords: pass keywords directly to ESGF with no LLM in the middle
+        """
+        if keywords:
+            print(f"keyword searching for {query}", flush=True)
+            return self.keyword_search(query, page)
+        return await self.natural_language_search(query, page)
+
+    def get_all_access_paths_by_id(self, dataset_id: str) -> AccessURLs:
+        return [
+            self.with_all_available_mirrors(self.get_access_paths_by_id, mirror_id)
+            for mirror_id in self.with_all_available_mirrors(
+                self.get_mirrors_for_dataset, dataset_id
+            )
+        ]
+
+    def get_mirrors_for_dataset(self, dataset_id: str) -> list[str]:
+        # strip the mirror suffix (after the vertical bar) if one is attached
+        dataset_id = dataset_id.split("|")[0]
+        response = self.run_esgf_dataset_query(f"id:{dataset_id}*", 1, {})
+        full_ids = [d["id"] for d in response]
+        return full_ids
+
+    def get_datasets_from_id(self, dataset_id: str) -> list[dict[str, Any]]:
+        """
+        returns the list of file records for a given dataset ID. includes mirrors.
+        """
+        if dataset_id == "":
+            return []
+        params = urlencode(
+            {
+                "type": "File",
+                "format": "application/solr+json",
+                "dataset_id": dataset_id,
+                "limit": 200,
+            }
+        )
+        base_url = self.get_esgf_url_with_current_mirror()
+        full_url = f"{base_url}?{params}"
+        r = requests.get(full_url)
+        response = r.json()
+        if r.status_code != 200:
+            raise ConnectionError(
+                f"Failed to extract files from dataset via file search: {full_url} {response}"
+            )
+        datasets = response["response"]["docs"]
+        if len(datasets) == 0:
+            raise ConnectionError(
+                f"Failed to extract files from dataset: empty list {full_url}"
+            )
+        return datasets
+
+    def get_access_paths_by_id(self, dataset_id: str) -> dict[str, list[str]]:
+        """
+        returns the HTTP and OpenDAP URLs, grouped by protocol, for a dataset's files.
+        """
+        files = self.get_datasets_from_id(dataset_id)
+
+        # file url responses are lists of strings with their protocols separated by |
+        # e.x. https://esgf-node.example|mimetype|OPENDAP
+        def select(files, selector):
+            return [
+                url.split("|")[0]
+                for url in itertools.chain.from_iterable([f["url"] for f in files])
+                if selector in url
+            ]
+
+        http_urls = select(files, "HTTP")
+        # sometimes the opendap request form is returned. we strip the trailing suffix if needed
+        opendap_urls = select(files, "OPENDAP")
+        opendap_urls = [u[:-5] if u.endswith(".nc.html") else u for u in opendap_urls]
+
+        return {"opendap": opendap_urls, "http": http_urls}
+
+    def get_metadata_for_dataset(self, dataset_id: str) -> dict[str, Any]:
+        """
+        returns the stored ESGF metadata for the given dataset.
+        """
+        datasets = self.get_datasets_from_id(dataset_id)
+        if len(datasets) == 0:
+            msg = "no datasets found for given ID"
+            raise ValueError(msg)
+        return datasets[0]
+
+    def get_access_paths(self, dataset) -> AccessURLs:
+        return self.get_all_access_paths_by_id(dataset["id"])
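+
+    # Illustrative shape of the AccessURLs value built above, one entry per
+    # mirror hosting the dataset (hostnames and paths are placeholders):
+    #   [
+    #       {
+    #           "http": ["https://host/thredds/fileServer/...part1.nc", ...],
+    #           "opendap": ["https://host/thredds/dodsC/...part1.nc", ...],
+    #       },
+    #   ]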
+
+    def keyword_search(self, query: str, page: int) -> dict[str, Any]:
+        """
+        converts a list of keywords to an ESGF query and runs it against the node.
+        """
+        lucene_query_statements = ["AND", "OR", "(", ")"]
+        if any([query.find(substring) != -1 for substring in lucene_query_statements]):
+            datasets = self.run_esgf_dataset_query(query, page, options={})
+            return {"query": {"raw": query}, "results": datasets}
+        else:
+            stripped_query = re.sub(r"[^A-Za-z0-9 ]+", "", query)
+            lucene_query = " AND ".join(stripped_query.split(" "))
+            datasets = self.run_esgf_dataset_query(lucene_query, page, options={})
+            return {
+                "query": {
+                    "original": query,
+                    "raw": lucene_query,
+                },
+                "results": datasets,
+            }
+
+    async def natural_language_search(
+        self, search_query: str, page: int, retries=0
+    ) -> dict[str, Any]:
+        """
+        converts a natural language query to a Lucene query, runs it against the
+        ESGF node, and returns a list of datasets.
+        """
+        search_terms_json = await self.process_natural_language(search_query)
+        search_terms_json = re.sub(r'`', '', search_terms_json)
+        try:
+            search_terms = json.loads(search_terms_json)
+        except ValueError as e:
+            print(
+                f"the LLM returned more than just JSON, retrying query... \n {e} {search_terms_json}"
+            )
+            if retries >= 3:
+                print("the LLM returned non-JSON in multiple retries, exiting")
+                return {
+                    "error": f"the LLM returned non-JSON in multiple retries. raw text: {search_terms_json}"
+                }
+            return await self.natural_language_search(search_query, page, retries + 1)
+        query = " AND ".join(
+            [
+                (
+                    search_term.strip()
+                    if isinstance(search_term, str)
+                    else "({})".format(
+                        " OR ".join(
+                            filter(lambda term: term.strip() != "", search_term)
+                        )
+                    )
+                )
+                for search_term in filter(
+                    lambda element: element != "", search_terms.values()
+                )
+            ]
+        )
+        datasets = self.with_all_available_mirrors(
+            self.run_esgf_dataset_query, query, page, options={}
+        )
+        return {
+            "query": {"raw": query, "search_terms": search_terms},
+            "results": datasets,
+        }
+
+    def build_natural_language_prompt(self, search_query: str) -> str:
+        """
+        wraps user input given to the LLM after the context.
+        """
+        return "Convert the following input text: {}".format(search_query)
+
+    async def process_natural_language(self, search_query: str) -> str:
+        """
+        runs the query against the LLM and returns the result string.
+        """
+        prompt = generate_natural_language_system_prompt(
+            self.facet_possibilities
+        )
+        query = self.build_natural_language_prompt(search_query)
+        return await self.agent_fn(prompt, query)
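+
+    # Illustrative: if the LLM returns
+    #   {"variable": ["ua", "va"], "source_id": "CESM2", "experiment_id": "historical",
+    #    "nominal_resolution": "", "institution_id": "", "variant_label": "", "frequency": "day"}
+    # then natural_language_search assembles the Lucene query:
+    #   (ua OR va) AND CESM2 AND historical AND day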
+
+    def run_esgf_dataset_query(
+        self, query_string: str, page: int, options: dict[str, str]
+    ) -> DatasetSearchResults:
+        """
+        runs the formatted Apache Lucene query against the ESGF node and returns
+        the metadata of the matching datasets.
+        """
+        encoded_string = urlencode(
+            {
+                "query": query_string,
+                "project": "CMIP6",
+                "fields": "*",
+                "latest": "true",
+                "sort": "true",
+                "limit": f"{default_settings.entries_per_page}",
+                "offset": "{}".format(default_settings.entries_per_page * (page - 1)),
+                "format": "application/solr+json",
+                "distrib": "true",
+            }
+            | options
+        )
+
+        base_url = self.get_esgf_url_with_current_mirror()
+        full_url = f"{base_url}?{encoded_string}"
+        r = requests.get(full_url)
+        if r.status_code != 200:
+            error = str(r.content)
+            raise ConnectionError(
+                f"Failed to search against ESGF node: {full_url}: error from node upstream is: {r.status_code} {error}"
+            )
+        response = r.json()
+
+        # materialize the metadata list in one dask.compute pass; each document is
+        # wrapped in dask.delayed so that per-dataset work (e.g. URL fetching) can
+        # be deferred and parallelized later
+        return dask.compute(
+            [
+                dask.delayed(lambda metadata: metadata)(doc)
+                for doc in response["response"]["docs"]
+            ]
+        )[0]
diff --git a/src/beaker_climate/climate_python/README.md b/src/beaker_climate/climate_python/README.md
deleted file mode 100644
index 358b421..0000000
--- a/src/beaker_climate/climate_python/README.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# beaker-climate - an extension for [Beaker notebooks](https://github.com/jataware/beaker-kernel)
-
------
-
-## Table of Contents
-
-- [About Beaker](#about-beaker)
-- [Installation](#installation)
-
-
-## About Beaker
-
-Beaker provides Contextually-aware notebooks with built-in AI assistant. It is built atop Jupyter, leveraging the deep Jupyter ecosystem.
-
-It consists of multiple aspects, including:
-- A server for hosting/running Beaker/Jupyter sessions.
-- The Beaker kernel, an advanced Jupyter Kernel.
-- Beaker-TS, a TypeScript/JavaScript library.
-- A Vue based, reactive, extensible UI interface.
-- Beaker-Vue, a Vue3 component library for building your own UIs with minimal hassle.
-
-Beaker can be extended with new [contexts](https://jataware.github.io/beaker-kernel/contexts.html) and [subkernels](https://jataware.github.io/beaker-kernel/subkernels.html)
-
-Learn more in the [Beaker documentation](https://jataware.github.io/beaker-kernel/).
-
-## Installation
-
-To add any contained contexts or subkernels to Beaker, you simply need to install this package. The provided elements will be available in Beaker upon next start.
-
-### PyPI install (if deployed)
-```console
-pip install beaker-climate
-```
-
-### beaker CLI (installs project in dev mode)
-```console
-beaker project update beaker-climate
-```
-
-### local pip dev mode install
-```console
-cd climate-python
-pip install -e .
-```
-
-### local pip install
-```console
-cd climate-python
-pip install .
-```
-
-### Note
-Some changes, such as adding or moving a context require updating/reinstalling the project.
-You should run `beaker project update` if you encounter issues after making updates to the project.
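End to end, the replacement provider supports a notebook flow like the following sketch. The import path is assumed from the file layout in this diff, and `oneshot` stands in for the agent-supplied async LLM callable passed as `agent_fn`; neither is confirmed by the patch itself:

```python
import xarray as xr

# path assumed from src/beaker_climate/beaker_climate/search/esgf_search.py
from beaker_climate.beaker_climate.search.esgf_search import ESGFProvider

async def main(oneshot):
    provider = ESGFProvider(oneshot)  # oneshot: async (prompt, query) -> str
    results = await provider.tool_search("daily eastward wind CESM2 historical")
    first = results["results"][0]
    # each result is a raw Solr metadata document with an "id" field
    # (the documentation above nests this under "metadata")
    dataset_id = first.get("id") or first["metadata"]["id"]
    fetched = provider.tool_fetch(dataset_id)
    # open the dataset lazily over OpenDAP rather than downloading it
    ds = xr.open_mfdataset(fetched["urls"][0]["opendap"], combine="by_coords")
    print(ds)

# run with: await main(oneshot) from an async context
```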
diff --git a/src/beaker_climate/climate_python/agent.py b/src/beaker_climate/climate_python/agent.py deleted file mode 100644 index 3595462..0000000 --- a/src/beaker_climate/climate_python/agent.py +++ /dev/null @@ -1,389 +0,0 @@ -import json -import logging -import re -from typing import Optional -import codecs - -import pandas -import matplotlib.pyplot as plt -import xarray as xr - -from archytas.react import Undefined -from archytas.tool_utils import AgentRef, LoopControllerRef, ReactContextRef, tool - -from beaker_kernel.lib import BeakerAgent -from beaker_kernel.lib.context import BaseContext - -from pathlib import Path -from adhoc_api.tool import AdhocApi, APISpec - -logger = logging.getLogger(__name__) - -class MessageLogger(): - def __init__(self, context): - self.context = context - def info(self, message): - self.context.send_response("iopub", - "gemini_info", { - "body": message - }, - ) - def error(self, message): - self.context.send_response("iopub", - "gemini_error", { - "body": message - }, - ) - -class ClimateDataUtilityAgent(BeakerAgent): - - """ - You are assisting us in modifying geo-temporal datasets. - - The main things you are going to do are regridding spatial datasets, temporally rescaling datasets, and clipping the extent of geo-temporal datasets. - - If you don't have the details necessary to use a tool, you should use the ask_user tool to ask the user for them. - - """ - def __init__(self, context: BaseContext = None, tools: list = None, **kwargs): - super().__init__(context, tools, **kwargs) - self.here = Path(__file__).parent - self.logger = MessageLogger(self.context) - try: - self.esgf_api_adhoc = AdhocApi(apis=[self.get_esgf_api()], - drafter_config={'model': 'gemini-1.5-pro-001', 'ttl_seconds': 3600}, - finalizer_config={'model': 'gpt-4o'}, - logger=self.logger, - # run_code=python.run # don't include so top level agent will run the code itself - ) - except ValueError as e: - self.esgf_api_adhoc = None - - def get_esgf_api(self) -> APISpec: - documentation = (self.here/'api_documentation'/'esgf_rest_documentation.md').read_text() - ESGF_DESCRIPTION = '''\ - The Earth System Grid Federation (ESGF) is a global collaboration that manages and distributes climate and environmental science data. - It serves as the primary platform for accessing CMIP (Coupled Model Intercomparison Project) data and other climate model outputs. - The federation provides a distributed database and delivery system for climate science data, particularly model outputs and observational data. - Through ESGF, users can search, discover and access climate datasets from major modeling centers and research institutions worldwide. - The system supports authentication, search capabilities, and data transfer protocols optimized for large scientific datasets. - ''' - - ESGF_ADDITIONAL_INFO_REST = '''\ - For download/OpenDAP URLs, the Thredds catalog URL is now DEPRECATED. If you see a URL like: - - https://aims3.llnl.gov/thredds/catalog/esgcet/306/CMIP6.ScenarioMIP.NCAR.CESM2-WACCM.ssp585.r1i1p1f1.Oday.tos.gr.v20190815.xml#CMIP6.ScenarioMIP.NCAR.CESM2-WACCM.ssp585.r1i1p1f1.Oday.tos.gr.v20190815 - - You should reformat it to something like: - - http://aims3.llnl.gov/thredds/dodsC/cmip6/ScenarioMIP/NCAR/CESM2-WACCM/ssp585/r1i1p1f1/Oday/tos/gr/v20190815/tos_Oday_CESM2-WACCM_ssp585_r1i1p1f1_gr_20150102-21010101.nc - - Additionally, any data downloaded should be downloaded to the './data/' directory. 
- Please ensure the code makes sure this location exists, and all downloaded data is saved to this location. - ''' - - # ESGF_ADDITIONAL_INFO = '''\ - # Be sure to import and instantiate the client for the ESGF API. For example: - # ```python - # from pyesgf.search import SearchConnection - # ``` - - # You should always use http://esgf-node.llnl.gov/esg-search as the search node unless it times out. - - # When performing a search, you MUST always specify the facets as its own argument. For example: - - # ```python - # facets='project,experiment_family' - # ctx = conn.new_context(project='CMIP5', query='humidity', facets=facets) - # ctx.hit_count - # ``` - - # In a SEARCH, if the user asks you to find something (e.g. humidity, precipitation, etc.), you should use the query argument. - # You should NEVER use the variable or experiment_id parameters, they are just way too specific. Stuff as much as you can - # into the query parameter and work with the user to refine the query over time. Never, EVER print all the results of a search, - # it could be HUGE. Collect the results into a variable and slice some for presentation to the user. Refer to the search results data - # model for more information on how to work with it. Note that the only attribute on a search result `DatasetResult` - # is `dataset_id`, so if you want to capture the results, you can iterate through the results and collect the `dataset_id` of each - # result. Just note that search results are an iterable, not a list, so you should loop over the first ~10 to 100 results to get a good sample. - # You can't just slice them! You can check the number of results by calling `ctx.hit_count` which is wise to do before collecting all results. - - # For other things, like getting more detail about a dataset or downloading a dataset you MUST - # use the instructions available to you in the associated API documentation. - - # Additionally, any data downloaded should be downloaded to the './data/' directory. - # Please ensure the code makes sure this location exists, and all downloaded data is saved to this location. - # ''' - - esgf_api_spec: APISpec = { - 'name': "Earth System Grid Federation (ESGF)", - 'cache_key': 'api_assistant_esgf_client', - 'description': ESGF_DESCRIPTION, - 'documentation': documentation, - 'proofread_instructions': ESGF_ADDITIONAL_INFO_REST - } - return esgf_api_spec - - - @tool() - async def use_esgf_api(self, goal: str, agent: AgentRef, loop: LoopControllerRef, react_context: ReactContextRef) -> str: - """ - This tool should be used to submit a request to the ESGF API. This can be used - for searching for datasets, downloading datasets, etc. This can include climate data such - as CMIP5, CMIP6, etc. - - Args: - goal (str): The goal of the interaction with the ESGF API. - - Returns: - str: The code generated as a result of the ESGF API request. - """ - name = "Earth System Grid Federation (ESGF)" - code = self.esgf_api_adhoc.use_api(name, goal) - self.logger.info(f"running code produced by esgf ad hoc api client: {code}") - try: - result = await self.run_code(code, agent=agent, react_context=react_context) - return result - except Exception as e: - self.logger.error(f"error in using ESGF client api: {e}") - raise e - - async def run_code(self, code: str, agent: AgentRef, react_context: ReactContextRef) -> str: - """ - Executes code in the user's notebook on behalf of the user, but collects the outputs of the run for use by the Agent - in the ReAct loop, if needed. 
- - The code runs in a new codecell and the user can watch the execution and will see all of the normal output in the - Jupyter interface. - - This tool can be used to probe the user's environment or collect information to answer questions, or can be used to - run code completely on behalf of the user. If a user asks the agent to do something that reasonably should be done - via code, you should probably default to using this tool. - - This tool can be run more than once in a react loop. All actions and variables created in earlier uses of the tool - in a particular loop should be assumed to exist for future uses of the tool in the same loop. - - Args: - code (str): Code to run directly in Jupyter. This should be a string exactly as it would appear in a notebook - codecell. No extra escaping of newlines or similar characters is required. - Returns: - str: A summary of the run, along with the collected stdout, stderr, returned result, display_data items, and any - errors that may have occurred. - """ - self.logger.info(f"used runcode2: {code}") - def format_execution_context(context) -> str: - """ - Formats the execution context into a format that is easy for the agent to parse and understand. - """ - stdout_list = context.get("stdout_list") - stderr_list = context.get("stderr_list") - display_data_list = context.get("display_data_list") - error = context.get("error") - return_value = context.get("return") - - success = context['done'] and not context['error'] - if context['result']['status'] == 'error': - success = False - error = context['result'] - ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') - error['traceback'] = ansi_escape.sub('', error['traceback']) - - output = [ - """Execution report:""", - f"""Execution id: {context['id']}""", - f"""Successful?: {success}""", - f"""Code executed: - ``` - {context['command']} - ```\n""", - ] - - if error: - output.extend([ - "The following error was thrown when executing the code", - " Error:", - f" {error['ename']} {error['evalue']}", - " TraceBack:", - "\n".join(error['traceback']), - "", - ]) - - - if stdout_list: - output.extend([ - "The execution produced the following stdout output:", - "\n".join(["```", *stdout_list, "```\n"]), - ]) - if stderr_list: - output.extend([ - "The execution produced the following stderr output:", - "\n".join(["```", *stderr_list, "```\n"]), - ]) - if display_data_list: - output.append( - "The execution produced the following `display_data` objects to display in the notebook:", - ) - for idx, display_data in enumerate(display_data_list): - output.append( - f"display_data item {idx}:" - ) - for mimetype, value in display_data.items(): - if len(value) > 800: - value = f"{value[:400]} ... truncated ... {value[-400:]}" - output.append( - f"{mimetype}:" - ) - output.append( - f"```\n{value}\n```\n" - ) - if return_value: - output.append( - "The execution returned the following:", - ) - if isinstance(return_value, str): - output.extend([ - '```', return_value, '```\n' - ]) - output.append("Execution Report Complete") - return "\n".join(output) - - # TODO: In future, this may become a parameter and we allow the agent to decide if code should be automatically run - # or just be added. 
- autoexecute = True - message = react_context.get("message", None) - identities = getattr(message, 'identities', []) - try: - execution_task = None - checkpoint_index, execution_task = await agent.context.subkernel.checkpoint_and_execute( - code, not autoexecute, parent_header=message.header, identities=identities - ) - execute_request_msg = { - name: getattr(execution_task.execute_request_msg, name) - for name in execution_task.execute_request_msg.json_field_names - } - agent.context.send_response( - "iopub", - "add_child_codecell", - { - "action": "code_cell", - "language": agent.context.subkernel.SLUG, - "code": code.strip(), - "autoexecute": autoexecute, - "execute_request_msg": execute_request_msg, - "checkpoint_index": checkpoint_index, - }, - parent_header=message.header, - parent_identities=getattr(message, "identities", None), - ) - - execution_context = await execution_task - except Exception as err: - logger.error(err, exc_info=err) - raise - return format_execution_context(execution_context) - - @tool() - async def regrid_dataset( - self, - dataset: str, - target_resolution: tuple, - agent: AgentRef, - loop: LoopControllerRef, - aggregation: Optional[str] = "interp_or_mean", - ) -> str: - """ - This tool should be used to show the user code to regrid a netcdf dataset with detectable geo-resolution. - - If a user asks to regrid a dataset, use this tool to return them code to regrid the dataset. - - If you are given a netcdf dataset, use this tool instead of any other regridding tool. - - If you are asked about what is needed to regrid a dataset, please provide information about the arguments of this tool. - - Args: - dataset (str): The name of the dataset instantiated in the jupyter notebook. - target_resolution (tuple): The target resolution to regrid to, e.g. (0.5, 0.5). This is in degrees longitude and latitude. - aggregation (Optional): The aggregation function to be used in the regridding. The options are as follows: - 'conserve' - 'min' - 'max' - 'mean' - 'median' - 'mode' - 'interp_or_mean' - 'nearest_or_mode' - - Returns: - str: Status of whether or not the dataset has been persisted to the HMI server. - """ - - loop.set_state(loop.STOP_SUCCESS) - code = agent.context.get_code( - "flowcast_regridding", - { - "dataset": dataset, - "target_resolution": target_resolution, - "aggregation": aggregation, - }, - ) - - result = json.dumps( - { - "action": "code_cell", - "language": "python3", - "content": code.strip(), - } - ) - - return result - - @tool() - async def get_netcdf_plot( - self, - dataset_variable_name: str, - agent: AgentRef, - loop: LoopControllerRef, - plot_variable_name: Optional[str] = None, - lat_col: Optional[str] = "lat", - lon_col: Optional[str] = "lon", - time_slice_index: Optional[int] = 1, - ) -> str: - """ - This function should be used to get a plot of a netcdf dataset. - - This function should also be used to preview any netcdf dataset. - - If the user asks to plot or preview a dataset, use this tool to return plotting code to them. - - You should also ask if the user wants to specify the optional arguments by telling them what each argument does. - - Args: - dataset_variable_name (str): The name of the dataset instantiated in the jupyter notebook. - plot_variable_name (Optional): The name of the variable to plot. Defaults to None. - If None is provided, the first variable in the dataset will be plotted. - lat_col (Optional): The name of the latitude column. Defaults to 'lat'. - lon_col (Optional): The name of the longitude column. 
Defaults to 'lon'. - time_slice_index (Optional): The index of the time slice to visualize. Defaults to 1. - - Returns: - str: The code used to plot the netcdf. - """ - - code = agent.context.get_code( - "get_netcdf_plot", - { - "dataset": dataset_variable_name, - "plot_variable_name": plot_variable_name, - "lat_col": lat_col, - "lon_col": lon_col, - "time_slice_index": time_slice_index, - }, - ) - - result = await agent.context.evaluate( - code, - parent_header={}, - ) - - output = result.get("return") - - return output \ No newline at end of file diff --git a/src/beaker_climate/climate_python/api_documentation/esgf_documentation.md b/src/beaker_climate/climate_python/api_documentation/esgf_documentation.md deleted file mode 100644 index f8a775c..0000000 --- a/src/beaker_climate/climate_python/api_documentation/esgf_documentation.md +++ /dev/null @@ -1,1767 +0,0 @@ -# ESGF Python Client Documentation - - ---- -subtitle: Search Concepts -title: Design Concepts ---- - -The `pyesgf.search` interface to ESGF search reflects the typical workflow of a user navigating through the sets of facets categorising available data. - -# Keyword classification - -The keyword arguments described in the [ESGF Search API](https://github.com/ESGF/esgf.github.io/wiki/ESGF_Search_REST_API) have a wide veriety of roles within the search workflow. To reflect this `pyesgf.search` classifies these keywords into system, spatiotemporal and facet keywords. Responsibility for these keywords are distributes across several classes. - -## System keywords - -| API keyword | class | Notes | -|-------------|------------------|--------------------------------------------------------------------------------------| -| limit | SearchConnection | Set in `SearchConnection:send_query` method or transparently through `SearchContext` | -| offset | SearchConnection | Set in `SearchConnection:send_query` method or transparently through `SearchContext` | -| shards | SearchConnection | Set in constructor | -| distrib | SearchConnection | Set in constructor | -| latest | SearchContext | Set in constructor | -| facets | SearchContext | Set in constructor | -| fields | SearchContext | Set in constructor | -| replica | SearchContext | Set in constructor | -| type | SearchContext | Create contexts with the right type using `ResultSet.file_context`, etc. | -| from | SearchContext | Set in constructor. Use "from_timestamp" in the context API. | -| to | SearchContext | Set in constructor. Use "to_timestamp" in the context API. | -| fields | n/a | Managed internally | -| format | n/a | Managed internally | -| id | n/a | Managed internally | - -## Temporal keywords - -Temporal keywords are supported for Dataset search. The terms "from_timestamp" and "to_timestamp" should be used with values following the format "YYYY-MM-DDThh:mm:ssZ". - -## Spatial keywords - -Spatial keywords are not yet supported by `pyesgf.search` however the API does have placeholders for these keywords anticipating future implementation: - -## Facet keywords - -All other keywords are considered to be search facets. The keyword "query" is dealt with specially as a freetext facet. - -# Main Classes - -## SearchConnection - -`SearchConnection` instances represent a connection to an ESGF Search web service. This stores the service URL and also service-level parameters like distrib and shards. - -## SearchContext - -`SearchContext` represents the constraints on a given search. 
This includes the type of records you are searching for (File or Dataset), the list of possible facets with or without facet counts (depending on how the instance is created), currently selected facets/search-terms. Instances can return the number of hits and facet-counts associated with the current search. - -SearchContext objects can be created in several ways: - -> 1. From a SearchConnection object using the method `SearchConnection.new_context` -> 2. By further constraining an existing FacetContext object. E.g. new_context = context.constrain(institute='IPSL'). -> 3. From a Result object using one of it's *foo_context()* methods to create a context for searching for results related to the Result. -> 4. Future development may implement project-specific factory. E.g. CMIP5FacetContext(). - -## ResultSet - -`ResultSet` instances are returned by the `SearchContext.search` method and represent the results from a query. They supports transparent paging of results with a client-side cache. - -## Result - -`Result` instances represent the result record in the SOLr response. They are subclassed to represent records of different types: `FileResult` and `DatasetResult`. Results have various properties exposing information about the objects they represent. e.g. dataset_id, checksum, filename, size, etc. - - ---- - - -# ESGF Python Client API Reference - - -# API Reference - - -# API Reference¶ - - -```python ->>> lm = LogonManager() ->>> lm.is_logged_on() -False ->>> lm.logon(username, password, myproxy_hostname, bootstrap=True) ->>> lm.is_logged_on() -True -``` - - -```python ->>> lm.logoff() ->>> lm.is_logged_on() -False ->>> lm.logon_with_openid(openid, password, bootstrap=True) ->>> lm.is_logged_on() -True -``` - - -> **Warning:** -> Warning -Prior to v0.1.1 the url parameter expected the full URL of the -search endpoint up to the query string. This has now been changed -to expect url to ommit the final endpoint name, -e.g. https://esgf-node.llnl.gov/esg-search/search should be changed -to https://esgf-node.llnl.gov/esg-search in client code. The -current implementation detects the presence of /search and -corrects the URL to retain backward compatibility but this feature -may not remain in future versions. - - -- **class pyesgf.search.connection.SearchConnection(url, distrib=True, cache=None, timeout=120, expire_after=datetime.timedelta(seconds=3600), session=None, verify=True, context_class=None)[source]¶** – Variables - -url – The URL to the Search API service. This should be the URL -of the ESGF search service excluding the final endpoint name. -Usually this is http:///esg-search -distrib – Boolean stating whether searches through this connection are -distributed. i.e. whether the Search service distributes the query to -other search peers. See also the documentation for the facets -argument to pyesgf.search.context.SearchContext in relation to -distributed searches. -cache – Path to sqlite cache file. Cache expires every hours. -timeout – Time (in seconds) before query returns an error. -Default: 120s. -expire_after – Time delta after cache expires. Default: 1 hour. -session – requests.session object. optional. -verify – boolean, determines if query should be sent over a verified -channel. - - - - - -get_shard_list()[source]¶ -return the list of all available shards. A subset of the returned list -can be supplied to ‘send_query()’ to limit the query to selected -shards. -Shards are described by hostname and mapped to SOLr shard descriptions -internally. 
Returns: the list of available shards.

- **new_context(context_class=None, latest=None, facets=None, fields=None, from_timestamp=None, to_timestamp=None, replica=None, shards=None, search_type=None, **constraints)** – Returns a pyesgf.search.context.SearchContext class for performing faceted searches. See SearchContext.__init__() for documentation on the arguments.

- **send_search(query_dict, limit=None, offset=None, shards=None)** – Send a query to the "search" endpoint. See send_query() for details. Returns: the JSON document for the search results.

- **send_wget(query_dict, shards=None)** – Send a query to the "search" endpoint. See send_query() for details. Returns: a string containing the script.

- **Variables**:
  - url – The URL of the Search API service. This should be the URL of the ESGF search service excluding the final endpoint name, usually http://<hostname>/esg-search.
  - distrib – Boolean stating whether searches through this connection are distributed, i.e. whether the search service distributes the query to other search peers. See also the documentation for the facets argument to pyesgf.search.context.SearchContext in relation to distributed searches.
  - cache – Path to a sqlite cache file.
  - timeout – Time (in seconds) before a query returns an error. Default: 120 s.
  - expire_after – Time delta after which the cache expires. Default: 1 hour.
  - session – A requests.Session object. Optional.
  - verify – Boolean determining whether the query is sent over a verified channel.

- **get_shard_list()** – Return the list of all available shards. A subset of the returned list can be supplied to send_query() to limit the query to selected shards. Shards are described by hostname and mapped to Solr shard descriptions internally. Returns: the list of available shards.

- **pyesgf.search.connection.create_single_session(cache=None, expire_after=datetime.timedelta(seconds=3600), **kwargs)** – Simple helper function to start a requests or requests_cache session. cache, if specified, is the filename of a threadsafe sqlite database; expire_after specifies how long the cache should be kept.

- **pyesgf.search.connection.query_keyword_type(keyword)** – Returns the keyword type of a search query keyword. Possible values are 'system', 'freetext', 'facet', 'temporal' and 'geospatial'. If the keyword is unknown, it is assumed to be a facet keyword.

- **class pyesgf.search.context.AggregationSearchContext(connection, constraints, search_type=None, latest=None, facets=None, fields=None, from_timestamp=None, to_timestamp=None, replica=None, shards=None)**

- **class pyesgf.search.context.DatasetSearchContext(connection, constraints, ...)** – Same signature as AggregationSearchContext.

- **class pyesgf.search.context.FileSearchContext(connection, constraints, ...)** – Same signature as AggregationSearchContext.

- **class pyesgf.search.context.SearchContext(connection, constraints, search_type=None, latest=None, facets=None, fields=None, from_timestamp=None, to_timestamp=None, replica=None, shards=None)** – Instances of this class represent the state of a current search. A SearchContext exposes which facets are available to select and the facet counts, if they are available. Subclasses can restrict the search options, for instance FileSearchContext, DatasetSearchContext or CMIP5SearchContext. SearchContext instances are connected to SearchConnection instances. You normally create SearchContext instances via one of: 1. Calling SearchConnection.new_context() 2. Calling SearchContext.constrain()

  - **Variables**:
    - constraints – A dictionary of facet constraints currently in effect: constraint[facet_name] = [value, value, ...]
    - facets – A string containing a comma-separated list of facets to be returned (for example 'source_id,ensemble_id'). If set, this selects which facet counts to include in the facet_counts dictionary. Defaults to including all available facets, but with distributed searches (where the SearchConnection instance was created with distrib=True), some results may be missing for server-side reasons when requesting all facets, and a warning message with further details will be issued.
  - **Property facet_counts** – A dictionary of available hits for each facet value under the current constraints. This property returns a dictionary of dictionaries where facet_counts[facet][facet_value] == hit_count.
  - **Property hit_count** – The total number of hits available under the current constraints.
  - **constrain(**constraints)** – Return a new instance with the additional constraints.
  - **get_download_script(**constraints)** – Build a script for downloading all files in the set of results. The constraints parameter adds further constraints for this query, equivalent to calling self.constrain(**constraints).get_download_script(). Returns: a string containing the script.
  - **get_facet_options()** – Return a dictionary of facet counts filtered to remove all facets that are completely constrained. This method is similar to the facet_counts property, except that facet values which are not relevant for further constraining are removed.
  - **search(batch_size=50, ignore_facet_check=False, **constraints)** – Perform the search with the current constraints, returning a set of results. batch_size is the number of results to fetch per HTTP request; ignore_facet_check skips the extra HTTP request that populates facet_counts and hit_count; constraints adds further constraints, equivalent to calling self.constrain(**constraints).search(). Returns: a ResultSet for this query.

- **class pyesgf.search.results.AggregationResult(json, context)** – A result object for ESGF aggregations. Properties from BaseResult are inherited. Property aggregation_id – the aggregation id.

- **class pyesgf.search.results.BaseResult(json, context)** – Base class for results. Subclasses represent different search types such as File and Dataset.
  - **Variables**: json – the original JSON representation of the result; context – the SearchContext which generated this result.
  - **Property urls** – A dictionary of the form {service: [(url, mime_type), ...], ...}.
  - **Property opendap_url** – The URL of an OPeNDAP endpoint for this result, if available.
  - **Property las_url** – The URL of an LAS endpoint for this result, if available.
  - **Property download_url** – The URL for downloading the result by HTTP, if available.
  - **Property gridftp_url** – The URL for downloading the result by GridFTP, if available.
  - **Property globus_url** – The URL for downloading the result by Globus, if available (including endpoint).
  - **Property index_node** – The index node where the metadata is stored. Calls to *_context() will optimise queries to address only this node.

- **class pyesgf.search.results.DatasetResult(json, context)** – A result object for ESGF datasets.
  - **Property dataset_id** – The Solr dataset_id, which is unique throughout the system.
  - **aggregation_context()** – Return a SearchContext for searching for aggregations within this dataset.
  - **file_context()** – Return a SearchContext for searching for files within this dataset.
  - **property number_of_files** – The file count as reported by the dataset record.

- **class pyesgf.search.results.FileResult(json, context)** – A result object for ESGF files. Properties from BaseResult are inherited. Property file_id – the identifier for the file; checksum – the checksum of the file; checksum_type – the algorithm used for generating the checksum; filename – the filename; size – the file size in bytes.

- **class pyesgf.search.results.ResultSet(context, batch_size=50, eager=True)** – Variables: context – the search context object used to generate this resultset. Property batch_size – the number of results that will be requested from esgf-search as one call; this must be set on creation and cannot change.

- **class pyesgf.logon.LogonManager(esgf_dir='~/.esg', dap_config='~/.dodsrc', verify=True)** – Manages ESGF credentials and security configuration files. Also integrates with NetCDF's secure OPeNDAP configuration.
  - **logoff(clear_trustroots=False)** – Remove any obtained credentials from the ESGF environment. If clear_trustroots is True, also remove trustroots.
  - **logon(username=None, password=None, hostname=None, bootstrap=False, update_trustroots=True, interactive=True)** – Obtain ESGF credentials from the specified MyProxy service. If interactive == True, any missing password, username or hostname will be prompted for at the terminal. bootstrap controls whether to bootstrap the trustroots for this MyProxy service; update_trustroots controls whether to update them.
  - **logon_with_openid(openid, password=None, bootstrap=False, update_trustroots=True, interactive=True)** – Obtain ESGF credentials by detecting the MyProxy parameters from the user's OpenID. Some ESGF-compatible OpenIDs do not contain enough information to obtain credentials; in that case the user is prompted for the missing information if interactive == True, otherwise an exception is raised. See logon() for the interactive, bootstrap and update_trustroots parameters.
---

# Code Examples from Notebooks

## Demo Notebooks

### subset-cmip5.ipynb

# Subset CMIP5 Datasets with xarray

xarray: http://xarray.pydata.org/en/stable/index.html

## Search CMIP5 Dataset with ESGF pyclient

using: https://esgf-pyclient.readthedocs.io/en/latest/index.html

```python
from pyesgf.search import SearchConnection
conn = SearchConnection('https://esgf-data.dkrz.de/esg-search', distrib=True)
```

```python
ctx = conn.new_context(
    project='CMIP5',
    experiment='rcp45',
    model='HadCM3',
    ensemble='r1i1p1',
    time_frequency='mon',
    realm='atmos',
    data_node='esgf-data1.ceda.ac.uk',
    )
ctx.hit_count
```

```python
result = ctx.search()[0]
result.dataset_id
```

```python
files = result.file_context().search()
for file in files:
    if 'tasmax' in file.opendap_url:
        tasmax_url = file.opendap_url
        print(tasmax_url)
```

## ESGF Logon

```python
from pyesgf.logon import LogonManager
lm = LogonManager()
lm.logoff()
lm.is_logged_on()
```

```python
lm.logon(hostname='esgf-data.dkrz.de', interactive=True, bootstrap=True)
lm.is_logged_on()
```

## Subset single dataset with xarray

Using OpenDAP: http://xarray.pydata.org/en/stable/io.html?highlight=opendap#opendap

```python
import xarray as xr
ds = xr.open_dataset(tasmax_url, chunks={'time': 120})
print(ds)
```

```python
da = ds['tasmax']
da = da.isel(time=slice(0, 1))
da = da.sel(lat=slice(-50, 50), lon=slice(0, 50))
```

```python
%matplotlib inline
da.plot()
```

## Download to NetCDF

```python
da.to_netcdf('tasmax.nc')
```

---

### subset-cmip6.ipynb

# Subset CMIP6 Datasets with xarray

xarray: http://xarray.pydata.org/en/stable/index.html

## Search CMIP6 Dataset with ESGF pyclient

using: https://esgf-pyclient.readthedocs.io/en/latest/index.html

```python
from pyesgf.search import SearchConnection
conn = SearchConnection('https://esgf-data.dkrz.de/esg-search', distrib=True)
```
```python
ctx = conn.new_context(
    project='CMIP6',
    source_id='UKESM1-0-LL',
    experiment_id='historical',
    variable='tas',
    frequency='mon',
    variant_label='r1i1p1f2',
    data_node='esgf-data3.ceda.ac.uk')
ctx.hit_count
```

```python
result = ctx.search()[0]
result.dataset_id
```

```python
files = result.file_context().search()
for file in files:
    print(file.opendap_url)
```

## Subset single dataset with xarray

Using OpenDAP: http://xarray.pydata.org/en/stable/io.html?highlight=opendap#opendap

```python
import xarray as xr
ds = xr.open_dataset(files[0].opendap_url, chunks={'time': 120})
print(ds)
```

```python
da = ds['tas']
da = da.isel(time=slice(0, 1))
da = da.sel(lat=slice(-50, 50), lon=slice(0, 50))
```

```python
%matplotlib inline
da.plot()
```

## Subset over multiple datasets

```python
ds_agg = xr.open_mfdataset([files[0].opendap_url, files[1].opendap_url], chunks={'time': 120}, combine='nested', concat_dim='time')
print(ds_agg)
```

```python
da = ds_agg['tas']
da = da.isel(time=slice(1200, 1201))
da = da.sel(lat=slice(-50, 50), lon=slice(0, 50))
```

```python
da.plot()
```

## Download dataset

```python
da.to_netcdf('tas_africa_19500116.nc')
```

---

## Examples Notebooks

# General Information about the ESGF API
# The ESGF Search RESTful API

The ESGF search service exposes a RESTful URL that can be used by clients (browsers and desktop clients) to query the contents of the underlying search index, and return results matching the given constraints. Because of the distributed capabilities of the ESGF search, the URL at any Index Node can be used to query that Node only, or all Nodes in the ESGF system.

## Syntax

The general syntax of the ESGF search service URL is:
    http://<base_url>/esg-search/search?[keyword parameters as (name, value) pairs][facet parameters as (name, value) pairs]

where "<base_url>" is the base URL of the search service at a given Index Node.

All parameters (keyword and facet) are optional. Also, the value of all parameters must be URL-encoded, so that the complete search URL is well formed.
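As a quick illustration, the sketch below sends a simple query to an index node; it assumes the `requests` library (which URL-encodes parameter values automatically), and the LLNL node URL and constraint values are arbitrary examples.

```python
import requests

# Minimal ESGF search query; requests handles the URL-encoding of values.
resp = requests.get(
    "https://esgf-node.llnl.gov/esg-search/search",
    params={
        "project": "obs4MIPs",
        "limit": 5,
        "format": "application/solr+json",  # Solr/JSON instead of the XML default
    },
    timeout=120,
)
resp.raise_for_status()
for doc in resp.json()["response"]["docs"]:
    print(doc["id"])
```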
## Keywords

Keyword parameters are query parameters that have reserved names, and are interpreted by the search service to control the fundamental nature of a search request: where to issue the request to, how many results to return, etc.

The following keywords are currently used by the system - see later for usage examples:

- facets= to return facet values and counts

- offset=, limit= to paginate through the available results (default: offset=0, limit=10)

- fields= to return only specific metadata fields for each matching result (default: fields=\*)

- format= to specify the response document output format

- type= (searches records of the specified type: Dataset, File or Aggregation)

- replica=false/true (searches for all records, or records that are NOT replicas)

- latest=true/false (searches for just the latest version, or all versions)

- distrib=true/false (searches across all nodes, or the target node only)

- shards= (searches the specified shards only)

- bbox=\[west, south, east, north\] (searches within a geo-spatial box)

- start=, end= (select records based on their nominal data coverage, i.e. their datetime_start, datetime_stop values)

- from=, to= (select records based on when the data was marked as last modified, i.e. their nominal "timestamp" value)
## Default Query

If no parameters at all are specified, the search service will execute a query using all the default values, specifically:

- query=\* (query all records)

- distrib=true (execute a distributed search)

- type=Dataset (return results of type "Dataset")

Example:

- http://esgf-node.llnl.gov/esg-search/search
## Free Text Queries

The keyword parameter query= can be specified to execute a query that matches the given text _anywhere_ in the records' metadata fields. The parameter value can be any expression following the Apache Lucene query syntax (because it is passed "as-is" to the back-end Solr query), and must be URL-encoded. When using the CoG user interface at any ESGF node and project, the "query=" parameter value must be entered in the text field at the top of the page.

Examples:

- Search for any text, anywhere: http://esgf-node.llnl.gov/esg-search/search?query=* (the default value of the query parameter)

- Search for "humidity" in all metadata fields: http://esgf-node.llnl.gov/esg-search/search?query=humidity

- Search for the exact sentence "specific humidity" in all metadata fields (the sentence must be surrounded by quotes and URL-encoded): http://esgf-node.llnl.gov/esg-search/search?query=%22specific%20humidity%22

- Search for both words "specific" and "humidity", but not necessarily in an exact sequence (use a space between the two words; this is the same as executing a query with the logical OR): http://esgf-node.llnl.gov/esg-search/search?query=specific%20humidity

- Search for the word "observations" ONLY in the metadata field "product": http://esgf-node.llnl.gov/esg-search/search?query=product:observations

- Using logical AND: http://esgf-node.llnl.gov/esg-search/search?query=airs%20AND%20humidity (must use upper-case "AND")

- Using logical OR: http://esgf-node.llnl.gov/esg-search/search?query=airs%20OR%20humidity (must use upper-case "OR"; this is the same as simply using a blank space: http://esgf-node.llnl.gov/esg-search/search?query=airs%20humidity)

- Search for a dataset with a specific id: http://esgf-node.llnl.gov/esg-search/search?query=id:obs4MIPs.NASA-JPL.AIRS.hus.mon.v20110608\|esgf-data.llnl.gov

- Search for all datasets that match an id pattern: http://esgf-node.llnl.gov/esg-search/search?query=id:obs4MIPs.NASA-JPL.AIRS.*
## Facet Queries

A request to the search service can be constrained to return only those records that match specific values for one or more facets. Specifically, a facet constraint is expressed in the general form facet_name=facet_value, where facet_name is chosen from the controlled vocabulary of facet names configured at each site, and facet_value must match exactly one of the possible values for that particular facet.

When specifying more than one facet constraint in the request, multiple values for the same facet are combined with a logical OR, while values for different facets are combined with a logical AND. Also, multiple possible values for the same facet can be expressed as a comma-separated list. For example:

- experiment=decadal2000&variable=hus : will return all records that match experiment=decadal2000 AND variable=hus

- variable=hus&variable=ta : will return all records that match variable=hus OR variable=ta

- variable=hus,ta : will also return all records that match variable=hus OR variable=ta

A facet constraint can be negated by using the != operator. For example, model!=CCSM searches for all items that do NOT match the CCSM model. Note that all negative facets are combined with a logical AND; for example, model!=CCSM&model!=HadCAM searches for all items that match neither CCSM nor HadCAM.

By default, no facet counts are returned in the output document. Facet counts must be explicitly requested by specifying the facet names individually (for example: facets=experiment,model) or via the special notation facets=\*. The facets list must be comma-separated, and white spaces are ignored.

If facet counts are requested, facet values are sorted alphabetically (facet.sort=lex), and all facet values are returned (facet.limit=-1), provided they match one or more records (facet.mincount=1).

The "type" facet must always be specified as part of any request to the ESGF search services, so that the appropriate records can be searched and returned. If not specified explicitly, the default value is type=Dataset.

Examples (see also the sketch after this list):

- Single facet query: http://esgf-node.llnl.gov/esg-search/search?cf_standard_name=air_temperature

- Query with two different facet constraints: http://esgf-node.llnl.gov/esg-search/search?cf_standard_name=air_temperature&project=obs4MIPs

- Combining two values of the same facet with a logical OR: http://esgf-node.llnl.gov/esg-search/search?project=obs4MIPs&variable=hus&variable=ta (search for all obs4MIPs files that have variable "ta" OR variable "hus")

- Using a negative facet:

  - http://esgf-node.llnl.gov/esg-search/search?project=obs4MIPs&variable=hus&variable=ta&model!=Obs-AIRS (search for all obs4MIPs datasets that have variable ta OR hus, excluding those produced by AIRS)

  - http://esgf-node.llnl.gov/esg-search/search?project=obs4MIPs&variable!=ta&variable!=huss (search for all obs4MIPs datasets that contain neither variable ta nor variable huss)

- Search a file by its tracking id: http://esgf-node.llnl.gov/esg-search/search?type=File&tracking_id=2209a0d0-9b77-4ecb-b2ab-b7ae412e7a3f

- Search a file by its checksum: http://esgf-node.llnl.gov/esg-search/search?type=File&checksum=83df8ae93e85e26df797d5f770449470987a4ecd8f2d405159995b5cac9a410c

- Issue a query for all supported facets and their values at one site, while returning no results (note that only facets with one or more values are returned): http://esgf-node.llnl.gov/esg-search/search?facets=*&limit=0&distrib=false
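The OR/AND semantics above map naturally onto repeated query parameters. A sketch assuming the `requests` library (passing a list makes it repeat the parameter), with illustrative facet values:

```python
import requests

# Repeated values of the same facet are ORed; different facets are ANDed.
resp = requests.get(
    "https://esgf-node.llnl.gov/esg-search/search",
    params={
        "project": "obs4MIPs",        # AND ...
        "variable": ["hus", "ta"],    # ... (hus OR ta); sent as variable=hus&variable=ta
        "limit": 0,                   # no records, just the count
        "format": "application/solr+json",
    },
    timeout=120,
)
print(resp.json()["response"]["numFound"])
```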
## Facet Listings

The available facet names and values for searching data within a specific project can be listed with a query of the form …project=<project_name>&facets=\*&limit=0 (i.e. return no results). Only facet values that match one or more records will be returned.

Examples (a programmatic version follows this list):

- List all obs4MIPs facet names and values: http://esgf-node.llnl.gov/esg-search/search?project=obs4MIPs&facets=*&limit=0

- List all CMIP5 facet names and values: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&facets=*&limit=0

The same query with no project constraint will return all facet names and values for ALL data across the federation:

- List ALL facet names and values: http://esgf-node.llnl.gov/esg-search/search?facets=*&limit=0

To retrieve a listing of available values for only a few facets, simply specify a comma-separated list of facet names:

- List all values of model, experiment and project throughout the federation: http://esgf-node.llnl.gov/esg-search/search?facets=model,experiment,project&limit=0

- List all values of model and experiment for CMIP5 data: http://esgf-node.llnl.gov/esg-search/search?facets=model,experiment&project=CMIP5&limit=0
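Programmatically, a facet listing is the same query with limit=0 plus some parsing of the facet counts. A sketch assuming the `requests` library and the Solr/JSON output, where each facet comes back as a flat [value, count, value, count, ...] list:

```python
import requests

resp = requests.get(
    "https://esgf-node.llnl.gov/esg-search/search",
    params={"project": "CMIP5", "facets": "model,experiment",
            "limit": 0, "format": "application/solr+json"},
    timeout=120,
)
fields = resp.json()["facet_counts"]["facet_fields"]
# Pair up the flat [value, count, ...] list into a {value: count} dict.
model_counts = dict(zip(fields["model"][::2], fields["model"][1::2]))
print(sorted(model_counts.items())[:5])
```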
## Temporal Coverage Queries

The keyword parameters start= and/or end= can be used to query for data with temporal coverage that overlaps the specified range. The parameter values can either be date-times in the format "YYYY-MM-DDTHH:MM:SSZ" (UTC ISO 8601 format), or special values supported by the Solr DateMath syntax.

Examples (see also the sketch after these examples):

- Search for data in the past year: http://esgf-node.llnl.gov/esg-search/search?start=NOW-1YEAR (translates into the constraint datetime_stop:\[NOW-1YEAR TO \*\], i.e. datetime_stop \> NOW-1YEAR)

- Search for data before the year 2000: http://esgf-node.llnl.gov/esg-search/search?end=2000-01-01T00:00:00Z (translates into the constraint datetime_start:\[\* TO 2000-01-01T00:00:00Z\], i.e. datetime_start \< 2000-01-01)
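The range constraints can be sent as ordinary query parameters; a sketch assuming the `requests` library and the LLNL node, with arbitrary example dates:

```python
import requests

# Datasets whose nominal coverage overlaps 1980-2000 (UTC ISO 8601 date-times).
resp = requests.get(
    "https://esgf-node.llnl.gov/esg-search/search",
    params={"project": "CMIP5",
            "start": "1980-01-01T00:00:00Z",
            "end": "2000-01-01T00:00:00Z",
            "limit": 0,
            "format": "application/solr+json"},
    timeout=120,
)
print(resp.json()["response"]["numFound"])
```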
## Spatial Coverage Queries

The keyword parameter bbox=\[west, south, east, north\] can be used to query for data with spatial coverage that overlaps the given bounding box. As usual, the parameter value must be URL-encoded.

Examples:

- http://esgf-node.llnl.gov/esg-search/search?bbox=%5B-10,-10,+10,+10%5D (translates to: east_degrees:\[-10 TO \*\] AND north_degrees:\[-10 TO \*\] AND west_degrees:\[\* TO 10\] AND south_degrees:\[\* TO 10\])

Please note that NOT all ESGF records contain geo-spatial information; records without it will not be returned by a geo-spatial search.
## Distributed Queries

The keyword parameter distrib= can be used to control whether the query is executed against the local Index Node only, or distributed to all other Nodes in the federation. If not specified, the default value distrib=true is assumed.

Examples:

- Search for all datasets in the federation: http://esgf-node.llnl.gov/esg-search/search?distrib=true

- Search for all datasets at one Node only: http://esgf-node.llnl.gov/esg-search/search?distrib=false
## Shard Queries

By default, a distributed query (distrib=true) targets all ESGF Nodes in the current peer group, i.e. all nodes that are listed in the local configuration file /esg/config/esgf_shards.xml, which is continuously updated by the local node manager to reflect the latest state of the federation. It is possible to execute a distributed search that targets only one or more specific nodes, by specifying them in the "shards" parameter: shards=hostname1:port1/solr,hostname2:port2/solr,…. Note that the explicit shards value is ignored if distrib=false (but distrib=true by default if not otherwise specified).

Examples:

- Query for CMIP5 data at the PCMDI and CEDA sites only: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&shards=pcmdi.llnl.gov/solr,esgf-index1.ceda.ac.uk/solr

- Query for all files belonging to a given dataset at one site only: http://esgf-node.llnl.gov/esg-search/search?type=File&shards=esgf-node.llnl.gov/solr&dataset_id=obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608\|esgf-data.llnl.gov
## Replica Queries

Replicas (Datasets and Files) are distinguished from the original record (a.k.a. the "master") in the Solr index by the value of two special keywords:

- replica: a flag that is set to false for master records, true for replica records.

- master_id: a string that is identical for the master and all replicas of a given logical record (Dataset or File).

By default, a query returns all records (masters and replicas) matching the search criteria, i.e. no replica=… constraint is used. To return only master records, use replica=false; to return only replicas, use replica=true. To search for all identical Datasets or Files (i.e. for the master AND replicas of a Dataset or File), use master_id=….

Examples:

- Search for all datasets in the system (masters and replicas): http://esgf-node.llnl.gov/esg-search/search

- Search for just master datasets, no replicas: http://esgf-node.llnl.gov/esg-search/search?replica=false

- Search for just replica datasets, no masters: http://esgf-node.llnl.gov/esg-search/search?replica=true

- Search for the master AND replicas of a given dataset: http://esgf-node.llnl.gov/esg-search/search?master_id=cmip5.output1.LASG-CESS.FGOALS-g2.midHolocene.3hr.land.3hr.r1i1p1

- Search for the master and replicas of a given file: http://esgf-node.llnl.gov/esg-search/search?type=File&master_id=cmip5.output1.MIROC.MIROC5.decadal1978.mon.ocean.Omon.r4i1p1.wfo_Omon_MIROC5_decadal1978_r4i1p1_197901-198812.nc
## Latest and Version Queries

By default, a query to the ESGF search services will return all versions of the matching records (Datasets or Files). To return only the latest, up-to-date version, include latest=true. To return a specific version, use version=…. Using latest=false will return only datasets that were superseded by newer versions.

Examples:

- Search for all latest CMIP5 datasets: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&latest=true

- Search for all versions of a given dataset: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&master_id=cmip5.output1.MOHC.HadCM3.decadal1972.day.atmos.day.r10i2p1&facets=version

- Search for a specific version of a given dataset: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&master_id=cmip5.output1.NSF-DOE-NCAR.CESM1-CAM5-1-FV2.historical.mon.atmos.Amon.r1i1p1&version=20120712
## Retracted Queries

NOTE: this feature is NOT yet released.

Retracted datasets are marked by "retracted=true" and also have the flag "latest=false" set. Consequently, retracted datasets are automatically NOT included in any search for the latest-version data ("latest=true"), while they are automatically included in searches that span all versions (no "latest" constraint). To search specifically for only retracted datasets, use the constraint "retracted=true".

Example:

- Search for all retracted datasets in the CMIP5 project, across all nodes: https://esgf-node.llnl.gov/esg-search/search?project=CMIP5&retracted=true
## Minimum and Maximum Version Queries

NOTE: this feature is NOT yet released.

The special keywords "min_version" and "max_version" can be used to query for all records that have a version greater than or equal to, or less than or equal to, a given numerical value. Because versions in ESGF are often expressed as dates of the format YYYYMMDD, this makes it possible to query for all records with a version on or after/before a certain date. The two constraints can be combined with each other to specify a version (i.e. date) range, and can also be combined with other constraints.

Examples:

- All datasets with version less than a given date: https://esgf-node.llnl.gov/esg-search/search?max_version=20150101

- All obs4MIPs datasets with version between two dates: http://esgf-node.llnl.gov/esg-search/search?min_version=20120101&max_version=20131231&project=obs4MIPs
## Results Pagination

By default, a query to the search service will return the first 10 records matching the given constraints. The offset into the returned results, and the total number of returned results, can be changed through the keyword parameters limit= and offset=. The system imposes a maximum value of limit \<= 10,000.

Examples (see also the sketch below):

- Query for 100 CMIP5 datasets in the system: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&limit=100

- Query for the next 100 CMIP5 datasets in the system: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&limit=100&offset=100
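The offset/limit keywords support a simple pagination loop; a sketch assuming the `requests` library, capped early so the example stays small:

```python
import requests

url = "https://esgf-node.llnl.gov/esg-search/search"
params = {"project": "CMIP5", "limit": 100, "offset": 0,
          "format": "application/solr+json"}
ids = []
while True:
    data = requests.get(url, params=params, timeout=120).json()
    docs = data["response"]["docs"]
    ids.extend(doc["id"] for doc in docs)
    params["offset"] += params["limit"]
    # Stop when we run out of results, or after 500 records for this sketch.
    if not docs or params["offset"] >= data["response"]["numFound"] or len(ids) >= 500:
        break
print(len(ids))
```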
## Output Format

The keyword parameter format= can be used to request results in a specific output format. Currently the only available options are Solr/XML (the default) and Solr/JSON.

Examples:

- Request results in Solr XML format: http://esgf-node.llnl.gov/esg-search/search?format=application%2Fsolr%2Bxml

- Request results in Solr JSON format: http://esgf-node.llnl.gov/esg-search/search?format=application%2Fsolr%2Bjson
## Returned Metadata Fields

By default, all available metadata fields are returned for each result. The keyword parameter fields= can be used to limit the fields returned in the response document for each matching result. The list must be comma-separated, and white spaces are ignored. Use fields=\* to return all fields (the same as not specifying it, since it is the default). Note that the pseudo field "score" is always appended to any fields list.

Examples:

- Return all available metadata fields for CMIP5 datasets: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&fields=*

- Return only the "model" and "experiment" fields for CMIP5 datasets: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&fields=model,experiment
## Identifiers

Each search record in the system is assigned the following identifiers (all of type string):

- id : universally unique for each record across the federation, i.e. specific to each Dataset or File, version and replica (and the data node storing the data). It is intended to be "opaque", i.e. it should not be parsed by clients to extract any information.

  - Dataset example: id=obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608\|esgf-data.llnl.gov

  - File example: id=obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608.tro3Stderr_TES_L3_tbd_200507-200912.nc\|esgf-data.llnl.gov

- master_id : the same for all replicas and versions across the federation. When parsing THREDDS catalogs, it is extracted from the properties "dataset_id" or "file_id".

  - Dataset example: obs4MIPs.NASA-JPL.TES.tro3.mon

  - File example: obs4MIPs.NASA-JPL.TES.tro3.mon.tro3Stderr_TES_L3_tbd_200507-200912.nc

- instance_id : the same for all replicas across the federation, but specific to each version. When parsing THREDDS catalogs, it is extracted from the ID attribute of the corresponding THREDDS catalog element (for both Datasets and Files).

  - Dataset example: obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608

  - File example: obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608.tro3Stderr_TES_L3_tbd_200507-200912.nc

Note also that the record version is the same for all replicas of that record, but different across versions. Examples:

- Dataset example: version=20110608

- File example: version=1
## Access URLs

In the Solr output document returned by a search, URLs that are access points for Datasets and Files are encoded as a 3-tuple of the form "url\|mime type\|service name", where the fields are separated by the pipe ("\|") character, and the "mime type" and "service name" are chosen from the ESGF controlled vocabulary.

Examples of Dataset access URLs:

- THREDDS catalog: http://esgf-data.llnl.gov/thredds/catalog/esgcet/1/obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608.xml#obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608\|application/xml+thredds\|THREDDS

- LAS server: http://esgf-node.llnl.gov/las/getUI.do?catid=0C5410C250379F2D139F978F7BF48BB9_ns_obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608\|application/las\|LAS

Examples of File access URLs:

- HTTP download: http://esgf-data.llnl.gov/thredds/fileServer/esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc\|application/netcdf\|HTTPServer

- GridFTP download: gsiftp://esgf-data.llnl.gov:2811//esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc\|application/gridftp\|GridFTP

- OpenDAP download: http://esgf-data.llnl.gov/thredds/dodsC/esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc.html\|application/opendap-html\|OPENDAP

- Globus As-A-Service download: globus:e3f6216e-063e-11e6-a732-22000bf2d559/esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc\|Globus\|Globus
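Since each entry in a record's url field is a pipe-separated 3-tuple, splitting it apart is straightforward. A sketch, assuming the Solr/JSON output where the field appears as a list named "url" on each record:

```python
def parse_access_urls(doc):
    """Map service name -> (url, mime_type) for one search record."""
    services = {}
    for entry in doc.get("url", []):
        url, mime_type, service = entry.split("|")
        services[service] = (url, mime_type)
    return services

# e.g. parse_access_urls(file_doc).get("OPENDAP") for an OPeNDAP endpoint,
# or .get("HTTPServer") for a plain HTTP download link.
```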
## Wget scripting

The same RESTful API that is used to query the ESGF search services can also be used, with minor modifications, to generate a Wget script to download all files matching the given constraints. Specifically, each ESGF Index Node exposes the following URL for generating Wget scripts:
    http://<base_url>/wget?[keyword parameters as (name, value) pairs][facet parameters as (name, value) pairs]

where again "<base_url>" is the base URL of the search service at a given Index Node. As for searching, all parameters (keyword and facet) are optional, and the value of all parameters must be URL-encoded, so that the complete URL is well formed.

The only syntax differences with respect to the search URL are:

- The keyword parameter type= is not allowed, as the wget URL always assumes type=File.

- The keyword parameter format= is not allowed, as the wget URL always returns a shell script as the response document.

- The keyword parameter limit= is assigned a default value of limit=1000 (and must still be limit \< 10,000).

- The keyword parameter download_structure= defines a relative directory structure for the download, using the facet values of Files (not Datasets).

- The keyword parameter download_emptypath= defines what to do when download_structure is set and the returned facet has no value (for example, mixing CMIP5 and obs4MIPs files while selecting instrument as a facet will leave the value empty for all CMIP5 files).

A typical workflow pattern consists of first identifying all datasets or files matching some scientific criteria, then changing the request URL from "/search?" to "/wget?" to generate the corresponding shell scripts for bulk download of files.

Examples:

- Download all obs4MIPs files from the JPL node with variable "hus": http://esgf-node.llnl.gov/esg-search/wget?variable=hus&project=obs4MIPs&distrib=false

- Download the files as in the previous example, and organize them in a directory structure such as project/product/institute/time_frequency: http://esgf-node.llnl.gov/esg-search/wget?variable=hus&project=obs4MIPs&distrib=false&download_structure=project,product,institute,time_frequency

For more information, see also the Wget FAQ.
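In code, generating and saving such a script is a single GET against the /wget endpoint; a sketch assuming the `requests` library and the LLNL node, with "download_hus.sh" as an arbitrary output filename:

```python
import requests

# Same constraints as a /search query, but the response body is a shell script.
resp = requests.get(
    "https://esgf-node.llnl.gov/esg-search/wget",
    params={"project": "obs4MIPs", "variable": "hus", "distrib": "false"},
    timeout=120,
)
resp.raise_for_status()
with open("download_hus.sh", "w") as f:
    f.write(resp.text)
# Run it afterwards with: bash download_hus.sh
```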
diff --git a/src/beaker_climate/climate_python/api_documentation/esgf_rest_documentation.md b/src/beaker_climate/climate_python/api_documentation/esgf_rest_documentation.md deleted file mode 100644 index 09b5d2d..0000000 --- a/src/beaker_climate/climate_python/api_documentation/esgf_rest_documentation.md +++ /dev/null @@ -1,455 +0,0 @@ -# ESGF REST Documentation - -
- -
- -
- -# The ESGF Search RESTful API - -The ESGF search service exposes a RESTful URL that can be used by clients (browsers and desktop clients) to query the contents of the underlying search index, and return results matching the given constraints. Because of the distributed capabilities of the ESGF search, the URL at any Index Node can be used to query that Node only, or all Nodes in the ESGF system. - -
- -## Syntax - -The general syntax of the ESGF search service URL is: - -
- -
- - http:///esg-search/search?[keyword parameters as (name, value) pairs][facet parameters as (name,value) pairs] - -
- -
- -where “” is the base URL of the search service at a given Index Node. - -All parameters (keyword and facet) are optional. Also, the value of all parameters must be URL-encoded, so that the complete search URL is well formed. - -
- -
- -## Keywords - -Keyword parameters are query parameters that have reserved names, and are interpreted by the search service to control the fundamental nature of a search request: where to issue the request to, how many results to return, etc. - -The following keywords are currently used by the system - see later for usage examples: - -- facets= to return facet values and counts - -- offset= , limit= to paginate through the available results (default: offset=0, limit=10) - -- fields= to return only specific metadata fields for each matching result (default: fields=\*) - -- format= to specify the response document output format - -- type= (searches record of the specified type: Dataset, File or Aggregation) - -- replica=false/true (searches for all records, or records that are NOT replicas) - -- latest=true/false (searches for just the latest version, or all versions) - -- distrib=true/false (searches across all nodes, or the target node only) - -- shards= (searches the specified shards only) - -- bbox=\[west, south, east, north\] (searches within a geo-spatial box) - -- start=, end= (select records based on their nominal data coverage, i.e. their datetime_start, datetime_stop values ) - -- from=, to= (select records based on when the data was marked as last modified, i.e. their nominal “timestamp” value) - -
- -
- -## Default Query - -If no parameters at all are specified, the search service will execute a query using all the default values, specifically: - -- query=\* (query all records) - -- distrib=true (execute a distributed search) - -- type=Dataset (return results of type “Dataset”) - -Example: - -- http://esgf-node.llnl.gov/esg-search/search - -
- -
- -## Free Text Queries - -The keyword parameter query= can be specified to execute a query that matches the given text \_ anywhere \_ in the records metadata fields. The parameter value can be any expression following the Apache Lucene query syntax (because it is passed “as-is” to the back-end Solr query), and must be URL- encoded. When using the CoG user interface at any ESGF node and project, the “query=” parameter value must be entered in the text field at the top of the page. - -Examples: - -- Search for any text, anywhere: http://esgf-node.llnl.gov/esg-search/search?query=* (the default value of the query parameter) - -- Search for “humidity” in all metadata fields: http://esgf-node.llnl.gov/esg-search/search?query=humidity - -- Search for the exact sentence “specific humidity” in all metadata fields (the sentence must be surrounded by quotes and URL-encoded): http://esgf-node.llnl.gov/esg-search/search?query=%22specific%20humidity%22 - -- Search for both words “specific” and “humidity”, but not necessarily in an exact sequence (must use a space between the two words = this is the same as executing a query with the logical OR): http://esgf-node.llnl.gov/esg-search/search?query=specific%20humidity - -- Search for the word “observations” ONLY in the metadata field “product” : http://esgf-node.llnl.gov/esg-search/search?query=product:observations - -- Using logical AND: http://esgf-node.llnl.gov/esg-search/search?query=airs%20AND%20humidity (must use upper case “AND”) - -- Using logical OR: http://esgf-node.llnl.gov/esg-search/search?query=airs%20OR%20humidity (must use upper case “OR”). This is the same as using simply a blank space: http://esgf-node.llnl.gov/esg-search/search?query=airs%20humidity ) - -- Search for a dataset with a specific id: http://esgf-node.llnl.gov/esg-search/search?query=id:obs4MIPs.NASA-JPL.AIRS.hus.mon.v20110608\|esgf-data.llnl.gov - -- Search for all datasets that match an id pattern: http://esgf-node.llnl.gov/esg-search/search?query=id:obs4MIPs.NASA-JPL.AIRS.* - -
- -
-
-## Facet Queries
-
-A request to the search service can be constrained to return only those records that match specific values for one or more facets. Specifically, a facet constraint is expressed through the general form name=value, where name is chosen from the controlled vocabulary of facet names configured at each site, and value must match exactly one of the possible values for that particular facet.
-
-When specifying more than one facet constraint in the request, multiple values for the same facet are combined with a logical OR, while multiple values for different facets are combined with a logical AND. Also, multiple possible values for the same facet can be expressed as a comma-separated list. For example:
-
-- experiment=decadal2000&variable=hus : will return all records that match experiment=decadal2000 AND variable=hus
-
-- variable=hus&variable=ta : will return all records that match variable=hus OR variable=ta
-
-- variable=hus,ta : will also return all records that match variable=hus OR variable=ta
-
-A facet constraint can be negated by using the != operator. For example, model!=CCSM searches for all items that do NOT match the CCSM model. Note that all negative facets are combined in logical AND; for example, model!=CCSM&model!=HadCAM searches for all items that do not match CCSM, and do not match HadCAM.
-
-By default, no facet counts are returned in the output document. Facet counts must be explicitly requested by specifying the facet names individually (for example: facets=experiment,model) or via the special notation facets=\*. The facets list must be comma-separated, and white spaces are ignored.
-
-If facet counts are requested, facet values are sorted alphabetically (facet.sort=lex), and all facet values are returned (facet.limit=-1), provided they match one or more records (facet.mincount=1).
-
-The “type” facet must always be specified as part of any request to the ESGF search services, so that the appropriate records can be searched and returned. If not specified explicitly, the default value is type=Dataset.
-
-Examples:
-
-- Single facet query: http://esgf-node.llnl.gov/esg-search/search?cf_standard_name=air_temperature
-
-- Query with two different facet constraints: http://esgf-node.llnl.gov/esg-search/search?cf_standard_name=air_temperature&project=obs4MIPs
-
-- Combining two values of the same facet with a logical OR: http://esgf-node.llnl.gov/esg-search/search?project=obs4MIPs&variable=hus&variable=ta (search for all obs4MIPs files that have variable “ta” OR variable “hus”)
-
-- Using a negative facet:
-
-  - http://esgf-node.llnl.gov/esg-search/search?project=obs4MIPs&variable=hus&variable=ta&model!=Obs-AIRS (search for all obs4MIPs datasets that have variable ta OR hus, excluding those produced by AIRS)
-
-  - http://esgf-node.llnl.gov/esg-search/search?project=obs4MIPs&variable!=ta&variable!=huss (search for all obs4MIPs datasets that contain neither variable ta nor variable huss)
-
-- Search a file by its tracking id: http://esgf-node.llnl.gov/esg-search/search?type=File&tracking_id=2209a0d0-9b77-4ecb-b2ab-b7ae412e7a3f
-
-- Search a file by its checksum: http://esgf-node.llnl.gov/esg-search/search?type=File&checksum=83df8ae93e85e26df797d5f770449470987a4ecd8f2d405159995b5cac9a410c
-
-- Issue a query for all supported facets and their values at one site, while returning no results (note that only facets with one or more values are returned): http://esgf-node.llnl.gov/esg-search/search?facets=*&limit=0&distrib=false
-
-
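-A sketch of the same constraints from Python (requests assumed): a list value repeats the facet in the query string, reproducing the logical OR; a negative facet is appended to the prepared URL by hand, since standard urlencoding would percent-encode the “!” in the parameter name:
-
-```python
-import requests
-
-SEARCH_URL = "http://esgf-node.llnl.gov/esg-search/search"
-
-# Repeating a facet is a logical OR, so a list value does the right thing.
-params = {
-    "project": "obs4MIPs",
-    "variable": ["hus", "ta"],
-    "format": "application/solr+json",
-    "limit": 0,  # counts only, no records
-}
-# Build the URL, then append the negative facet verbatim.
-url = requests.Request("GET", SEARCH_URL, params=params).prepare().url
-url += "&model!=Obs-AIRS"
-print(requests.get(url).json()["response"]["numFound"], "matching records")
-```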
-
-## Facet Listings
-
-The available facet names and values for searching data within a specific project can be listed with a query of the form …project=\<project_name\>&facets=\*&limit=0 (i.e. return no results). Only facet values that match one or more records will be returned.
-
-Examples:
-
-- List all obs4MIPs facet names and values: http://esgf-node.llnl.gov/esg-search/search?project=obs4MIPs&facets=*&limit=0
-
-- List all CMIP5 facet names and values: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&facets=*&limit=0
-
-The same query with no project constraint will return all facet names and values for ALL data across the federation:
-
-- List ALL facet names and values: http://esgf-node.llnl.gov/esg-search/search?facets=*&limit=0
-
-To retrieve a listing of available values for only a few facets, simply specify a comma-separated list of facet names:
-
-- List all values of model, experiment and project throughout the federation: http://esgf-node.llnl.gov/esg-search/search?facets=model,experiment,project&limit=0
-
-- List all values of model, experiment for CMIP5 data: http://esgf-node.llnl.gov/esg-search/search?facets=model,experiment&project=CMIP5&limit=0
-
-
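-The listing arrives in the facet_counts section of the Solr/JSON response as flat \[value, count, value, count, …\] lists; a small sketch (requests assumed) that reshapes them into dictionaries:
-
-```python
-import requests
-
-SEARCH_URL = "http://esgf-node.llnl.gov/esg-search/search"
-
-params = {
-    "project": "obs4MIPs",
-    "facets": "model,experiment",
-    "limit": 0,  # facet listing only, no records
-    "format": "application/solr+json",
-}
-facet_fields = requests.get(SEARCH_URL, params=params).json()["facet_counts"]["facet_fields"]
-
-# Solr returns each facet as a flat [value, count, value, count, ...] list;
-# pair the entries up into a {value: count} dictionary.
-for name, flat in facet_fields.items():
-    print(name, dict(zip(flat[::2], flat[1::2])))
-```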
-
-## Temporal Coverage Queries
-
-The keyword parameters start= and/or end= can be used to query for data with temporal coverage that overlaps the specified range. The parameter values can either be date-times in the format “YYYY-MM-DDTHH:MM:SSZ” (UTC ISO 8601 format), or special values supported by the Solr DateMath syntax.
-
-Examples:
-
-- Search for data in the past year: http://esgf-node.llnl.gov/esg-search/search?start=NOW-1YEAR (translates into the constraint datetime_stop:\[NOW-1YEAR TO \*\] or datetime_stop \> NOW-1YEAR)
-
-- Search for data before the year 2000: http://esgf-node.llnl.gov/esg-search/search?end=2000-01-01T00:00:00Z (translates into the constraint datetime_start:\[\* TO 2000-01-01T00:00:00Z\] or datetime_start \< 2000-01-01)
-
-
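-A sketch of a temporal query from Python (requests assumed):
-
-```python
-import requests
-
-SEARCH_URL = "http://esgf-node.llnl.gov/esg-search/search"
-
-# Coverage overlapping the 1990s; Solr DateMath values such as "NOW-1YEAR"
-# are accepted in place of the ISO 8601 timestamps.
-params = {
-    "project": "CMIP5",
-    "start": "1990-01-01T00:00:00Z",
-    "end": "2000-01-01T00:00:00Z",
-    "format": "application/solr+json",
-    "limit": 0,
-}
-print(requests.get(SEARCH_URL, params=params).json()["response"]["numFound"])
-```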
-
-## Spatial Coverage Queries
-
-The keyword parameter bbox=\[west, south, east, north\] can be used to query for data with spatial coverage that overlaps the given bounding box. As usual, the parameter value must be URL-encoded.
-
-Examples:
-
-- http://esgf-node.llnl.gov/esg-search/search?bbox=%5B-10,-10,+10,+10%5D (translates to: east_degrees:\[-10 TO \*\] AND north_degrees:\[-10 TO \*\] AND west_degrees:\[\* TO 10\] AND south_degrees:\[\* TO 10\])
-
-Please note, though, that NOT all ESGF records contain geo-spatial information; records without it will never be returned by a geo-spatial search.
-
-
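-The same bounding-box query issued from Python (requests assumed, which takes care of the URL encoding of the brackets and signs):
-
-```python
-import requests
-
-SEARCH_URL = "http://esgf-node.llnl.gov/esg-search/search"
-
-# [west, south, east, north], as in the example above.
-params = {
-    "bbox": "[-10,-10,+10,+10]",
-    "format": "application/solr+json",
-    "limit": 0,
-}
-result = requests.get(SEARCH_URL, params=params).json()["response"]
-print(result["numFound"], "records with overlapping spatial coverage")
-```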
-
-## Distributed Queries
-
-The keyword parameter distrib= can be used to control whether the query is executed against the local Index Node only, or distributed to all other Nodes in the federation. If not specified, the default value distrib=true is assumed.
-
-Examples:
-
-- Search for all datasets in the federation: http://esgf-node.llnl.gov/esg-search/search?distrib=true
-
-- Search for all datasets at one Node only: http://esgf-node.llnl.gov/esg-search/search?distrib=false
-
-
-
-## Shard Queries
-
-By default, a distributed query (distrib=true) targets all ESGF Nodes in the current peer group, i.e. all nodes that are listed in the local configuration file /esg/config/esgf_shards.xml, which is continuously updated by the local node manager to reflect the latest state of the federation. It is possible to execute a distributed search that targets only one or more specific nodes, by specifying them in the “shards” parameter, as such: shards=hostname1:port1/solr,hostname2:port2/solr,…. Note that the explicit shards value is ignored if distrib=false (but distrib=true by default if not otherwise specified).
-
-Examples:
-
-- Query for CMIP5 data at the PCMDI and CEDA sites only: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&shards=pcmdi.llnl.gov/solr,esgf-index1.ceda.ac.uk/solr
-
-- Query for all files belonging to a given dataset at one site only: http://esgf-node.llnl.gov/esg-search/search?type=File&shards=esgf-node.llnl.gov/solr&dataset_id=obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608\|esgf-data.llnl.gov
-
-
-
-## Replica Queries
-
-Replicas (Datasets and Files) are distinguished from the original record (a.k.a. the “master”) in the Solr index by the value of two special keywords:
-
-- replica: a flag that is set to false for master records, true for replica records.
-
-- master_id: a string that is identical for the master and all replicas of a given logical record (Dataset or File).
-
-By default, a query returns all records (masters and replicas) matching the search criteria, i.e. no replica=… constraint is used. To return only master records, use replica=false; to return only replicas, use replica=true. To search for all identical Datasets or Files (i.e. for the master AND replicas of a Dataset or File), use master_id=….
-
-Examples:
-
-- Search for all datasets in the system (masters and replicas): http://esgf-node.llnl.gov/esg-search/search
-
-- Search for just master datasets, no replicas: http://esgf-node.llnl.gov/esg-search/search?replica=false
-
-- Search for just replica datasets, no masters: http://esgf-node.llnl.gov/esg-search/search?replica=true
-
-- Search for the master AND replicas of a given dataset: http://esgf-node.llnl.gov/esg-search/search?master_id=cmip5.output1.LASG-CESS.FGOALS-g2.midHolocene.3hr.land.3hr.r1i1p1
-
-- Search for the master and replicas of a given file: http://esgf-node.llnl.gov/esg-search/search?type=File&master_id=cmip5.output1.MIROC.MIROC5.decadal1978.mon.ocean.Omon.r4i1p1.wfo_Omon_MIROC5_decadal1978_r4i1p1_197901-198812.nc
-
-
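-A sketch comparing master and replica counts for one logical dataset (requests assumed; the num_found helper is illustrative, not part of the API):
-
-```python
-import requests
-
-SEARCH_URL = "http://esgf-node.llnl.gov/esg-search/search"
-
-def num_found(**constraints):
-    """Return only the hit count for the given facet constraints."""
-    params = {"format": "application/solr+json", "limit": 0, **constraints}
-    return requests.get(SEARCH_URL, params=params).json()["response"]["numFound"]
-
-dataset = "cmip5.output1.LASG-CESS.FGOALS-g2.midHolocene.3hr.land.3hr.r1i1p1"
-print("master + replicas:", num_found(master_id=dataset))
-print("master only:", num_found(master_id=dataset, replica="false"))
-print("replicas only:", num_found(master_id=dataset, replica="true"))
-```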
-
-## Latest and Version Queries
-
-By default, a query to the ESGF search services will return all versions of the matching records (Datasets or Files). To return only the latest, up-to-date version, include latest=true. To return a specific version, use version=…. Using latest=false will return only datasets that were superseded by newer versions.
-
-Examples:
-
-- Search for all latest CMIP5 datasets: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&latest=true
-
-- Search for all versions of a given dataset: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&master_id=cmip5.output1.MOHC.HadCM3.decadal1972.day.atmos.day.r10i2p1&facets=version
-
-- Search for a specific version of a given dataset: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&master_id=cmip5.output1.NSF-DOE-NCAR.CESM1-CAM5-1-FV2.historical.mon.atmos.Amon.r1i1p1&version=20120712
-
-
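-A sketch of the version-facet query above from Python (requests assumed):
-
-```python
-import requests
-
-SEARCH_URL = "http://esgf-node.llnl.gov/esg-search/search"
-
-# Request facet counts for "version" to list every published version
-# of one logical dataset.
-params = {
-    "project": "CMIP5",
-    "master_id": "cmip5.output1.MOHC.HadCM3.decadal1972.day.atmos.day.r10i2p1",
-    "facets": "version",
-    "limit": 0,
-    "format": "application/solr+json",
-}
-flat = requests.get(SEARCH_URL, params=params).json()["facet_counts"]["facet_fields"]["version"]
-print("available versions:", flat[::2])  # even slots hold the values
-```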
-
-## Retracted Queries
-
-NOTE: this feature is NOT yet released
-
-Retracted datasets are marked by “retracted=true”, and also have the flag “latest=false” set. Consequently, retracted datasets are automatically NOT included in any search for the latest version of the data (“latest=true”), while they are automatically included in searches that span all versions (no “latest” constraint). To search specifically for only retracted datasets, use the constraint “retracted=true”.
-
-Example:
-
-- Search for all retracted datasets in the CMIP5 project, across all nodes: https://esgf-node.llnl.gov/esg-search/search?project=CMIP5&retracted=true
-
-
-
-## Minimum and Maximum Version Queries
-
-NOTE: this feature is NOT yet released
-
-The special keywords “min_version” and “max_version” can be used to query for all records that have a version greater than or equal to, or less than or equal to, a given numerical value. Because ESGF versions are often expressed as dates of the format YYYYMMDD, this makes it possible to query for all records whose version is on or after, or on or before, a certain date. The two constraints can be combined with each other to specify a version (i.e. date) range, and can also be combined with other constraints.
-
-Examples:
-
-- All datasets with version less than a given date: https://esgf-node.llnl.gov/esg-search/search?max_version=20150101
-
-- All obs4MIPs datasets with version between two dates: http://esgf-node.llnl.gov/esg-search/search?min_version=20120101&max_version=20131231&project=obs4MIPs
-
-
-
-## Results Pagination
-
-By default, a query to the search service will return the first 10 records matching the given constraints. The offset into the returned results, and the total number of returned results, can be changed through the keyword parameters offset= and limit=. The system imposes a maximum value of limit \<= 10,000.
-
-Examples:
-
-- Query for 100 CMIP5 datasets in the system: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&limit=100
-
-- Query for the next 100 CMIP5 datasets in the system: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&limit=100&offset=100
-
-
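-A sketch that pages through a full result set by advancing offset= until numFound is exhausted (requests assumed):
-
-```python
-import requests
-
-SEARCH_URL = "http://esgf-node.llnl.gov/esg-search/search"
-
-params = {
-    "project": "obs4MIPs",
-    "fields": "id",  # only the ids are needed here
-    "limit": 100,
-    "format": "application/solr+json",
-}
-offset, ids = 0, []
-while True:
-    page = requests.get(SEARCH_URL, params={**params, "offset": offset}).json()["response"]
-    ids.extend(doc["id"] for doc in page["docs"])
-    offset += len(page["docs"])
-    if not page["docs"] or offset >= page["numFound"]:
-        break
-print(f"collected {len(ids)} dataset ids")
-```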
-
-## Output Format
-
-The keyword parameter format= can be used to request results in a specific output format (the examples below show the URL-encoded values). Currently the only available options are Solr/XML (the default) and Solr/JSON.
-
-Examples:
-
-- Request results in Solr XML format: http://esgf-node.llnl.gov/esg-search/search?format=application%2Fsolr%2Bxml
-
-- Request results in Solr JSON format: http://esgf-node.llnl.gov/esg-search/search?format=application%2Fsolr%2Bjson
-
-
-
-## Returned Metadata Fields
-
-By default, all available metadata fields are returned for each result. The keyword parameter fields= can be used to limit the metadata fields returned in the response document for each matching result. The list must be comma-separated, and white spaces are ignored. Use fields=\* to return all fields (the same as not specifying it, since it is the default). Note that the pseudo field “score” is always appended to any fields list.
-
-Examples:
-
-- Return all available metadata fields for CMIP5 datasets: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&fields=*
-
-- Return only the “model” and “experiment” fields for CMIP5 datasets: http://esgf-node.llnl.gov/esg-search/search?project=CMIP5&fields=model,experiment
-
-
-
-## Identifiers
-
-Each search record in the system is assigned the following identifiers (all of type string):
-
-- id: universally unique for each record across the federation, i.e. specific to each Dataset or File, version and replica (and the data node storing the data). It is intended to be “opaque”, i.e. it should not be parsed by clients to extract any information.
-
-  - Dataset example: id=obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608\|esgf-data.llnl.gov
-
-  - File example: id=obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608.tro3Stderr_TES_L3_tbd_200507-200912.nc\|esgf-data.llnl.gov
-
-- master_id: same for all replicas and versions across the federation. When parsing THREDDS catalogs, it is extracted from the properties “dataset_id” or “file_id”.
-
-  - Dataset example: obs4MIPs.NASA-JPL.TES.tro3.mon (for a Dataset)
-
-  - File example: obs4MIPs.NASA-JPL.TES.tro3.mon.tro3Stderr_TES_L3_tbd_200507-200912.nc
-
-- instance_id: same for all replicas across the federation, but specific to each version. When parsing THREDDS catalogs, it is extracted from the ID attribute of the corresponding THREDDS catalog element (for both Datasets and Files).
-
-  - Dataset example: obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608
-
-  - File example: obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608.tro3Stderr_TES_L3_tbd_200507-200912.nc
-
-Note also that the record version is the same for all replicas of that record, but different across versions. Examples:
-
-- Dataset example: version=20110608
-
-- File example: version=1
-
-
-
-## Access URLs
-
-In the Solr output document returned by a search, URLs that are access points for Datasets and Files are encoded as a 3-tuple of the form “url\|mime type\|service name”, where the fields are separated by the “pipe” (\|) character, and the “mime type” and “service name” are chosen from the ESGF controlled vocabulary.
-
-Example of Dataset access URLs:
-
-- THREDDS catalog: http://esgf-data.llnl.gov/thredds/catalog/esgcet/1/obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608.xml#obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608\|application/xml+thredds\|THREDDS
-
-- LAS server: http://esgf-node.llnl.gov/las/getUI.do?catid=0C5410C250379F2D139F978F7BF48BB9_ns_obs4MIPs.NASA-JPL.TES.tro3.mon.v20110608\|application/las\|LAS
-
-Example of File access URLs:
-
-- HTTP download: http://esgf-data.llnl.gov/thredds/fileServer/esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc\|application/netcdf\|HTTPServer
-
-- GridFTP download: gsiftp://esgf-data.llnl.gov:2811//esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc\|application/gridftp\|GridFTP
-
-- OpenDAP download: http://esgf-data.llnl.gov/thredds/dodsC/esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc.html\|application/opendap-html\|OPENDAP
-
-- Globus As-A-Service download: globus:e3f6216e-063e-11e6-a732-22000bf2d559/esg_dataroot/obs4MIPs/observations/atmos/tro3Stderr/mon/grid/NASA-JPL/TES/v20110608/tro3Stderr_TES_L3_tbd_200507-200912.nc\|Globus\|Globus
-
-
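-A sketch that splits the 3-tuples into a service-to-URL map and, if an OPENDAP endpoint is present, opens it with xarray (requests and xarray assumed; note that the catalog form of the OPENDAP URL above carries a trailing “.html” that must be dropped first):
-
-```python
-import requests
-
-SEARCH_URL = "http://esgf-node.llnl.gov/esg-search/search"
-
-params = {
-    "type": "File",
-    "project": "obs4MIPs",
-    "variable": "hus",
-    "format": "application/solr+json",
-    "limit": 1,
-}
-doc = requests.get(SEARCH_URL, params=params).json()["response"]["docs"][0]
-
-# Each entry of the "url" field is a "url|mime type|service name" 3-tuple.
-endpoints = {}
-for entry in doc["url"]:
-    url, mime, service = entry.split("|")
-    endpoints[service] = url
-print(endpoints)
-
-# An OPENDAP endpoint can be opened lazily with xarray once the
-# trailing ".html" is removed.
-if "OPENDAP" in endpoints:
-    import xarray as xr
-    ds = xr.open_dataset(endpoints["OPENDAP"].removesuffix(".html"))
-```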
-
-## Wget scripting
-
-The same RESTful API that is used to query the ESGF search services can also be used, with minor modifications, to generate a Wget script to download all files matching the given constraints. Specifically, each ESGF Index Node exposes the following URL for generating Wget scripts:
-
-    http://<base_url>/wget?[keyword parameters as (name, value) pairs][facet parameters as (name,value) pairs]
-
-where again \<base_url\> is the base URL of the search service at a given Index Node. As for searching, all parameters (keyword and facet) are optional, and the value of all parameters must be URL-encoded, so that the complete search URL is well formed.
-
-The only syntax differences with respect to the search URL are:
-
-- The keyword parameter type= is not allowed, as the wget URL always assumes type=File.
-
-- The keyword parameter format= is not allowed, as the wget URL always returns a shell script as the response document.
-
-- The keyword parameter limit= is assigned a default value of limit=1000 (and must still be limit \<= 10,000).
-
-- The keyword parameter download_structure= is used for defining a relative directory structure for the download by using the facet values (i.e. of Files, not Datasets).
-
-- The keyword parameter download_emptypath= is used to define what to do when download_structure is set and the facet returned has no value (for example, when mixing files from CMIP5 and obs4MIPs, selecting instrument as a facet will result in all CMIP5 files returning an empty value).
-
-A typical workflow pattern consists of first identifying all datasets or files matching some scientific criteria, then changing the request URL from “/search?” to “/wget?” to generate the corresponding shell scripts for bulk download of files.
-
-Examples:
-
-- Download all obs4MIPs files from the JPL node with variable “hus”: http://esgf-node.llnl.gov/esg-search/wget?variable=hus&project=obs4MIPs&distrib=false
-
-- Download the files as in the previous example, and organize them in a directory structure such as project/product/institute/time_frequency: http://esgf-node.llnl.gov/esg-search/wget?variable=hus&project=obs4MIPs&distrib=false&download_structure=project,product,institute,time_frequency
-
-For more information, see also the Wget FAQ.
-
-
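-A sketch that fetches such a script from Python instead of the browser (requests assumed; the output filename is arbitrary):
-
-```python
-import requests
-
-WGET_URL = "http://esgf-node.llnl.gov/esg-search/wget"
-
-# Same facet constraints as a /search request; the response body is a
-# ready-to-run shell script.
-params = {
-    "project": "obs4MIPs",
-    "variable": "hus",
-    "distrib": "false",
-    "download_structure": "project,product,institute,time_frequency",
-}
-reply = requests.get(WGET_URL, params=params)
-reply.raise_for_status()
-with open("download_hus.sh", "w") as f:
-    f.write(reply.text)
-# Then run: bash download_hus.sh
-```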
diff --git a/src/beaker_climate/climate_python/context.py b/src/beaker_climate/climate_python/context.py
deleted file mode 100644
index 0253931..0000000
--- a/src/beaker_climate/climate_python/context.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import logging
-import os
-from typing import TYPE_CHECKING, Any, Dict
-
-from archytas.tool_utils import LoopControllerRef
-
-from beaker_kernel.lib import BeakerContext
-from beaker_kernel.lib.utils import intercept
-
-from .agent import ClimateDataUtilityAgent
-
-if TYPE_CHECKING:
-    from beaker_kernel.lib import BeakerContext
-
-logger = logging.getLogger(__name__)
-
-class ClimateDataUtilityContext(BeakerContext):
-    """
-    Climate Data Utility Context Class
-    """
-
-    compatible_subkernels = ["python3"]
-    SLUG = "beaker_climate"
-
-    def __init__(self, beaker_kernel: "BeakerKernel", config: Dict[str, Any]) -> None:
-        self.climate_data_utility__functions = {}
-        self.config = config
-        self.dataset_map = {}
-        super().__init__(beaker_kernel, ClimateDataUtilityAgent, config)
-
-    def get_auth(self) -> tuple[str, str]:
-        return (os.getenv("AUTH_USERNAME", ""), os.getenv("AUTH_PASSWORD", ""))
-
-    async def setup(self, context_info, parent_header):
-        self.config["context_info"] = context_info
-        for name, dataset in self.config["context_info"].items():
-            dataset_id = dataset.get("hmi_dataset_id", None)
-            filename = dataset.get("filename", None)
-            if dataset_id is None or filename is None:
-                logging.error(f"failed to download dataset from initial context: {dataset}")
-                return
-            await self.download_dataset(name, dataset_id, filename)
-
-    def reset(self):
-        self.dataset_map = {}
-
-    async def auto_context(self):
-        intro = f"""
-        You are a software engineer working on a climate dataset operations tool in a Jupyter notebook.
-
-        Your goal is to help users perform various operations on climate datasets, such as regridding NetCDF datasets and plotting/previewing NetCDF files.
-        Additionally, the tools provide functionality to retrieve datasets from a storage server.
-
-        Please provide assistance to users with their queries related to climate dataset operations.
-
-        Remember to provide accurate information and avoid guessing if you are unsure of an answer.
-        """
-
-        return intro
-
-    @intercept()
-    async def download_dataset_request(self, message):
-        """
-        This is used to download a dataset from the HMI server.
-        """
-
-        content = message.content
-        uuid = content.get("uuid")
-        filename = content.get("filename")
-        if filename is None:
-            filename = f"{uuid}.nc"
-        variable_name = content.get("variable_name") or "dataset_" + str(len(self.dataset_map))
-
-        await self.download_dataset(variable_name, uuid, filename)
-
-    async def download_dataset(self, variable_name, hmi_dataset_id, filename):
-        code = self.get_code(
-            "hmi_dataset_download",
-            {"auth": self.get_auth(), "id": hmi_dataset_id, "filename": filename, "variable_name": variable_name},
-        )
-
-        self.dataset_map[variable_name] = {"id": hmi_dataset_id, "variable_name": variable_name}
-        await self.execute(
-            code,
-            parent_header={},
-        )
-
-    @intercept()
-    async def save_dataset_request(self, message):
-        """
-        This tool is used to save a dataset to the HMI server.
-        The 'dataset' argument is the variable name of the dataset to save in the notebook environment.
- """ - - content = message.content - dataset = content.get("dataset") - new_dataset_filename = content.get("filename") - - create_code = self.get_code( - "hmi_create_dataset", - { - "identifier": new_dataset_filename, - }, - ) - create_response = await self.evaluate( - create_code, - parent_header={}, - ) - - create_response_object = create_response.get("return") - - if isinstance(create_response_object, str): - return create_response_object - - id = create_response_object.get("id") - - persist_code = self.get_code( - "hmi_dataset_put", - { - "data": dataset, - "id": id, - "filename": f"{new_dataset_filename}", - "auth": self.get_auth(), - }, - ) - - result = await self.evaluate( - persist_code, - parent_header={}, - ) - - persist_status = result.get("return") - - self.beaker_kernel.send_response( - "iopub", - "save_dataset_response", - {"dataset_create_status": create_response_object, "file_upload_status": persist_status}, - ) \ No newline at end of file diff --git a/src/beaker_climate/climate_python/utils/check_gemini_cache.py b/src/beaker_climate/climate_python/utils/check_gemini_cache.py deleted file mode 100644 index 9ecbead..0000000 --- a/src/beaker_climate/climate_python/utils/check_gemini_cache.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import google.generativeai as genai -from google.generativeai import caching -import datetime -import time - -genai.configure(api_key=os.environ['GEMINI_API_KEY']) - -for c in genai.caching.CachedContent.list(): - print(c) - -CachedContent( - name='cachedContents/t2qym525vve1', - model='models/gemini-1.5-flash-8b', - display_name='api_assistant_esgf_client', - usage_metadata={ - 'total_token_count': 21279, - }, - create_time=2024-11-18 15:31:08.973119+00:00, - update_time=2024-11-18 15:31:08.973119+00:00, - expire_time=2024-11-18 16:31:08.658662+00:00 -) - -# delete a cache with -c.delete() \ No newline at end of file diff --git a/src/beaker_climate/climate_python/utils/esgf_docs.py b/src/beaker_climate/climate_python/utils/esgf_docs.py deleted file mode 100644 index 652eb44..0000000 --- a/src/beaker_climate/climate_python/utils/esgf_docs.py +++ /dev/null @@ -1,200 +0,0 @@ -import requests -import os -from pathlib import Path -import nbformat -from bs4 import BeautifulSoup -import pypandoc - -def fetch_github_content(url, raw_base="https://raw.githubusercontent.com"): - """Fetch content from GitHub, converting the regular GitHub URL to raw content URL.""" - if "github.com" in url: - parts = url.split("github.com/")[1].split("/blob/" if "/blob/" in url else "/tree/") - raw_url = f"{raw_base}/{parts[0]}/{parts[1]}" - else: - raw_url = url - - response = requests.get(raw_url) - response.raise_for_status() - return response.text - -def process_rst_to_md(rst_content): - """Convert RST content to Markdown using pypandoc with extra options.""" - try: - extra_args = [ - '--wrap=none', - '--columns=1000', - '--markdown-headings=atx', - '--tab-stop=2', - '--standalone' - ] - - md_content = pypandoc.convert_text( - rst_content, - 'gfm', - format='rst', - extra_args=extra_args - ) - return md_content - except Exception as e: - print(f"Error converting RST to MD: {e}") - return rst_content - -def process_notebook(notebook_content): - """Extract code examples and markdown from Jupyter notebook.""" - nb = nbformat.reads(notebook_content, nbformat.NO_CONVERT) - - md_output = [] - for cell in nb.cells: - if cell.cell_type == 'markdown': - md_output.append(cell.source) - elif cell.cell_type == 'code': - md_output.append(f"```python\n{cell.source}\n```") - if 
hasattr(cell, 'outputs') and cell.outputs: - for output in cell.outputs: - if 'text' in output: - md_output.append(f"```\n{output['text']}\n```") - elif 'data' in output and 'text/plain' in output['data']: - md_output.append(f"```\n{output['data']['text/plain']}\n```") - - return "\n\n".join(md_output) - -def fetch_esgf_api_docs(): - """Fetch and process the ESGF API documentation.""" - api_url = "https://esgf.github.io/esg-search/ESGF_Search_RESTful_API.html" - response = requests.get(api_url) - soup = BeautifulSoup(response.text, 'html.parser') - - # Extract the main content - content = soup.find('div', {'class': 'document'}) - if content: - # Convert to markdown - md_content = pypandoc.convert_text( - str(content), - 'gfm', - format='html', - extra_args=['--wrap=none', '--columns=1000'] - ) - return md_content - return "" - -def fetch_api_docs(): - """Fetch and process the ESGF Python Client API documentation.""" - api_docs_url = "https://esgf-pyclient.readthedocs.io/en/latest/api.html" - - try: - response = requests.get(api_docs_url) - soup = BeautifulSoup(response.text, 'html.parser') - - # Extract only the main content area (right side) - api_content = soup.find('div', {'class': 'body'}) # This gets just the main content - if not api_content: - return "" - - # Process the content to maintain formatting - content_md = [] - - # Add the main title - content_md.append("# API Reference\n\n") - - # Process each section - for section in api_content.find_all(['div', 'section'], recursive=False): - # Handle section titles - title = section.find('h1') or section.find('h2') or section.find('h3') - if title: - level = int(title.name[1]) # h1 -> 1, h2 -> 2, etc. - content_md.append(f"{'#' * level} {title.text.strip()}\n\n") - - # Handle code blocks - for code in section.find_all('pre'): - content_md.append(f"```python\n{code.text.strip()}\n```\n\n") - - # Handle warning boxes - for warning in section.find_all('div', {'class': 'warning'}): - content_md.append(f"> **Warning:**\n> {warning.text.strip()}\n\n") - - # Handle parameter lists - for var_list in section.find_all('dl'): - for dt, dd in zip(var_list.find_all('dt'), var_list.find_all('dd')): - param = dt.text.strip() - desc = dd.text.strip() - content_md.append(f"- **{param}** – {desc}\n") - content_md.append("\n") - - # Handle regular paragraphs - for p in section.find_all('p', recursive=False): - content_md.append(f"{p.text.strip()}\n\n") - - return "\n".join(content_md) - - except Exception as e: - print(f"Error processing API documentation: {e}") - return "" - -def fetch_and_process_docs(): - """Main function to fetch and process documentation.""" - # URLs - concepts_url = "https://github.com/ESGF/esgf-pyclient/blob/master/docs/source/concepts.rst" - api_docs_url = "https://esgf-pyclient.readthedocs.io/en/latest/api.html" - demo_notebooks_url = "https://github.com/ESGF/esgf-pyclient/tree/master/notebooks/demo" - examples_notebooks_url = "https://github.com/ESGF/esgf-pyclient/tree/master/notebooks/examples" - - # Process documentation - md_content = ["# ESGF Python Client Documentation\n\n"] - - # Process concepts.rst - try: - rst_content = fetch_github_content(concepts_url) - md_content.append(process_rst_to_md(rst_content)) - md_content.append("\n---\n") - except Exception as e: - print(f"Error processing concepts.rst: {e}") - - # Add Python Client API documentation - md_content.append("\n# ESGF Python Client API Reference\n\n") - api_docs = fetch_api_docs() - if api_docs: - md_content.append(api_docs) - md_content.append("\n---\n") - - 
# Process notebooks from both directories - md_content.append("\n# Code Examples from Notebooks\n\n") - - for notebooks_url in [demo_notebooks_url, examples_notebooks_url]: - try: - notebooks_response = requests.get(notebooks_url) - notebooks_soup = BeautifulSoup(notebooks_response.text, 'html.parser') - - dir_name = "Demo" if "demo" in notebooks_url else "Examples" - md_content.append(f"\n## {dir_name} Notebooks\n\n") - - for link in notebooks_soup.find_all('a', href=True): - if link['href'].endswith('.ipynb'): - notebook_url = f"https://github.com{link['href']}" - try: - notebook_content = fetch_github_content(notebook_url) - md_content.append(f"\n### {link['href'].split('/')[-1]}\n") - md_content.append(process_notebook(notebook_content)) - md_content.append("\n---\n") - except Exception as e: - print(f"Error processing notebook {notebook_url}: {e}") - except Exception as e: - print(f"Error accessing notebooks directory {notebooks_url}: {e}") - - # Add ESGF API documentation - md_content.append("\n# General Information about the ESGF API\n\n") - try: - api_docs = fetch_esgf_api_docs() - md_content.append(api_docs) - except Exception as e: - print(f"Error fetching ESGF API documentation: {e}") - - # Write to file - output_path = Path("esgf_documentation.md") - output_path.write_text("\n".join(md_content)) - - return output_path - -if __name__ == "__main__": - output_file = fetch_and_process_docs() - print(f"Documentation has been saved to {output_file}") -