From 9b7aba0b5f22cf57744d1d67eef23b74e3b81847 Mon Sep 17 00:00:00 2001
From: "Karl N. Kappler"
Date: Sat, 3 Jun 2023 13:07:52 -0700
Subject: [PATCH] Towards Task 1

- modify stage 01 to use get_summary_table_filename
- modify stage 01 to add support for remotes_2
- stage 3 is in dev - not working yet
- stage 4 in dev
- add EXPERIMENT_PATH as a place to store inventory/metadata (dataless h5s)
- factor get_remotes_from_tf_2 out of get_remotes_from_tf
- add support for summary_table filename make/load

issue #252
---
 .../earthscope/01_test_load_spud_tfs.py            |  23 ++-
 .../03_test_station_inventory_valid.py             | 183 ++++++++++++++++++
 ...py => 04_test_download_from_earthscope.py}      |   5 +-
 aurora/test_utils/earthscope/helpers.py            |  74 +++++--
 4 files changed, 262 insertions(+), 23 deletions(-)
 create mode 100644 aurora/test_utils/earthscope/03_test_station_inventory_valid.py
 rename aurora/test_utils/earthscope/{02_test_download_from_earthscope.py => 04_test_download_from_earthscope.py} (97%)

diff --git a/aurora/test_utils/earthscope/01_test_load_spud_tfs.py b/aurora/test_utils/earthscope/01_test_load_spud_tfs.py
index 0c424188..c3faae62 100644
--- a/aurora/test_utils/earthscope/01_test_load_spud_tfs.py
+++ b/aurora/test_utils/earthscope/01_test_load_spud_tfs.py
@@ -15,27 +15,28 @@
 from aurora.test_utils.earthscope.helpers import SPUD_XML_CSV
 from aurora.test_utils.earthscope.helpers import SPUD_XML_PATH
+from aurora.test_utils.earthscope.helpers import SUMMARY_TABLES_PATH
 from aurora.test_utils.earthscope.helpers import DATA_PATH
 from aurora.test_utils.earthscope.helpers import load_xml_tf
 from aurora.test_utils.earthscope.helpers import get_remotes_from_tf
+from aurora.test_utils.earthscope.helpers import get_remotes_from_tf_2
 from aurora.test_utils.earthscope.helpers import get_rr_type
+from aurora.test_utils.earthscope.helpers import get_summary_table_filename
 
-SPUD_DF = pd.read_csv(SPUD_XML_CSV)
-now = datetime.datetime.now().__str__().split(".")[0].replace(" ","_")
-now_str = now.replace(":","")
-SPUD_XML_REVIEW_CSV_NAME = f"spud_xml_review_{now_str}.csv"
-SPUD_XML_REVIEW_CSV_PATH = SPUD_XML_PATH.joinpath(SPUD_XML_REVIEW_CSV_NAME)
-
+STAGE_ID = 1
 
 def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
-                    results_csv=SPUD_XML_REVIEW_CSV_PATH):
+                    results_csv=""):
     """
 
     :param xml_source_column: "data_xml_path" or "emtf_xml_path"
     specifies which of the two possible collections of xml files to use as source
     :return:
     """
+    if not results_csv:
+        results_csv = get_summary_table_filename(STAGE_ID)
+
     t0 = time.time()
     spud_df = pd.read_csv(SPUD_XML_CSV)
 
@@ -45,6 +46,7 @@ def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
         spud_df[f"{xml_source}_error_message"] = ""
         spud_df[f"{xml_source}_remote_ref_type"] = ""
         spud_df[f"{xml_source}_remotes"] = ""
+        spud_df[f"{xml_source}_remotes_2"] = ""
 
     for i_row, row in spud_df.iterrows():
         # if i_row<750:
@@ -57,6 +59,8 @@ def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
             spud_df[f"{xml_source}_remote_ref_type"].iat[i_row] = rr_type
             remotes = get_remotes_from_tf(spud_tf)
             spud_df[f"{xml_source}_remotes"].iat[i_row] = ",".join(remotes)
+            remotes2 = get_remotes_from_tf_2(spud_tf)
+            spud_df[f"{xml_source}_remotes_2"].iat[i_row] = ",".join(remotes2)
         except Exception as e:
             spud_df[f"{xml_source}_error"].at[i_row] = True
@@ -79,8 +83,9 @@ def main():
     # review_csv_name = "spud_xml_review_2023-05-28_13:21:18.csv"
     # review_csv_path = SPUD_XML_PATH.joinpath(review_csv_name)
     # df = pd.read_csv(review_csv)
-
-    results_df = pd.read_csv(SPUD_XML_REVIEW_CSV_PATH)
print("OK") + # review_csv_name = "spud_xml_review_2023-05-28_13:21:18.csv" + # results_df = pd.read_csv(review_csv_name) print("summarize") diff --git a/aurora/test_utils/earthscope/03_test_station_inventory_valid.py b/aurora/test_utils/earthscope/03_test_station_inventory_valid.py new file mode 100644 index 00000000..409ce635 --- /dev/null +++ b/aurora/test_utils/earthscope/03_test_station_inventory_valid.py @@ -0,0 +1,183 @@ +""" + +Flow +Use stage 1 output csv + +We will iterate over rows of the CSV, selecting only rows where the name is of the form: +18057859_EM_MH010.xml +uid_NETWORK_STATION.xml + +For each such row, we make a list of stations that were identified +as self or RR + +For every station in list: + get metadata + show number of channels + any other pertinent information +""" + + + +import numpy as np +import pandas as pd +import pathlib +import time + +from matplotlib import pyplot as plt +from pathlib import Path + +from aurora.sandbox.mth5_helpers import get_experiment_from_obspy_inventory +from aurora.sandbox.mth5_helpers import mth5_from_experiment + +from aurora.test_utils.earthscope.helpers import build_request_df +from aurora.test_utils.earthscope.helpers import EXPERIMENT_PATH +from aurora.test_utils.earthscope.helpers import get_most_recent_review +from mth5.mth5 import MTH5 +from mth5.clients import FDSN, MakeMTH5 +from mt_metadata.transfer_functions.core import TF +from mt_metadata import TF_XML + + + + + + +def metadata_check(request_df): + fdsn_object = FDSN(mth5_version='0.2.0') + fdsn_object.client = "IRIS" + inv = fdsn_object.get_inventory_from_df(request_df, data=False) + experiment = get_experiment_from_obspy_inventory(inv[0]) + mth5 = mth5_from_experiment(experiment, 'qq.h5') + mth5.channel_summary.summarize() + channel_summary_df = mth5.channel_summary.to_dataframe() + return channel_summary_df + +# + + +data_coverage_csv_name = "local_data_coverage.csv" +data_coverage_csv_path = DATA_PATH.joinpath(data_coverage_csv_name) +GET_REMOTES_FROM = "spud_xml_review" # tf_xml + + +from aurora.test_utils.earthscope.helpers import get_most_recent_review + +STAGE_ID = 2 +def initialize_metadata_df(): + local_metadata_coverage_df = pd.DataFrame(columns=["station_id", "network_id", "filename", "filesize"]) + pass + + + +def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"], + source_csv=None, + results_csv=None): + """ + + :param xml_source_column:"data_xml_path" or "emtf_xml_path" + specifies which of the two possible collections of xml files to use as source + :return: + """ + t0 = time.time() + if not source_csv: + source_csv = get_most_recent_review(1) + source_df = pd.read_csv(source_csv) + + local_data_coverage_df = pd.read_csv(data_coverage_csv_path) + + xml_source = "data_xml_path" + spud_csv_name = "spud_xml_review_2023-05-29_15:08:25.csv" + spud_csv_path = SPUD_XML_PATH.joinpath(spud_csv_name) + spud_df = pd.read_csv(spud_csv_path) + + for i_row, row in spud_df.iterrows(): + if row[f"{xml_source}_error"] is True: + print(f"Skipping {row} for now, tf not reading in") + continue + + xml_path = pathlib.Path(row[xml_source]) + if "__" in xml_path.name: + print(f"Skipping {row[xml_source]} for now, Station/network unknown") + continue + + + [xml_uid, network_id, station_id] = xml_path.stem.split("_") + extract remotes + if GET_REMOTES_FROM == "tf_xml": + tf = load_xml_tf(xml_path) + rr_type = get_rr_type(tf) + remotes = get_remotes_from_tf(tf) + elif GET_REMOTES_FROM == "spud_xml_review": + remotes = row.data_xml_path_remotes.split(",") + if 
+        if remotes:
+            print(f"remotes: {remotes} ")
+        all_stations = remotes + [station_id, ]
+
+        for station in all_stations:
+            request_df = build_request_df([station, ], network_id, start=None, end=None)
+            print(request_df)
+            fdsn_object = FDSN(mth5_version='0.2.0')
+            fdsn_object.client = "IRIS"
+
+            expected_file_name = EXPERIMENT_PATH.joinpath(fdsn_object.make_filename(request_df))
+            sub_coverage_df = local_data_coverage_df[local_data_coverage_df["filename"] == str(expected_file_name)]
+            if len(sub_coverage_df):
+                print(f"Already have data for {station}-{network_id}")
+                print(f"{sub_coverage_df}")
+                continue
+            try:
+                mth5_filename = fdsn_object.make_mth5_from_fdsn_client(request_df,
+                                                                       interact=False,
+                                                                       path=DATA_PATH)
+                new_row = {"station_id": station,
+                           "network_id": network_id,
+                           "filename": mth5_filename,
+                           "filesize": mth5_filename.stat().st_size,
+                           "exception": "",
+                           "error_message": ""}
+                local_data_coverage_df = local_data_coverage_df.append(new_row, ignore_index=True)
+                local_data_coverage_df.to_csv(data_coverage_csv_path, index=False)
+            except Exception as e:
+                print(f"{e}")
+                new_row = {"station_id": station,
+                           "network_id": network_id,
+                           "filename": "",  # expected_file_name
+                           "filesize": "",  # expected_file_name.stat().st_size,
+                           "exception": e.__class__.__name__,
+                           "error_message": e.args[0]}
+                local_data_coverage_df = local_data_coverage_df.append(new_row, ignore_index=True)
+                local_data_coverage_df.to_csv(data_coverage_csv_path, index=False)
+        # ADD A ROW TO A DF AS WE GO
+
+    # print("get metadata")
+    # print("check what the filename should be")
+    # print("Check if the file exists")
+    # print("check if the file contains all the data")
+    # print("if not, pull new data, and add to the file if it exists")
+
+    # print("NOW get the RRS")
+    #
+    # if controls["review_spud_xmls_01"]:
+    #     results_df = review_spud_tfs(xml_source=SPUD_XML_COLUMN)
+    # else:
+    #     results_df = pd.read_csv(SPUD_XML_REVIEW_CSV_01)
+    # results_df = results_df[results_df.error==False]
+    # results_df.reset_index(drop=True, inplace=True)
+    # results_df2 = get_mth5_data(results_df)
+    # grouper = results_df2.groupby("survey")
+    # # get data
+    #
+    # print("OK")
+    # for i_row, row in SPUD_DF.iterrows():
+    #     xml_path = pathlib.Path(row[SPUD_XML_COLUMN])
+    #     spud_tf = load_xml_tf(xml_path)
+    #     print(row[SPUD_XML_COLUMN])
+    #     pass
+
+
+def main():
+    review_spud_tfs()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/aurora/test_utils/earthscope/02_test_download_from_earthscope.py b/aurora/test_utils/earthscope/04_test_download_from_earthscope.py
similarity index 97%
rename from aurora/test_utils/earthscope/02_test_download_from_earthscope.py
rename to aurora/test_utils/earthscope/04_test_download_from_earthscope.py
index cab8f856..1db4a05f 100644
--- a/aurora/test_utils/earthscope/02_test_download_from_earthscope.py
+++ b/aurora/test_utils/earthscope/04_test_download_from_earthscope.py
@@ -39,6 +39,9 @@
 from matplotlib import pyplot as plt
 from pathlib import Path
 
+from aurora.sandbox.mth5_helpers import get_experiment_from_obspy_inventory
+from aurora.sandbox.mth5_helpers import mth5_from_experiment
+
 from aurora.test_utils.earthscope.helpers import build_request_df
 from aurora.test_utils.earthscope.helpers import DATA_PATH
 from aurora.test_utils.earthscope.helpers import SPUD_DATA_PATH
@@ -68,8 +71,6 @@ def initialize_coverage_df():
 
 
-#
-
 data_coverage_csv_name = "local_data_coverage.csv"
 data_coverage_csv_path = DATA_PATH.joinpath(data_coverage_csv_name)
 
diff --git a/aurora/test_utils/earthscope/helpers.py b/aurora/test_utils/earthscope/helpers.py
index d5b1f559..917a2d9d 100644
--- a/aurora/test_utils/earthscope/helpers.py
+++ b/aurora/test_utils/earthscope/helpers.py
@@ -1,3 +1,11 @@
+"""
+CACHE_PATH: where all downloads land and where summary csvs are kept
+DATA_AVAILABILITY_PATH: where information about data availability is staged;
+    these are txt files generated by Laura's ipynb
+DATA_PATH: where the mth5 files are archived locally
+
+SPUD_XML_PATH
+"""
 import datetime
 import pathlib
 
@@ -7,6 +15,7 @@
 CACHE_PATH = HOME.joinpath(".cache").joinpath("earthscope")
 CACHE_PATH.mkdir(parents=True, exist_ok=True)
 
+# Data Availability
 DATA_AVAILABILITY_PATH = CACHE_PATH.joinpath("data_availability")
 DATA_AVAILABILITY_PATH.mkdir(parents=True, exist_ok=True)
 PUBLIC_DATA_AVAILABILITY_PATH = DATA_AVAILABILITY_PATH.joinpath("public")
@@ -15,9 +24,18 @@
 RESTRICTED_DATA_AVAILABILITY_PATH.mkdir(parents=True, exist_ok=True)
 DATA_AVAILABILITY_CSV = DATA_AVAILABILITY_PATH.joinpath("MT_acquisitions.csv")
 
+# Data (mth5s)
 DATA_PATH = CACHE_PATH.joinpath("data")
 DATA_PATH.mkdir(parents=True, exist_ok=True)
 
+# Metadata (dataless mth5s)
+EXPERIMENT_PATH = CACHE_PATH.joinpath("experiments")
+EXPERIMENT_PATH.mkdir(parents=True, exist_ok=True)
+
+# Summary tables
+SUMMARY_TABLES_PATH = CACHE_PATH.joinpath("summary_tables")
+SUMMARY_TABLES_PATH.mkdir(parents=True, exist_ok=True)
+
 SPUD_XML_PATH = CACHE_PATH.joinpath("spud_xml")
 SPUD_XML_CSV = SPUD_XML_PATH.joinpath("spud_summary.csv")
 SPUD_EMTF_PATH = SPUD_XML_PATH.joinpath("emtf")
@@ -62,6 +80,23 @@ def get_rr_type(tf_obj):
     rr_type = rr_info_instance["remote_ref.type"]
     return rr_type
 
+
+def get_remotes_from_tf_2(tf_obj):
+    """
+    A second way to get remotes, reading the station_metadata attributes.
+
+    :param tf_obj: mt_metadata.transfer_functions.core.TF
+    :return: list of remote station ids
+    """
+    attr_name = "transfer_function.remote_references"
+    remote_references = tf_obj.station_metadata.get_attr_from_name(attr_name)
+    remotes = list()
+    for remote_station in remote_references:
+        # keep bare station ids; skip compound entries such as "NET-STA"
+        if len(remote_station.split('-')) == 1:
+            # if remote_station != station:
+            remotes.append(remote_station)
+    return remotes
+
 def get_remotes_from_tf(tf_obj):
     """
     There were 5 cases of RemoteRef type encountered when reviewing SPUD TFs
@@ -83,21 +118,11 @@ def get_remotes_from_tf(tf_obj):
     try:
         remotes = [rr_info_instance["remote_info.site.id"], ]
     except KeyError:
-        print(" No remote listed in xml")
+        print(" No remote listed in xml at expected location")
         # here is an example: https://ds.iris.edu/spudservice/data/14862696
         return []
-        #return remotes
     else:
-        print("hmm")
-        attr_name = "transfer_function.remote_references"
-        remote_references = tf_obj.station_metadata.get_attr_from_name(attr_name)
-        remotes = list()
-        for remote_station in remote_references:
-            if not len(remote_station.split('-')) > 1:
-                #if remote_station != station:
-                remotes.append(remote_station)
-        print(remote_references)
-        print(remotes)
+        remotes = get_remotes_from_tf_2(tf_obj)
     return remotes
 
@@ -108,6 +133,7 @@ def build_request_df(station_ids, network_id, start=None, end=None):
         start = '1970-01-01 00:00:00'
     if end is None:
         end = datetime.datetime.now()
+        end = end.replace(hour=0, minute=0, second=0, microsecond=0)
     print(station_ids)
     request_list = [[network_id, station_ids.pop(0), '', '*', start, end]]
 
@@ -127,3 +153,27 @@ def build_request_df(station_ids, network_id, start=None, end=None):
     return request_df
 
+
+def get_summary_table_filename(stage_number):
+    """
+    Build a timestamped csv path under SUMMARY_TABLES_PATH for the given stage.
+    """
+    stage_number_str = str(stage_number).zfill(2)
+    now_str = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
+    csv_name = f"{stage_number_str}_spud_xml_review_{now_str}.csv"
+    csv_path = SUMMARY_TABLES_PATH.joinpath(csv_name)
+    return csv_path
+
+
+def get_most_recent_review(stage_number):
+    """
+    Each stage of task 1 produces a timestamped summary table, which is used as
+    input for the next stage of the process.  Normally we want the most recent
+    one, and we don't want to be pasting filenames all over the place; this
+    returns the path to the most recent summary table for the given stage.
+
+    :param stage_number: int, stage whose summary table is wanted
+    :return: pathlib.Path to the most recent summary table
+    """
+    stage_number_str = str(stage_number).zfill(2)
+    # glob order is arbitrary; sorting puts the newest timestamped name last
+    globby = sorted(SUMMARY_TABLES_PATH.glob(f"{stage_number_str}*"))
+    return globby[-1]
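
The two helpers above are meant to be chained from stage to stage: each stage writes its results under a fresh timestamped name from get_summary_table_filename, and the next stage picks that file up via get_most_recent_review. The following minimal sketch (not part of the patch; run_stage and the stage number are illustrative, and the per-stage processing is elided) shows the intended pattern:

import pandas as pd

from aurora.test_utils.earthscope.helpers import (
    get_most_recent_review,
    get_summary_table_filename,
)

STAGE_ID = 2  # hypothetical stage number, for illustration only


def run_stage(stage_id=STAGE_ID):
    # read the newest summary table written by the previous stage
    source_csv = get_most_recent_review(stage_id - 1)
    df = pd.read_csv(source_csv)
    # ... per-stage processing would go here ...
    # write this stage's results under a fresh timestamped filename
    results_csv = get_summary_table_filename(stage_id)
    df.to_csv(results_csv, index=False)
    return results_csv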