Towards Task 1
- modify stage 01 to use get_summary_table_filename
- modify stage 01 to add support for remotes_2
- stage 3 is in dev - not working yet
- stage 4 in dev
- add EXPERIMENT_PATH as a place to store inventory/metadata (dataless h5s)
- factor get_remotes_from_tf_2 out of get_remotes_from_tf
- add support for summary_table filename make/load (see the sketch below)

issue #252
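
The summary-table helpers mentioned in the commit message (get_summary_table_filename and get_most_recent_review, both added to helpers.py in this commit) are meant to chain the stages: each stage writes a timestamped csv keyed by its stage number, and the next stage picks up the newest one with a glob. A minimal sketch of that pattern; the driver lines below are illustrative, not part of the commit:

import pandas as pd

from aurora.test_utils.earthscope.helpers import get_summary_table_filename
from aurora.test_utils.earthscope.helpers import get_most_recent_review

STAGE_ID = 1
# stage 1 builds a fresh, timestamped output path for its summary table
results_csv = get_summary_table_filename(STAGE_ID)
# ... run the stage and write its summary dataframe to results_csv ...

# a later stage then loads whatever stage 1 wrote most recently
source_csv = get_most_recent_review(1)
source_df = pd.read_csv(source_csv)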
kkappler committed Jun 3, 2023
1 parent dcb517a commit 9b7aba0
Showing 4 changed files with 262 additions and 23 deletions.
23 changes: 14 additions & 9 deletions aurora/test_utils/earthscope/01_test_load_spud_tfs.py
@@ -15,27 +15,28 @@

from aurora.test_utils.earthscope.helpers import SPUD_XML_CSV
from aurora.test_utils.earthscope.helpers import SPUD_XML_PATH
from aurora.test_utils.earthscope.helpers import SUMMARY_TABLES_PATH
from aurora.test_utils.earthscope.helpers import DATA_PATH
from aurora.test_utils.earthscope.helpers import load_xml_tf
from aurora.test_utils.earthscope.helpers import get_remotes_from_tf
from aurora.test_utils.earthscope.helpers import get_remotes_from_tf_2
from aurora.test_utils.earthscope.helpers import get_rr_type
from aurora.test_utils.earthscope.helpers import get_summary_table_filename

SPUD_DF = pd.read_csv(SPUD_XML_CSV)
now = datetime.datetime.now().__str__().split(".")[0].replace(" ","_")
now_str = now.replace(":","")
SPUD_XML_REVIEW_CSV_NAME = f"spud_xml_review_{now_str}.csv"
SPUD_XML_REVIEW_CSV_PATH = SPUD_XML_PATH.joinpath(SPUD_XML_REVIEW_CSV_NAME)

STAGE_ID = 1


def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
results_csv=SPUD_XML_REVIEW_CSV_PATH):
results_csv=""):
"""
:param xml_source_column:"data_xml_path" or "emtf_xml_path"
specifies which of the two possible collections of xml files to use as source
:return:
"""
if not results_csv:
results_csv = get_summary_table_filename(STAGE_ID)

t0 = time.time()
spud_df = pd.read_csv(SPUD_XML_CSV)

@@ -45,6 +46,7 @@ def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
spud_df[f"{xml_source}_error_message"] = ""
spud_df[f"{xml_source}_remote_ref_type"] = ""
spud_df[f"{xml_source}_remotes"] = ""
spud_df[f"{xml_source}_remotes_2"] = ""

for i_row, row in spud_df.iterrows():
# if i_row<750:
@@ -57,6 +59,8 @@ def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
spud_df[f"{xml_source}_remote_ref_type"].iat[i_row] = rr_type
remotes = get_remotes_from_tf(spud_tf)
spud_df[f"{xml_source}_remotes"].iat[i_row] = ",".join(remotes)
remotes2 = get_remotes_from_tf_2(spud_tf)
spud_df[f"{xml_source}_remotes_2"].iat[i_row] = ",".join(remotes)

except Exception as e:
spud_df[f"{xml_source}_error"].at[i_row] = True
@@ -79,8 +83,9 @@ def main():
# review_csv_name = "spud_xml_review_2023-05-28_13:21:18.csv"
# review_csv_path = SPUD_XML_PATH.joinpath(review_csv_name)
# df = pd.read_csv(review_csv)

results_df = pd.read_csv(SPUD_XML_REVIEW_CSV_PATH)
print("OK")
# review_csv_name = "spud_xml_review_2023-05-28_13:21:18.csv"
# results_df = pd.read_csv(review_csv_name)
print("summarize")


183 changes: 183 additions & 0 deletions aurora/test_utils/earthscope/03_test_station_inventory_valid.py
@@ -0,0 +1,183 @@
"""
Flow
Use stage 1 output csv
We will iterate over rows of the CSV, selecting only rows where the name is of the form:
18057859_EM_MH010.xml
uid_NETWORK_STATION.xml
For each such row, we make a list of stations that were identified
as self or RR
For every station in list:
get metadata
show number of channels
any other pertinent information
"""



import numpy as np
import pandas as pd
import pathlib
import time

from matplotlib import pyplot as plt
from pathlib import Path

from aurora.sandbox.mth5_helpers import get_experiment_from_obspy_inventory
from aurora.sandbox.mth5_helpers import mth5_from_experiment

from aurora.test_utils.earthscope.helpers import build_request_df
from aurora.test_utils.earthscope.helpers import DATA_PATH
from aurora.test_utils.earthscope.helpers import EXPERIMENT_PATH
from aurora.test_utils.earthscope.helpers import get_most_recent_review
from aurora.test_utils.earthscope.helpers import get_remotes_from_tf
from aurora.test_utils.earthscope.helpers import get_rr_type
from aurora.test_utils.earthscope.helpers import load_xml_tf
from mth5.mth5 import MTH5
from mth5.clients import FDSN, MakeMTH5
from mt_metadata.transfer_functions.core import TF
from mt_metadata import TF_XML






def metadata_check(request_df):
fdsn_object = FDSN(mth5_version='0.2.0')
fdsn_object.client = "IRIS"
inv = fdsn_object.get_inventory_from_df(request_df, data=False)
experiment = get_experiment_from_obspy_inventory(inv[0])
mth5 = mth5_from_experiment(experiment, 'qq.h5')
mth5.channel_summary.summarize()
channel_summary_df = mth5.channel_summary.to_dataframe()
return channel_summary_df
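
# Example usage (illustrative values taken from the filename example in the module
# docstring; assumes the station metadata is available from the IRIS FDSN service):
#   request_df = build_request_df(["MH010"], "EM", start=None, end=None)
#   channel_summary_df = metadata_check(request_df)
#   print(channel_summary_df)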

#


data_coverage_csv_name = "local_data_coverage.csv"
data_coverage_csv_path = DATA_PATH.joinpath(data_coverage_csv_name)
GET_REMOTES_FROM = "spud_xml_review" # tf_xml



STAGE_ID = 2
def initialize_metadata_df():
local_metadata_coverage_df = pd.DataFrame(columns=["station_id", "network_id", "filename", "filesize"])
return local_metadata_coverage_df



def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
source_csv=None,
results_csv=None):
"""
:param xml_source_column:"data_xml_path" or "emtf_xml_path"
specifies which of the two possible collections of xml files to use as source
:return:
"""
t0 = time.time()
if not source_csv:
source_csv = get_most_recent_review(1)
source_df = pd.read_csv(source_csv)

local_data_coverage_df = pd.read_csv(data_coverage_csv_path)

xml_source = "data_xml_path"
spud_df = source_df

for i_row, row in spud_df.iterrows():
if row[f"{xml_source}_error"] is True:
print(f"Skipping {row} for now, tf not reading in")
continue

xml_path = pathlib.Path(row[xml_source])
if "__" in xml_path.name:
print(f"Skipping {row[xml_source]} for now, Station/network unknown")
continue


[xml_uid, network_id, station_id] = xml_path.stem.split("_")
# extract remotes
if GET_REMOTES_FROM == "tf_xml":
tf = load_xml_tf(xml_path)
rr_type = get_rr_type(tf)
remotes = get_remotes_from_tf(tf)
elif GET_REMOTES_FROM == "spud_xml_review":
remotes = row.data_xml_path_remotes.split(",")
if remotes:
print(f"remotes: {remotes} ")
all_stations = remotes + [station_id,]

for station in all_stations:
request_df = build_request_df([station,], network_id, start=None, end=None)
print(request_df)
fdsn_object = FDSN(mth5_version='0.2.0')
fdsn_object.client = "IRIS"

expected_file_name = EXPERIMENT_PATH.joinpath(fdsn_object.make_filename(request_df))
sub_coverage_df = local_data_coverage_df[local_data_coverage_df["filename"] == str(expected_file_name)]
if len(sub_coverage_df):
print(f"Already have data for {station}-{network_id}")
print(f"{sub_coverage_df}")
continue
try:
mth5_filename = fdsn_object.make_mth5_from_fdsn_client(request_df,
interact=False,
path=DATA_PATH)
new_row = {"station_id": station,
"network_id": network_id,
"filename": mth5_filename,
"filesize": mth5_filename.stat().st_size,
"exception":"",
"error_message":""}
local_data_coverage_df = local_data_coverage_df.append(new_row, ignore_index=True)
local_data_coverage_df.to_csv(data_coverage_csv_path, index=False)
except Exception as e:
print("")
new_row = {"station_id":station,
"network_id":network_id,
"filename":"", #expected_file_name
"filesize": "", #expected_file_name.stat().st_size,
"exception":e.__class__.__name__,
"error_message":e.args[0]}
local_data_coverage_df = local_data_coverage_df.append(new_row, ignore_index=True)
local_data_coverage_df.to_csv(data_coverage_csv_path, index=False)
# ADD A ROW TO A DF AS WE GO
#
#
# print("get metadata")
# print("check what the filename should be")
# print("Check if the file exists")
# print("check if the file contains all the data")
# print("if not, pull new data, and add to the file if it exists")

#
# print("NOW get the RRS")
#
# if controls["review_spud_xmls_01"]:
# results_df = review_spud_tfs(xml_source=SPUD_XML_COLUMN)
# else:
# results_df = pd.read_csv(SPUD_XML_REVIEW_CSV_01)
# results_df = results_df[results_df.error==False]
# results_df.reset_index(drop=True, inplace=True)
# results_df2 = get_mth5_data(results_df)
# grouper = results_df2.groupby("survey")
# # get data
#
# print("OK")
# for i_row, row in SPUD_DF.iterrows():
# xml_path = pathlib.Path(row[SPUD_XML_COLUMN])
# spud_tf = load_xml_tf(xml_path)
# print(row[SPUD_XML_COLUMN])
# pass

def main():
review_spud_tfs()

if __name__ == "__main__":
main()
(third changed file; filename not preserved in this view)
Expand Up @@ -39,6 +39,9 @@
from matplotlib import pyplot as plt
from pathlib import Path

from aurora.sandbox.mth5_helpers import get_experiment_from_obspy_inventory
from aurora.sandbox.mth5_helpers import mth5_from_experiment

from aurora.test_utils.earthscope.helpers import build_request_df
from aurora.test_utils.earthscope.helpers import DATA_PATH
from aurora.test_utils.earthscope.helpers import SPUD_DATA_PATH
@@ -68,8 +71,6 @@ def initialize_coverage_df():



#


data_coverage_csv_name = "local_data_coverage.csv"
data_coverage_csv_path = DATA_PATH.joinpath(data_coverage_csv_name)
74 changes: 62 additions & 12 deletions aurora/test_utils/earthscope/helpers.py
@@ -1,3 +1,11 @@
"""
CACHE_PATH: This is a place where all the downloads will land, and summaray csvs will be kept
DATA_AVAILABILITY_PATH: This is a place where information about data availability will be staged
These are txt files generated by Laura's ipynb
DATA_PATH: This is where the mth5 files are archived locally
SPUD_XML_PATH
"""
import datetime
import pathlib

@@ -7,6 +15,7 @@
CACHE_PATH = HOME.joinpath(".cache").joinpath("earthscope")
CACHE_PATH.mkdir(parents=True, exist_ok=True)

# Data Availability
DATA_AVAILABILITY_PATH = CACHE_PATH.joinpath("data_availability")
DATA_AVAILABILITY_PATH.mkdir(parents=True, exist_ok=True)
PUBLIC_DATA_AVAILABILITY_PATH = DATA_AVAILABILITY_PATH.joinpath("public")
@@ -15,9 +24,18 @@
RESTRICTED_DATA_AVAILABILITY_PATH.mkdir(parents=True, exist_ok=True)
DATA_AVAILABILITY_CSV = DATA_AVAILABILITY_PATH.joinpath("MT_acquisitions.csv")

# Data (mth5s)
DATA_PATH = CACHE_PATH.joinpath("data")
DATA_PATH.mkdir(parents=True, exist_ok=True)

# MetaData (mth5s)
EXPERIMENT_PATH = CACHE_PATH.joinpath("experiments")
EXPERIMENT_PATH.mkdir(parents=True, exist_ok=True)
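# per the commit notes, this is where dataless (metadata-only) h5 files built from
# station inventories are kept, separate from the full time-series archives in DATA_PATH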

# Summary tables
SUMMARY_TABLES_PATH = CACHE_PATH.joinpath("summary_tables")
SUMMARY_TABLES_PATH.mkdir(parents=True, exist_ok=True)

SPUD_XML_PATH = CACHE_PATH.joinpath("spud_xml")
SPUD_XML_CSV = SPUD_XML_PATH.joinpath("spud_summary.csv")
SPUD_EMTF_PATH = SPUD_XML_PATH.joinpath("emtf")
@@ -62,6 +80,23 @@ def get_rr_type(tf_obj):
rr_type = rr_info_instance["remote_ref.type"]
return rr_type


def get_remotes_from_tf_2(tf_obj):
"""
A second way to get remotes
:param tf_obj:
:return:
"""
attr_name = "transfer_function.remote_references"
remote_references = tf_obj.station_metadata.get_attr_from_name(attr_name)
remotes = list()
for remote_station in remote_references:
if not len(remote_station.split('-')) > 1:
# if remote_station != station:
remotes.append(remote_station)
print(remote_references)
return remotes

def get_remotes_from_tf(tf_obj):
"""
There were 5 cases of RemoteRef type encountered when reviewing SPUD TFs
@@ -83,21 +118,11 @@ def get_remotes_from_tf(tf_obj):
try:
remotes = [rr_info_instance["remote_info.site.id"], ]
except KeyError:
print(" No remote listed in xml")
print(" No remote listed in xml at expected location")
# here is an example: https://ds.iris.edu/spudservice/data/14862696
return []
#return remotes
else:
print("hmm")
attr_name = "transfer_function.remote_references"
remote_references = tf_obj.station_metadata.get_attr_from_name(attr_name)
remotes = list()
for remote_station in remote_references:
if not len(remote_station.split('-')) > 1:
#if remote_station != station:
remotes.append(remote_station)
print(remote_references)
print(remotes)
remotes = get_remotes_from_tf_2(tf_obj)
return remotes

def build_request_df(station_ids, network_id, start=None, end=None):
@@ -108,6 +133,7 @@ def build_request_df(station_ids, network_id, start=None, end=None):
start = '1970-01-01 00:00:00'
if end is None:
end = datetime.datetime.now()
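# truncate "now" to midnight so the request end time is date-aligned (assumed intent;
# this keeps requests built on the same day identical)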
end = end.replace(hour=0, minute=0, second=0, microsecond=0)

print(station_ids)
request_list = [[network_id, station_ids.pop(0), '', '*', start, end]]
@@ -127,3 +153,27 @@
return request_df


def get_summary_table_filename(stage_number):
stage_number_str = str(stage_number).zfill(2)
now = datetime.datetime.now().__str__().split(".")[0].replace(" ", "_")
now_str = now.replace(":", "")
csv_name = f"{stage_number_str}_spud_xml_review_{now_str}.csv"
csv_path = SUMMARY_TABLES_PATH.joinpath(csv_name)
return csv_path
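
# Example of the path produced above (timestamp is whatever datetime.now() returns at call time):
#   get_summary_table_filename(1) -> SUMMARY_TABLES_PATH / "01_spud_xml_review_2023-06-03_103000.csv"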


def get_most_recent_review(stage_number):
"""
For each stage of task 1, there is a summary table produced, and that summary table is used
as input for the next stage of the process. These tables are timestamped.
Normally we want the most recent one, and we don't want to be pasting filenames all over the place
This returns the path to the most recent table.
:param stage_number:
:return:
"""
stage_number_str = str(stage_number).zfill(2)
globby = sorted(SUMMARY_TABLES_PATH.glob(f"{stage_number_str}*"))
# the timestamp embedded in each filename sorts lexicographically, so the last entry is the newest
return globby[-1]
