Towards Task 1
- modify stage 01 to use get_summary_table_filename
- modify stage 01 to add support for remotes_2
- stage 3 is in dev - not working yet
- stage 4 in dev
- add EXPERIMENT_PATH as a place to store inventory/metadata (dataless h5s)
- factor get_remotes_from_tf_2 out of get_remotes_from_tf
- add support for summary_table filename make/load (see the sketch below)

issue #252
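
The summary-table helpers mentioned in the commit message (get_summary_table_filename and get_most_recent_review, both added to helpers.py in this commit) are meant to chain the stages: each stage writes a timestamped csv keyed by its stage number, and the next stage picks up the newest one with a glob. A minimal sketch of that pattern; the driver lines below are illustrative, not part of the commit:

import pandas as pd

from aurora.test_utils.earthscope.helpers import get_summary_table_filename
from aurora.test_utils.earthscope.helpers import get_most_recent_review

STAGE_ID = 1
# stage 1 builds a fresh, timestamped output path for its summary table
results_csv = get_summary_table_filename(STAGE_ID)
# ... run the stage and write its summary dataframe to results_csv ...

# a later stage then loads whatever stage 1 wrote most recently
source_csv = get_most_recent_review(1)
source_df = pd.read_csv(source_csv)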
kkappler committed Jun 3, 2023
1 parent dcb517a commit 9b7aba0
Showing 4 changed files with 262 additions and 23 deletions.
23 changes: 14 additions & 9 deletions aurora/test_utils/earthscope/01_test_load_spud_tfs.py
@@ -15,27 +15,28 @@

from aurora.test_utils.earthscope.helpers import SPUD_XML_CSV
from aurora.test_utils.earthscope.helpers import SPUD_XML_PATH
from aurora.test_utils.earthscope.helpers import SUMMARY_TABLES_PATH
from aurora.test_utils.earthscope.helpers import DATA_PATH
from aurora.test_utils.earthscope.helpers import load_xml_tf
from aurora.test_utils.earthscope.helpers import get_remotes_from_tf
from aurora.test_utils.earthscope.helpers import get_remotes_from_tf_2
from aurora.test_utils.earthscope.helpers import get_rr_type
from aurora.test_utils.earthscope.helpers import get_summary_table_filename

SPUD_DF = pd.read_csv(SPUD_XML_CSV)
now = datetime.datetime.now().__str__().split(".")[0].replace(" ","_")
now_str = now.replace(":","")
SPUD_XML_REVIEW_CSV_NAME = f"spud_xml_review_{now_str}.csv"
SPUD_XML_REVIEW_CSV_PATH = SPUD_XML_PATH.joinpath(SPUD_XML_REVIEW_CSV_NAME)

STAGE_ID = 1


def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
results_csv=SPUD_XML_REVIEW_CSV_PATH):
results_csv=""):
"""
:param xml_source_column:"data_xml_path" or "emtf_xml_path"
specifies which of the two possible collections of xml files to use as source
:return:
"""
if not results_csv:
results_csv = get_summary_table_filename(STAGE_ID)

t0 = time.time()
spud_df = pd.read_csv(SPUD_XML_CSV)

@@ -45,6 +46,7 @@ def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
spud_df[f"{xml_source}_error_message"] = ""
spud_df[f"{xml_source}_remote_ref_type"] = ""
spud_df[f"{xml_source}_remotes"] = ""
spud_df[f"{xml_source}_remotes_2"] = ""

for i_row, row in spud_df.iterrows():
# if i_row<750:
@@ -57,6 +59,8 @@ def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
spud_df[f"{xml_source}_remote_ref_type"].iat[i_row] = rr_type
remotes = get_remotes_from_tf(spud_tf)
spud_df[f"{xml_source}_remotes"].iat[i_row] = ",".join(remotes)
remotes2 = get_remotes_from_tf_2(spud_tf)
spud_df[f"{xml_source}_remotes_2"].iat[i_row] = ",".join(remotes)

except Exception as e:
spud_df[f"{xml_source}_error"].at[i_row] = True
@@ -79,8 +83,9 @@ def main():
# review_csv_name = "spud_xml_review_2023-05-28_13:21:18.csv"
# review_csv_path = SPUD_XML_PATH.joinpath(review_csv_name)
# df = pd.read_csv(review_csv)

results_df = pd.read_csv(SPUD_XML_REVIEW_CSV_PATH)
print("OK")
# review_csv_name = "spud_xml_review_2023-05-28_13:21:18.csv"
# results_df = pd.read_csv(review_csv_name)
print("summarize")


183 changes: 183 additions & 0 deletions aurora/test_utils/earthscope/03_test_station_inventory_valid.py
@@ -0,0 +1,183 @@
"""
Flow
Use stage 1 output csv
We will iterate over rows of the CSV, selecting only rows where the name is of the form:
18057859_EM_MH010.xml
uid_NETWORK_STATION.xml
For each such row, we make a list of stations that were identified
as self or RR
For every station in list:
get metadata
show number of channels
any other pertinent information
"""



import numpy as np
import pandas as pd
import pathlib
import time

from matplotlib import pyplot as plt
from pathlib import Path

from aurora.sandbox.mth5_helpers import get_experiment_from_obspy_inventory
from aurora.sandbox.mth5_helpers import mth5_from_experiment

from aurora.test_utils.earthscope.helpers import build_request_df
from aurora.test_utils.earthscope.helpers import DATA_PATH
from aurora.test_utils.earthscope.helpers import EXPERIMENT_PATH
from aurora.test_utils.earthscope.helpers import get_most_recent_review
from aurora.test_utils.earthscope.helpers import get_remotes_from_tf
from aurora.test_utils.earthscope.helpers import get_rr_type
from aurora.test_utils.earthscope.helpers import load_xml_tf
from mth5.mth5 import MTH5
from mth5.clients import FDSN, MakeMTH5
from mt_metadata.transfer_functions.core import TF
from mt_metadata import TF_XML






def metadata_check(request_df):
fdsn_object = FDSN(mth5_version='0.2.0')
fdsn_object.client = "IRIS"
inv = fdsn_object.get_inventory_from_df(request_df, data=False)
experiment = get_experiment_from_obspy_inventory(inv[0])
mth5 = mth5_from_experiment(experiment, 'qq.h5')
mth5.channel_summary.summarize()
channel_summary_df = mth5.channel_summary.to_dataframe()
return channel_summary_df
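
# Example usage (illustrative values taken from the filename example in the module
# docstring; assumes the station metadata is available from the IRIS FDSN service):
#   request_df = build_request_df(["MH010"], "EM", start=None, end=None)
#   channel_summary_df = metadata_check(request_df)
#   print(channel_summary_df)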

#


data_coverage_csv_name = "local_data_coverage.csv"
data_coverage_csv_path = DATA_PATH.joinpath(data_coverage_csv_name)
GET_REMOTES_FROM = "spud_xml_review" # tf_xml



STAGE_ID = 2
def initialize_metadata_df():
local_metadata_coverage_df = pd.DataFrame(columns=["station_id", "network_id", "filename", "filesize"])
return local_metadata_coverage_df



def review_spud_tfs(xml_sources=["emtf_xml_path", "data_xml_path"],
source_csv=None,
results_csv=None):
"""
:param xml_source_column:"data_xml_path" or "emtf_xml_path"
specifies which of the two possible collections of xml files to use as source
:return:
"""
t0 = time.time()
if not source_csv:
source_csv = get_most_recent_review(1)
source_df = pd.read_csv(source_csv)

local_data_coverage_df = pd.read_csv(data_coverage_csv_path)

xml_source = "data_xml_path"
spud_df = source_df

for i_row, row in spud_df.iterrows():
if row[f"{xml_source}_error"] is True:
print(f"Skipping {row} for now, tf not reading in")
continue

xml_path = pathlib.Path(row[xml_source])
if "__" in xml_path.name:
print(f"Skipping {row[xml_source]} for now, Station/network unknown")
continue


[xml_uid, network_id, station_id] = xml_path.stem.split("_")
# extract remotes
if GET_REMOTES_FROM == "tf_xml":
tf = load_xml_tf(xml_path)
rr_type = get_rr_type(tf)
remotes = get_remotes_from_tf(tf)
elif GET_REMOTES_FROM == "spud_xml_review":
remotes = row.data_xml_path_remotes.split(",")
if remotes:
print(f"remotes: {remotes} ")
all_stations = remotes + [station_id,]

for station in all_stations:
request_df = build_request_df([station,], network_id, start=None, end=None)
print(request_df)
fdsn_object = FDSN(mth5_version='0.2.0')
fdsn_object.client = "IRIS"

expected_file_name = EXPERIMENT_PATH.joinpath(fdsn_object.make_filename(request_df))
sub_coverage_df = local_data_coverage_df[local_data_coverage_df["filename"] == str(expected_file_name)]
if len(sub_coverage_df):
print(f"Already have data for {station}-{network_id}")
print(f"{sub_coverage_df}")
continue
try:
mth5_filename = fdsn_object.make_mth5_from_fdsn_client(request_df,
interact=False,
path=DATA_PATH)
new_row = {"station_id": station,
"network_id": network_id,
"filename": mth5_filename,
"filesize": mth5_filename.stat().st_size,
"exception":"",
"error_message":""}
local_data_coverage_df = local_data_coverage_df.append(new_row, ignore_index=True)
local_data_coverage_df.to_csv(data_coverage_csv_path, index=False)
except Exception as e:
print("")
new_row = {"station_id":station,
"network_id":network_id,
"filename":"", #expected_file_name
"filesize": "", #expected_file_name.stat().st_size,
"exception":e.__class__.__name__,
"error_message":e.args[0]}
local_data_coverage_df = local_data_coverage_df.append(new_row, ignore_index=True)
local_data_coverage_df.to_csv(data_coverage_csv_path, index=False)
# ADD A ROW TO A DF AS WE GO
#
#
# print("get metadata")
# print("check what the filename should be")
# print("Check if the file exists")
# print("check if the file contains all the data")
# print("if not, pull new data, and add to the file if it exists")

#
# print("NOW get the RRS")
#
# if controls["review_spud_xmls_01"]:
# results_df = review_spud_tfs(xml_source=SPUD_XML_COLUMN)
# else:
# results_df = pd.read_csv(SPUD_XML_REVIEW_CSV_01)
# results_df = results_df[results_df.error==False]
# results_df.reset_index(drop=True, inplace=True)
# results_df2 = get_mth5_data(results_df)
# grouper = results_df2.groupby("survey")
# # get data
#
# print("OK")
# for i_row, row in SPUD_DF.iterrows():
# xml_path = pathlib.Path(row[SPUD_XML_COLUMN])
# spud_tf = load_xml_tf(xml_path)
# print(row[SPUD_XML_COLUMN])
# pass

def main():
review_spud_tfs()

if __name__ == "__main__":
main()
(third changed file; filename not preserved in this view)
Expand Up @@ -39,6 +39,9 @@
from matplotlib import pyplot as plt
from pathlib import Path

from aurora.sandbox.mth5_helpers import get_experiment_from_obspy_inventory
from aurora.sandbox.mth5_helpers import mth5_from_experiment

from aurora.test_utils.earthscope.helpers import build_request_df
from aurora.test_utils.earthscope.helpers import DATA_PATH
from aurora.test_utils.earthscope.helpers import SPUD_DATA_PATH
@@ -68,8 +71,6 @@ def initialize_coverage_df():



#


data_coverage_csv_name = "local_data_coverage.csv"
data_coverage_csv_path = DATA_PATH.joinpath(data_coverage_csv_name)
74 changes: 62 additions & 12 deletions aurora/test_utils/earthscope/helpers.py
@@ -1,3 +1,11 @@
"""
CACHE_PATH: This is a place where all the downloads will land, and summaray csvs will be kept
DATA_AVAILABILITY_PATH: This is a place where information about data availability will be staged
These are txt files generated by Laura's ipynb
DATA_PATH: This is where the mth5 files are archived locally
SPUD_XML_PATH
"""
import datetime
import pathlib

@@ -7,6 +15,7 @@
CACHE_PATH = HOME.joinpath(".cache").joinpath("earthscope")
CACHE_PATH.mkdir(parents=True, exist_ok=True)

# Data Availability
DATA_AVAILABILITY_PATH = CACHE_PATH.joinpath("data_availability")
DATA_AVAILABILITY_PATH.mkdir(parents=True, exist_ok=True)
PUBLIC_DATA_AVAILABILITY_PATH = DATA_AVAILABILITY_PATH.joinpath("public")
@@ -15,9 +24,18 @@
RESTRICTED_DATA_AVAILABILITY_PATH.mkdir(parents=True, exist_ok=True)
DATA_AVAILABILITY_CSV = DATA_AVAILABILITY_PATH.joinpath("MT_acquisitions.csv")

# Data (mth5s)
DATA_PATH = CACHE_PATH.joinpath("data")
DATA_PATH.mkdir(parents=True, exist_ok=True)

# MetaData (mth5s)
EXPERIMENT_PATH = CACHE_PATH.joinpath("experiments")
EXPERIMENT_PATH.mkdir(parents=True, exist_ok=True)
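# per the commit notes, this is where dataless (metadata-only) h5 files built from
# station inventories are kept, separate from the full time-series archives in DATA_PATH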

# Summary tables
SUMMARY_TABLES_PATH = CACHE_PATH.joinpath("summary_tables")
SUMMARY_TABLES_PATH.mkdir(parents=True, exist_ok=True)

SPUD_XML_PATH = CACHE_PATH.joinpath("spud_xml")
SPUD_XML_CSV = SPUD_XML_PATH.joinpath("spud_summary.csv")
SPUD_EMTF_PATH = SPUD_XML_PATH.joinpath("emtf")
@@ -62,6 +80,23 @@ def get_rr_type(tf_obj):
rr_type = rr_info_instance["remote_ref.type"]
return rr_type


def get_remotes_from_tf_2(tf_obj):
"""
A second way to get remotes
:param tf_obj:
:return:
"""
attr_name = "transfer_function.remote_references"
remote_references = tf_obj.station_metadata.get_attr_from_name(attr_name)
remotes = list()
for remote_station in remote_references:
if not len(remote_station.split('-')) > 1:
# if remote_station != station:
remotes.append(remote_station)
print(remote_references)
return remotes

def get_remotes_from_tf(tf_obj):
"""
There were 5 cases of RemoteRef type encountered when reviewing SPUD TFs
@@ -83,21 +118,11 @@ def get_remotes_from_tf(tf_obj):
try:
remotes = [rr_info_instance["remote_info.site.id"], ]
except KeyError:
print(" No remote listed in xml")
print(" No remote listed in xml at expected location")
# here is an example: https://ds.iris.edu/spudservice/data/14862696
return []
#return remotes
else:
print("hmm")
attr_name = "transfer_function.remote_references"
remote_references = tf_obj.station_metadata.get_attr_from_name(attr_name)
remotes = list()
for remote_station in remote_references:
if not len(remote_station.split('-')) > 1:
#if remote_station != station:
remotes.append(remote_station)
print(remote_references)
print(remotes)
remotes = get_remotes_from_tf_2(tf_obj)
return remotes

def build_request_df(station_ids, network_id, start=None, end=None):
@@ -108,6 +133,7 @@ def build_request_df(station_ids, network_id, start=None, end=None):
start = '1970-01-01 00:00:00'
if end is None:
end = datetime.datetime.now()
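# truncate "now" to midnight so the request end time is date-aligned (assumed intent;
# this keeps requests built on the same day identical)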
end = end.replace(hour=0, minute=0, second=0, microsecond=0)

print(station_ids)
request_list = [[network_id, station_ids.pop(0), '', '*', start, end]]
@@ -127,3 +153,27 @@
return request_df


def get_summary_table_filename(stage_number):
stage_number_str = str(stage_number).zfill(2)
now = datetime.datetime.now().__str__().split(".")[0].replace(" ", "_")
now_str = now.replace(":", "")
csv_name = f"{stage_number_str}_spud_xml_review_{now_str}.csv"
csv_path = SUMMARY_TABLES_PATH.joinpath(csv_name)
return csv_path
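
# Example of the path produced above (timestamp is whatever datetime.now() returns at call time):
#   get_summary_table_filename(1) -> SUMMARY_TABLES_PATH / "01_spud_xml_review_2023-06-03_103000.csv"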


def get_most_recent_review(stage_number):
"""
For each stage of task 1, there is a summary table produced, and that summary table is used
as input for the next stage of the process. These tables are timestamped.
Normally we want the most recent one, and we don't want to be pasting filenames all over the place
This returns the path to the most recent table.
:param stage_number:
:return:
"""
stage_number_str = str(stage_number).zfill(2)
globby = sorted(SUMMARY_TABLES_PATH.glob(f"{stage_number_str}*"))
# the timestamp embedded in each filename sorts lexicographically, so the last entry is the newest
return globby[-1]
