Skip to content

Commit

Permalink
Add configuration file rather than constants scattered in code base
Browse files Browse the repository at this point in the history
  • Loading branch information
franckalbinet committed Jun 24, 2020
1 parent 7f120bb commit b5a8141
Show file tree
Hide file tree
Showing 16 changed files with 6,251 additions and 1,110 deletions.
29 changes: 29 additions & 0 deletions config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Spectrai Configuration

# ~/data_spectrai/
# |__ raw/
# |   |__ astorga_arg_2018/
# |   |   |__ spectra/
# |   |   |__ measurements/2015-xrf-results-mean-and-errors.xls
# |   |__ schmitter_vnm_2010/
# |       |__ mir-models/
# |       |   |__ 20090215-soil-database-mirs.xls
# |       |__ vietnam-petra/
# |__ kssl/
#     |__ normalized/
#     |__ spectra/

[DATA_ASTORGA_ARG]
SPECTRA = "~/data_spectrai/raw/astorga_arg_2018/spectra"
MEASUREMENTS = "~/data_spectrai/raw/astorga_arg_2018/measurements/2015-xrf-results-mean-and-errors.xls"

[DATA_SCHMITTER_VNM]
SPECTRA = "~/data_spectrai/raw/schmitter_vnm_2010/mir-models"
SPECTRA_REP = "~/data_spectrai/raw/schmitter_vnm_2010/vietnam-petra"
MEASUREMENTS = "~/data_spectrai/raw/schmitter_vnm_2010/mir-models/20090215-soil-database-mirs.xls"

[DATA_KSSL]
HOME = "~/data_spectrai/kssl"
NORM = "~/data_spectrai/kssl/normalized"
SPECTRA = "~/data_spectrai/kssl/spectra"
DB_NAME = "All_Spectra_Access_Portable 2-20-20.accdb"
2,685 changes: 2,685 additions & 0 deletions examples/_legacy/.ipynb_checkpoints/kssl-predictions-playgound-checkpoint.ipynb

Large diffs are not rendered by default.

2,823 changes: 2,823 additions & 0 deletions examples/_legacy/kssl-predictions-playgound.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/_legacy/pca.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
"version": "3.7.6"
}
},
"nbformat": 4,
Expand Down

Large diffs are not rendered by default.

843 changes: 312 additions & 531 deletions examples/data_loading/0-importing-data.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

setup(
name='spectrai',
version='0.0.2',
version='0.0.3',
description='Assessing the potential of AI for spectroscopy and MIR one in particular',
author='Franck Albinet',
author_email='franckalbinet@gmail.com',
packages = find_packages(),
packages=find_packages(),
install_requires=['']
)
43 changes: 43 additions & 0 deletions spectrai/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pathlib import Path
import toml


#
# Config
#
def load_config(path=None):
    """Load the spectrai configuration file and return it as a dictionary.

    Args:
        path: Location of a TOML configuration file. When omitted (or
            falsy), ``~/.spectrai_config/config.toml`` is used if that file
            exists. For ``str``/``Path`` values a leading ``~`` is expanded.
            Any other value is handed to ``toml.load`` unchanged.

    Returns:
        dict: The parsed content of the configuration file.

    Raises:
        FileNotFoundError: If no path is given and the default file does not
            exist, or if the given ``str``/``Path`` is not an existing file.
        ValueError: If the file parses to an empty configuration.
    """
    default_path = Path('~/.spectrai_config/config.toml').expanduser()
    if not path and default_path.is_file():
        path = default_path

    # Explicit exceptions rather than ``assert``: assertions are removed when
    # Python runs with -O, which would let an unusable path reach the parser.
    if not path:
        raise FileNotFoundError(
            'Either path argument or ~/.spectrai_config/config.toml is required.')

    if isinstance(path, (str, Path)):
        path = Path(path).expanduser()
        if not path.is_file():
            raise FileNotFoundError(f'Configuration file not found: {path}')

    config = toml.load(path)
    if not config:
        raise ValueError('Unable to parse config file')

    return config


def get_astorga_config():
    """Return the paths configured in the ``DATA_ASTORGA_ARG`` section.

    Returns:
        tuple: ``(SPECTRA, MEASUREMENTS)`` values of the section, each
        passed through ``path_expand``.
    """
    section = path_expand(load_config()['DATA_ASTORGA_ARG'])
    return tuple(section[key] for key in ('SPECTRA', 'MEASUREMENTS'))


def get_schmitter_config():
    """Return the paths configured in the ``DATA_SCHMITTER_VNM`` section.

    Returns:
        tuple: ``(SPECTRA, SPECTRA_REP, MEASUREMENTS)`` values of the
        section, each passed through ``path_expand``.
    """
    section = path_expand(load_config()['DATA_SCHMITTER_VNM'])
    wanted = ('SPECTRA', 'SPECTRA_REP', 'MEASUREMENTS')
    return tuple(section[key] for key in wanted)


def get_kssl_config():
    """Return the settings configured in the ``DATA_KSSL`` section.

    ``DB_NAME`` is excluded from path expansion and returned as stored in
    the configuration file.

    Returns:
        tuple: ``(HOME, NORM, SPECTRA, DB_NAME)`` values of the section.
    """
    section = path_expand(load_config()['DATA_KSSL'], exclude=['DB_NAME'])
    wanted = ('HOME', 'NORM', 'SPECTRA', 'DB_NAME')
    return tuple(section[key] for key in wanted)


def path_expand(config, exclude=()):
    """Expand ``~`` in the values of a configuration section.

    Args:
        config: Dictionary mapping option names to path strings. It is
            updated in place.
        exclude: Keys whose values must be left untouched (for instance
            plain file names that are not paths).

    Returns:
        dict: The same ``config`` object, where every non-excluded value has
        been replaced by ``Path(value).expanduser()``.
    """
    # The default is an immutable tuple so no list object is shared between
    # calls. Keys are snapshotted because values are reassigned in the loop.
    for key in list(config):
        if key not in exclude:
            config[key] = Path(config[key]).expanduser()
    return config
14 changes: 7 additions & 7 deletions spectrai/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from .astorga_arg import load_data_astorga_arg
from .astorga_arg import load_spectra_astorga_arg
from .astorga_arg import load_measurements_astorga_arg
from .astorga_arg import load_data as load_data_astorga_arg
from .astorga_arg import load_spectra as load_spectra_astorga_arg
from .astorga_arg import load_measurements as load_measurements_astorga_arg

from .schmitter_vnm import load_data_schmitter_vnm
from .schmitter_vnm import load_spectra_schmitter_vnm
from .schmitter_vnm import load_spectra_rep_schmitter_vnm
from .schmitter_vnm import load_measurements_schmitter_vnm
from .schmitter_vnm import load_data as load_data_schmitter_vnm
from .schmitter_vnm import load_spectra as load_spectra_schmitter_vnm
from .schmitter_vnm import load_spectra_rep as load_spectra_rep_schmitter_vnm
from .schmitter_vnm import load_measurements as load_measurements_schmitter_vnm

from .kssl import access_to_csv

Expand Down
21 changes: 9 additions & 12 deletions spectrai/datasets/astorga_arg.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,13 @@
import re
import pandas as pd
from os.path import join
from .base import DATA_RAW
from spectrai.core import get_astorga_config


DATA_FOLDER = DATA_RAW / 'astorga_arg_2018'
DATA_SPECTRA = DATA_FOLDER / 'spectra'
DATA_MEASUREMENTS = DATA_FOLDER / 'measurements' / \
'2015-xrf-results-mean-and-errors.xls'
DATA_SPECTRA, DATA_MEASUREMENTS = get_astorga_config()


def load_spectra_astorga_arg(path=DATA_SPECTRA):
def load_spectra(path=DATA_SPECTRA):
""" Returns DRIFT/MIRs spectra, Romina's data, Argentina, 2015"""
path = Path(path)
df_list = []
Expand All @@ -26,22 +23,22 @@ def load_spectra_astorga_arg(path=DATA_SPECTRA):
return df[sorted(df.columns)].sort_index(ascending=False)


def load_measurements_astorga_arg(path=DATA_MEASUREMENTS,
analytes=['Fe', 'Ti', 'Ca', 'P', 'Ba']):
def load_measurements(path=DATA_MEASUREMENTS,
analytes=['Fe', 'Ti', 'Ca', 'P', 'Ba']):
""" Returns XRF measurements of soil samples, Argentina, 2015"""
path = Path(path)
df_labels = pd.read_excel(path, sheet_name='XRF contents FINAL')
df_labels.drop([0, 31], axis=0, inplace=True)
return df_labels[['Arg Code'] + analytes]


def load_data_astorga_arg(path_X=DATA_SPECTRA,
path_y=DATA_MEASUREMENTS):
def load_data(path_X=DATA_SPECTRA,
path_y=DATA_MEASUREMENTS):
""" Returns all available data amenable to DL models as numpy arrays."""
path_X = Path(path_X)
path_y = Path(path_y)
X = load_spectra_astorga_arg(path_X)
y = load_measurements_astorga_arg(path_y)
X = load_spectra(path_X)
y = load_measurements(path_y)

X_names = X.index.values
instances_id = X.columns.values
Expand Down
6 changes: 0 additions & 6 deletions spectrai/datasets/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
from pathlib import Path


DATA_HOME = Path.home() / Path('data_spectrai')
DATA_RAW = DATA_HOME / Path('raw')


def select_rows(df, where):
"""Perform a series of rows selection in a DataFrame
Expand Down
46 changes: 32 additions & 14 deletions spectrai/datasets/kssl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,16 @@
"""
import subprocess
from pathlib import Path
from .base import DATA_HOME, select_rows
# from .base import DATA_HOME, select_rows
from .base import select_rows
from spectrai.core import get_kssl_config
import pandas as pd
import re
import opusFC # Ref.: https://stuart-cls.github.io/python-opusfc-dist/
from tqdm import tqdm


DATA_KSSL = DATA_HOME / 'kssl'
DATA_NORM = DATA_KSSL / 'normalized'
DATA_SPECTRA = DATA_KSSL / 'spectra'
DB_NAME = 'All_Spectra_Access_Portable 2-20-20.accdb'
DATA_KSSL, DATA_NORM, DATA_SPECTRA, DB_NAME = get_kssl_config()


def access_to_csv(in_folder=None, out_folder=DATA_NORM, db_name=DB_NAME):
Expand Down Expand Up @@ -239,6 +238,16 @@ def load_taxonomy(in_folder=DATA_KSSL):
.replace({'mollisol': 'mollisols'})


def get_tax_orders_lookup_tbl(order_to_int=True):
    """Build a lookup table between taxonomic orders and integer ids.

    The ids are the positions of the unique values of the
    ``taxonomic_order`` column returned by ``load_taxonomy()``.

    Args:
        order_to_int: When true, map order -> id; otherwise map id -> order.

    Returns:
        dict: The lookup table in the requested direction.
    """
    unique_orders = load_taxonomy()['taxonomic_order'].unique()
    if order_to_int:
        return {order: pos for pos, order in enumerate(unique_orders)}
    return dict(enumerate(unique_orders))


def load_fact_tbl(in_folder=DATA_KSSL):
    """Read ``sample_analysis_fact_tbl.csv`` located in ``in_folder``.

    Returns:
        The result of ``pandas.read_csv`` on that file.
    """
    csv_path = in_folder / 'sample_analysis_fact_tbl.csv'
    return pd.read_csv(csv_path)

Expand All @@ -252,15 +261,24 @@ def get_analytes_like(substring='otas'):
return df[df['analyte_name'].str.contains(substring)]


def load_target(analyte=[725]):
def load_target(analytes=[725]):
df = load_fact_tbl()
df = df[df['analyte_id'].isin(analyte)]
df = df[df['analyte_id'].isin(analytes)]
df_tax = load_taxonomy()[['lims_pedon_id', 'taxonomic_order']]
df = df.merge(df_tax, on='lims_pedon_id', how='left')
return df[['smp_id', 'taxonomic_order', 'calc_value']]


def load_data_kssl(in_folder=DATA_KSSL, analyte=[]):
# merge spectra-target
# return as tuples (X, y, X_names, y_names)
pass
df['order_id'] = df['taxonomic_order'].map(get_tax_orders_lookup_tbl())
return df[['smp_id', 'lay_depth_to_top', 'order_id', 'calc_value']]


def load_data(in_folder=DATA_KSSL, analytes=[725], shuffle=True):
    """Join targets and spectra on ``smp_id`` and return them as arrays.

    Args:
        in_folder: Not referenced in the body; both loaders are called with
            their own defaults.
        analytes: Analyte ids forwarded to ``load_target``.
        shuffle: When true, rows are reordered with ``sample(frac=1)``.

    Returns:
        tuple: ``(X, X_names, y, y_names, instances_id)`` where ``X`` holds
        merged columns 4 onwards as float32, ``y`` holds merged columns
        1 to 3, and ``instances_id`` holds the ``smp_id`` values.
    """
    # NOTE(review): the positional slices assume the first four merged
    # columns come from load_target and that the first spectra column is
    # the join key -- confirm against both loaders.
    target = load_target(analytes)
    spectra = load_spectra()
    merged = target.merge(spectra, on='smp_id')
    if shuffle:
        merged = merged.sample(frac=1)

    target_part = merged.iloc[:, 1:4]
    X = merged.iloc[:, 4:].to_numpy('float32')
    y = target_part.to_numpy()
    X_names = spectra.iloc[:, 1:].columns.values.astype('int32')
    y_names = target_part.columns.values
    instances_id = merged['smp_id'].values
    return (X, X_names, y, y_names, instances_id)
28 changes: 15 additions & 13 deletions spectrai/datasets/schmitter_vnm.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
from pathlib import Path
import re
import pandas as pd
from .base import DATA_RAW
from spectrai.core import get_schmitter_config
import brukeropusreader


DATA_FOLDER = DATA_RAW / 'schmitter_vnm_2010'
DATA_SPECTRA = DATA_FOLDER / 'mir-models'
DATA_SPECTRA_REP = DATA_FOLDER / 'vietnam-petra'
DATA_MEASUREMENTS = DATA_FOLDER / 'mir-models' / \
'20090215-soil-database-mirs.xls'
# DATA_FOLDER = DATA_RAW / 'schmitter_vnm_2010'
# DATA_SPECTRA = DATA_FOLDER / 'mir-models'
# DATA_SPECTRA_REP = DATA_FOLDER / 'vietnam-petra'
# DATA_MEASUREMENTS = DATA_FOLDER / 'mir-models' / \
# '20090215-soil-database-mirs.xls'
#
DATA_SPECTRA, DATA_SPECTRA_REP, DATA_MEASUREMENTS = get_schmitter_config()


def load_spectra_schmitter_vnm(path=DATA_SPECTRA):
def load_spectra(path=DATA_SPECTRA):
""" Returns DRIFT/MIRs spectra, Petra's data, Vietnam, 2007-2008"""
path = Path(path)
df_list = []
Expand All @@ -32,7 +34,7 @@ def load_spectra_schmitter_vnm(path=DATA_SPECTRA):
return pd.concat(df_list, axis=1, ignore_index=False, sort=False).set_index('wavenumber')


def load_spectra_rep_schmitter_vnm(path=DATA_SPECTRA_REP):
def load_spectra_rep(path=DATA_SPECTRA_REP):
""" Returns DRIFT/MIRs spectra and their replicates, Petra's data, Vietnam, 2007-2008"""
path = Path(path)
df_list = []
Expand All @@ -56,21 +58,21 @@ def load_spectra_rep_schmitter_vnm(path=DATA_SPECTRA_REP):
return df.reindex(sorted(df.columns), axis=1)


def load_measurements_schmitter_vnm(path=DATA_MEASUREMENTS):
def load_measurements(path=DATA_MEASUREMENTS):
path = Path(path)
df_labels = pd.read_excel(path, sheet_name='Sheet1', usecols=list(range(2, 13)), na_values='-')
df_labels.columns = ['total_label', 'mir_label', 'TC', 'TOC', 'TIC', 'TN',
'CEC', 'K', 'FCAVER', 'FCIAVER', 'FSAAVER']
return df_labels.set_index('mir_label')


def load_data_schmitter_vnm(path_X=DATA_SPECTRA,
path_y=DATA_MEASUREMENTS):
def load_data(path_X=DATA_SPECTRA,
path_y=DATA_MEASUREMENTS):
""" Returns all available data amenable to DL models as numpy arrays."""
path_X = Path(path_X)
path_y = Path(path_y)
X = load_spectra_schmitter_vnm(path_X)
y = load_measurements_schmitter_vnm(path_y)
X = load_spectra(path_X)
y = load_measurements(path_y)

common_ids = _get_common_ids(X, y)
y = y.loc[common_ids, :]
Expand Down
4 changes: 0 additions & 4 deletions spectrai/datasets/scripts/test.sh

This file was deleted.

Empty file added spectrai/vis/__init__.py
Empty file.
11 changes: 11 additions & 0 deletions spectrai/vis/spectra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import numpy as np
import matplotlib.pyplot as plt


def plot_spectra(X, X_names, figsize=(18, 5), sample=20):
    """Plot randomly picked rows of ``X`` against ``X_names``.

    Args:
        X: 2-D array indexed as ``X[rows, :]``; each picked row is drawn as
            one line.
        X_names: Values used for the x axis (labelled 'Wavenumber').
        figsize: Figure size forwarded to ``plt.subplots``.
        sample: Number of row indices drawn with ``np.random.randint``
            (independent draws, so an index can repeat).
    """
    with plt.style.context('ggplot'):
        _, axes = plt.subplots(figsize=figsize)
        picked = np.random.randint(X.shape[0], size=sample)
        # Left limit is the maximum, so the x axis runs from high to low.
        left, right = np.max(X_names), np.min(X_names)
        axes.set_xlim(left, right)
        plt.xlabel('Wavenumber')
        axes.plot(X_names, X[picked, :].T)

0 comments on commit b5a8141

Please sign in to comment.