Skip to content

Commit

Permalink
Merge pull request #2035 from NNPDF/Jet_commondata
Browse files Browse the repository at this point in the history
Polarised Jet commondata implementation
  • Loading branch information
giacomomagni authored Jul 16, 2024
2 parents 4f4e41d + 9925e4d commit e3f7086
Show file tree
Hide file tree
Showing 173 changed files with 27,704 additions and 88 deletions.
90 changes: 90 additions & 0 deletions nnpdf_data/nnpdf_data/filter_utils/correlations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import numpy as np
from numpy.linalg import eig


def upper_triangular_to_symmetric(ut, dim):
    """Unpack a flattened upper triangle into a ``dim x dim`` matrix.

    ``ut`` is read row by row: the first ``dim`` entries fill row 0 from the
    diagonal onwards, the next ``dim - 1`` entries fill row 1, and so on.

    NOTE(review): despite the name, only the diagonal and the entries above
    it are written; everything below the diagonal stays zero. Presumably the
    callers mirror the result themselves -- confirm before relying on it.
    """
    corr = np.zeros((dim, dim))
    offset = 0
    for row in range(dim):
        # Row ``row`` owns the next ``dim - row`` entries of the flat input.
        width = dim - row
        corr[row, row:] = ut[offset : offset + width]
        offset += width
    return corr


def compute_covmat(corrmat: np.ndarray, unc: np.ndarray, ndata: int) -> list:
    """Turn a correlation matrix into artificial uncertainties.

    The covariance is built element-wise as ``unc[i] * corrmat[i, j] * unc[j]``
    and passed, flattened row by row, to ``covmat_to_artunc`` together with
    ``ndata``. Whatever that call returns is returned unchanged.
    """
    # Scale every correlation entry by the uncertainty of its row and column.
    covariance = np.einsum("i,ij,j->ij", unc, corrmat, unc)
    flat_covariance = covariance.flatten().tolist()
    return covmat_to_artunc(ndata, flat_covariance)


def covmat_to_artunc(ndata, covmat_list, no_of_norm_mat=0):
    r"""Decompose a covariance matrix into artificial uncertainties.

    The flattened covariance matrix is rebuilt as an ``ndata x ndata`` array
    and diagonalised. Every eigenvector, scaled by the square root of its
    eigenvalue, becomes one column of the returned matrix.

    NOTE: This function has been taken from validphys.newcommondata_utils.
    If those utils get merged in the future, we can replace this.

    Parameters
    ----------
    ndata : integer
        Number of data points.
    covmat_list : list
        One dimensional list holding the elements of the covariance
        matrix row by row.
    no_of_norm_mat : int
        Number of normalized distributions the covariance matrix pertains
        to (1 for a single normalized distribution, 3 for the cross
        covariance of three of them, ...). Such matrices may have
        vanishing eigenvalues, so eigenvalues in ``(-1e-10, 0]`` are
        accepted as long as their count stays below ``no_of_norm_mat + 1``.
        The default, 0, is meant for absolute distributions.

    Returns
    -------
    artunc : list
        A two dimensional matrix (list of lists); the i-th inner list
        contains the artificial uncertainties of the i-th data point.
        Columns belonging to a negative eigenvalue are left at zero.

    Raises
    ------
    ValueError
        If an eigenvalue is below ``-1e-10``, or if the number of
        eigenvalues in ``(-1e-10, 0]`` reaches ``no_of_norm_mat + 1``.
    """
    tolerance = -0.0000000001
    covmat = np.zeros((ndata, ndata))
    artunc = np.zeros((ndata, ndata))

    # Unflatten the row-major list into the square matrix.
    for position, entry in enumerate(covmat_list):
        row, col = divmod(position, ndata)
        covmat[row][col] = entry

    # NOTE(review): the input is presumably symmetric, in which case
    # ``numpy.linalg.eigh`` would guarantee real eigenvalues; it is not used
    # here because it may order/sign the eigenvectors differently -- confirm
    # before switching.
    eigval, eigvec = eig(covmat)

    is_psd = True
    null_count = 0
    for value in eigval:
        if value < tolerance:
            # Clearly negative eigenvalue: never acceptable.
            is_psd = False
            continue
        if value > tolerance and value <= 0:
            # Numerically vanishing eigenvalue: tolerated only a limited
            # number of times.
            null_count += 1
            if null_count == (no_of_norm_mat + 1):
                is_psd = False

    if not is_psd:
        raise ValueError("The covariance matrix is not positive-semidefinite")

    for col, value in enumerate(eigval):
        if value < 0:
            # Tiny negative eigenvalue within tolerance: keep a zero column.
            continue
        artunc[:, col] = eigvec[:, col] * np.sqrt(value)
    return artunc.tolist()
27 changes: 27 additions & 0 deletions nnpdf_data/nnpdf_data/filter_utils/uncertainties.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

import numpy as np

def symmetrize_errors(delta_plus, delta_minus):
    r"""Symmetrize an asymmetric uncertainty.

    Parameters
    ----------
    delta_plus : float
        The top/plus uncertainty with sign
    delta_minus : float
        The bottom/minus uncertainty with sign

    Returns
    -------
    se_delta : float
        The value to be added to the data point,
        ``(delta_plus + delta_minus) / 2``
    se_sigma : float
        The symmetrized uncertainty to be used in commondata,
        ``sqrt(((delta_plus - delta_minus) / 2)**2 + 2 * se_delta**2)``
    """
    shift = (delta_plus + delta_minus) / 2
    half_spread = (delta_plus - delta_minus) / 2
    sigma = np.sqrt(half_spread * half_spread + 2 * shift * shift)
    return shift, sigma

Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
HERE = pathlib.Path(__file__).parent
sys.path = [str(HERE.parent / "HERMES_NC_7GEV_EP")] + sys.path

from filter import compute_covmat

from nnpdf_data.filter_utils.correlations import compute_covmat

def read_data(fnames):
df = pd.DataFrame()
Expand Down Expand Up @@ -81,11 +80,9 @@ def write_data(df):
# Extract the correlation matrix and compute artificial systematics
ndata_points = len(data_central)
corrmatrix = read_corrmatrix(nb_datapoints=ndata_points)
# Compute the covariance matrix
compute_covmat(corrmatrix, df, ndata_points)

# Compute the covariance matrix
art_sys = compute_covmat(corrmatrix, df, ndata_points)
art_sys = compute_covmat(corrmatrix, df['stat'], ndata_points)

error = []
for i in range(ndata_points):
Expand Down
84 changes: 2 additions & 82 deletions nnpdf_data/nnpdf_data/new_commondata/HERMES_NC_7GEV_EP/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
import pathlib

import numpy as np
from numpy.linalg import eig
import pandas as pd
import yaml

from nnpdf_data.filter_utils.correlations import compute_covmat

def read_data(fnames):
df = pd.DataFrame()
Expand Down Expand Up @@ -49,84 +49,6 @@ def read_corrmatrix(nb_datapoints: int = 15) -> np.ndarray:

return df_corrs.value.values.reshape((nb_datapoints, nb_datapoints))


def covmat_to_artunc(ndata, covmat_list, no_of_norm_mat=0):
    r"""Decompose a covariance matrix into artificial uncertainties.

    The flattened covariance matrix is rebuilt as an ``ndata x ndata`` array
    and diagonalised. Every eigenvector, scaled by the square root of its
    eigenvalue, becomes one column of the returned matrix.

    NOTE: This function has been taken from validphys.newcommondata_utils.
    If those utils get merged in the future, we can replace this.

    Parameters
    ----------
    ndata : integer
        Number of data points.
    covmat_list : list
        One dimensional list holding the elements of the covariance
        matrix row by row.
    no_of_norm_mat : int
        Number of normalized distributions the covariance matrix pertains
        to (1 for a single normalized distribution, 3 for the cross
        covariance of three of them, ...). Such matrices may have
        vanishing eigenvalues, so eigenvalues in ``(-1e-10, 0]`` are
        accepted as long as their count stays below ``no_of_norm_mat + 1``.
        The default, 0, is meant for absolute distributions.

    Returns
    -------
    artunc : list
        A two dimensional matrix (list of lists); the i-th inner list
        contains the artificial uncertainties of the i-th data point.
        Columns belonging to a negative eigenvalue are left at zero.

    Raises
    ------
    ValueError
        If an eigenvalue is below ``-1e-10``, or if the number of
        eigenvalues in ``(-1e-10, 0]`` reaches ``no_of_norm_mat + 1``.
    """
    tolerance = -0.0000000001
    covmat = np.zeros((ndata, ndata))
    artunc = np.zeros((ndata, ndata))

    # Unflatten the row-major list into the square matrix.
    for position, entry in enumerate(covmat_list):
        row, col = divmod(position, ndata)
        covmat[row][col] = entry

    eigval, eigvec = eig(covmat)

    is_psd = True
    null_count = 0
    for value in eigval:
        if value < tolerance:
            # Clearly negative eigenvalue: never acceptable.
            is_psd = False
            continue
        if value > tolerance and value <= 0:
            # Numerically vanishing eigenvalue: tolerated only a limited
            # number of times.
            null_count += 1
            if null_count == (no_of_norm_mat + 1):
                is_psd = False

    if not is_psd:
        raise ValueError('The covariance matrix is not positive-semidefinite')

    for col, value in enumerate(eigval):
        if value < 0:
            # Tiny negative eigenvalue within tolerance: keep a zero column.
            continue
        artunc[:, col] = eigvec[:, col] * np.sqrt(value)
    return artunc.tolist()


def compute_covmat(corrmat: np.ndarray, df: pd.DataFrame, ndata: int) -> list:
    """Turn a correlation matrix into artificial stat uncertainties.

    The ``"stat"`` column of ``df`` scales the correlation matrix
    element-wise (``stat[i] * corrmat[i, j] * stat[j]``); the flattened
    result is handed to ``covmat_to_artunc`` together with ``ndata`` and
    the value returned by that call is returned unchanged.
    """
    stat_unc = df["stat"]
    # Scale every correlation entry by the stat error of its row and column.
    covariance = np.einsum("i,ij,j->ij", stat_unc, corrmat, stat_unc)
    flat_covariance = covariance.flatten().tolist()
    return covmat_to_artunc(ndata, flat_covariance)


def write_data(df):
data_central = []
for i in range(len(df["G"])):
Expand All @@ -153,11 +75,9 @@ def write_data(df):
# Extract the correlation matrix and compute artificial systematics
ndata_points = len(data_central)
corrmatrix = read_corrmatrix(nb_datapoints=ndata_points)
# Compute the covariance matrix
compute_covmat(corrmatrix, df, ndata_points)

# Compute the covariance matrix
art_sys = compute_covmat(corrmatrix, df, ndata_points)
art_sys = compute_covmat(corrmatrix, df['stat'], ndata_points)

error = []
for i in range(ndata_points):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
data_central:
- -0.0014
- -0.0005
- 0.0058
- 0.0034
- 0.0077
- -0.0181
101 changes: 101 additions & 0 deletions nnpdf_data/nnpdf_data/new_commondata/PHENIX_1JET_200GEV/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import pandas as pd
import yaml

POL_UNC = 0.094


def read_data():
    """Load ``rawdata/Table4.yaml`` into a DataFrame with one row per point.

    The first independent variable provides the ``low``/``high`` edges of
    each pT bin; the first dependent variable provides the pT value and the
    second one the asymmetry (``ALL``) together with its first quoted
    error, stored as ``stat``. ``eta`` (0.0 with edges -0.35/0.35) and
    ``sqrts`` (200) are the same for every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``pT, pTmin, pTmax, eta, eta_min, eta_max, sqrts, ALL,
        stat`` plus ``pol``, computed as ``POL_UNC * |ALL|``.
    """
    with open("rawdata/Table4.yaml", "r", encoding="utf-8") as file:
        data = yaml.safe_load(file)

    pTbsub = data["independent_variables"][0]["values"]
    pTsub = data["dependent_variables"][0]["values"]
    ALLsub = data["dependent_variables"][1]["values"]

    # Collect all rows first and build the frame once: growing a DataFrame
    # with pd.concat inside the loop copies every previous row each time.
    rows = [
        {
            "pT": pTsub[i]["value"],
            "pTmin": pTbsub[i]["low"],
            "pTmax": pTbsub[i]["high"],
            "eta": 0.0,
            "eta_min": -0.35,
            "eta_max": 0.35,
            "sqrts": 200,
            "ALL": asymmetry["value"],
            "stat": asymmetry["errors"][0]["symerror"],
        }
        for i, asymmetry in enumerate(ALLsub)
    ]
    df = pd.DataFrame(rows)

    df["pol"] = POL_UNC * abs(df["ALL"])
    return df


def write_data(df):
    """Write ``data.yaml``, ``kinematics.yaml`` and ``uncertainties.yaml``.

    The files are written to the current working directory, in that order.
    Rows of ``df`` are addressed by the labels ``0 .. n-1`` and read from
    the columns ``ALL``, ``pT``, ``pTmin``, ``pTmax``, ``sqrts``, ``eta``,
    ``eta_min``, ``eta_max``, ``stat`` and ``pol``; every value is
    converted to ``float`` before being dumped.
    """

    def as_float(row, column):
        return float(df.loc[row, column])

    def dump_yaml(payload, filename):
        with open(filename, "w") as file:
            yaml.dump(payload, file, sort_keys=False)

    # Write central values
    central_values = [as_float(i, "ALL") for i in range(len(df["ALL"]))]
    dump_yaml({"data_central": central_values}, "data.yaml")

    # Write kin file
    kin_bins = [
        {
            "pT": {
                "min": as_float(i, "pTmin"),
                "mid": as_float(i, "pT"),
                "max": as_float(i, "pTmax"),
            },
            "sqrts": {"min": None, "mid": as_float(i, "sqrts"), "max": None},
            "eta": {
                "min": as_float(i, "eta_min"),
                "mid": as_float(i, "eta"),
                "max": as_float(i, "eta_max"),
            },
        }
        for i in range(len(df["ALL"]))
    ]
    dump_yaml({"bins": kin_bins}, "kinematics.yaml")

    # Write unc file
    unc_bins = [
        {"stat": as_float(i, "stat"), "pol": as_float(i, "pol")}
        for i in range(len(df))
    ]
    definitions = {
        "stat": {
            "description": "statistical uncertainty",
            "treatment": "ADD",
            "type": "UNCORR",
        },
        "pol": {
            "description": "beam polarization uncertainty",
            "treatment": "MULT",
            "type": "RHIC2005POL",
        },
    }
    dump_yaml({"definitions": definitions, "bins": unc_bins}, "uncertainties.yaml")


if __name__ == "__main__":
    # Script entry point: read the raw table, then write the commondata files.
    write_data(read_data())
Loading

0 comments on commit e3f7086

Please sign in to comment.