Skip to content

Commit

Permalink
Merge pull request #756 from cms-analysis/rebase_548_onto_112x
Browse files Browse the repository at this point in the history
Cherry-pick #548 merge into 112x
  • Loading branch information
nsmith- authored Apr 8, 2022
2 parents 23a4bb2 + d6a9253 commit e887c1c
Show file tree
Hide file tree
Showing 5 changed files with 225 additions and 3 deletions.
20 changes: 20 additions & 0 deletions data/tutorials/shapes/simple-shapes-df.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
imax 1
jmax 1
kmax *
--------------------------------------------------------------------------------
shapes * * simple-shapes-df_input.csv $CHANNEL:$PROCESS:nominal,sum_w:sum_ww $CHANNEL:$PROCESS:$SYSTEMATIC,sum_w:sum_ww
--------------------------------------------------------------------------------
bin bin1
observation 85
--------------------------------------------------------------------------------
bin bin1 bin1
process signal background
process 0 1
rate 10 100
--------------------------------------------------------------------------------
lumi lnN 1.10 1.0
bgnorm lnN 1.00 1.3
alpha shapeN2 - 1 uncertainty on background shape and normalization
sigma shapeN2 0.5 - uncertainty on signal resolution. Assume the histogram is a 2 sigma shift,
# so divide the unit gaussian by 2 before doing the interpolationkk
--------------------------------------------------------------------------------
89 changes: 89 additions & 0 deletions data/tutorials/shapes/simple-shapes-df_input.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
channel,process,systematic,bin,sum_w,sum_ww
bin1,background,alphaDown,0.0,30.224781036376953,30.224781036376957
bin1,background,alphaDown,1.0,20.260276794433594,20.26027679443359
bin1,background,alphaDown,2.0,13.580870628356934,13.580870628356934
bin1,background,alphaDown,3.0,9.103529930114746,9.103529930114746
bin1,background,alphaDown,4.0,6.102278709411621,6.102278709411621
bin1,background,alphaDown,5.0,4.090479373931885,4.090479373931884
bin1,background,alphaDown,6.0,2.7419304847717285,2.741930484771729
bin1,background,alphaDown,7.0,1.8379708528518677,1.837970852851868
bin1,background,alphaDown,8.0,1.2320287227630615,1.2320287227630617
bin1,background,alphaDown,9.0,0.8258535861968994,0.8258535861968994
bin1,background,alphaDown,10.0,0.0,0.0
bin1,background,alphaUp,0.0,24.10872459411621,24.108724594116207
bin1,background,alphaUp,1.0,19.738554000854492,19.738554000854492
bin1,background,alphaUp,2.0,16.16056251525879,16.16056251525879
bin1,background,alphaUp,3.0,13.231148719787598,13.231148719787598
bin1,background,alphaUp,4.0,10.832748413085938,10.832748413085938
bin1,background,alphaUp,5.0,8.869104385375977,8.869104385375975
bin1,background,alphaUp,6.0,7.26140832901001,7.261408329010011
bin1,background,alphaUp,7.0,5.945138454437256,5.945138454437256
bin1,background,alphaUp,8.0,4.867467403411865,4.867467403411864
bin1,background,alphaUp,9.0,3.985145330429077,3.985145330429077
bin1,background,alphaUp,10.0,0.0,0.0
bin1,background,nominal,0.0,27.27617835998535,27.276178359985355
bin1,background,nominal,1.0,20.20669174194336,20.206691741943363
bin1,background,nominal,2.0,14.96948528289795,14.969485282897947
bin1,background,nominal,3.0,11.089667320251465,11.089667320251465
bin1,background,nominal,4.0,8.21542739868164,8.21542739868164
bin1,background,nominal,5.0,6.0861382484436035,6.086138248443604
bin1,background,nominal,6.0,4.508721828460693,4.508721828460693
bin1,background,nominal,7.0,3.3401434421539307,3.3401434421539307
bin1,background,nominal,8.0,2.4744391441345215,2.474439144134521
bin1,background,nominal,9.0,1.8331094980239868,1.833109498023987
bin1,background,nominal,10.0,0.0,0.0
bin1,data_obs,nominal,0.0,22.0,0.0
bin1,data_obs,nominal,1.0,14.0,14.0
bin1,data_obs,nominal,2.0,16.0,16.0
bin1,data_obs,nominal,3.0,13.0,12.999999999999998
bin1,data_obs,nominal,4.0,5.0,5.000000000000001
bin1,data_obs,nominal,5.0,5.0,5.000000000000001
bin1,data_obs,nominal,6.0,5.0,5.000000000000001
bin1,data_obs,nominal,7.0,3.0,2.9999999999999996
bin1,data_obs,nominal,8.0,2.0,2.0000000000000004
bin1,data_obs,nominal,9.0,0.0,0.0
bin1,data_obs,nominal,10.0,0.0,0.0
bin1,data_sig,nominal,0.0,30.0,0.0
bin1,data_sig,nominal,1.0,13.0,12.999999999999998
bin1,data_sig,nominal,2.0,13.0,12.999999999999998
bin1,data_sig,nominal,3.0,10.0,10.000000000000002
bin1,data_sig,nominal,4.0,11.0,11.0
bin1,data_sig,nominal,5.0,13.0,12.999999999999998
bin1,data_sig,nominal,6.0,6.0,5.999999999999999
bin1,data_sig,nominal,7.0,6.0,5.999999999999999
bin1,data_sig,nominal,8.0,3.0,2.9999999999999996
bin1,data_sig,nominal,9.0,2.0,2.0000000000000004
bin1,data_sig,nominal,10.0,0.0,0.0
bin1,signal,nominal,0.0,1.0769933851406677e-06,1.0769933851406677e-06
bin1,signal,nominal,1.0,0.0001598399830982089,0.00015983998309820893
bin1,signal,nominal,2.0,0.008726967498660088,0.008726967498660088
bin1,signal,nominal,3.0,0.17528583109378815,0.17528583109378812
bin1,signal,nominal,4.0,1.295196771621704,1.295196771621704
bin1,signal,nominal,5.0,3.520709991455078,3.5207099914550777
bin1,signal,nominal,6.0,3.520709991455078,3.5207099914550777
bin1,signal,nominal,7.0,1.295196771621704,1.295196771621704
bin1,signal,nominal,8.0,0.17528583109378815,0.17528583109378812
bin1,signal,nominal,9.0,0.008726967498660088,0.008726967498660088
bin1,signal,nominal,10.0,0.0,0.0
bin1,signal,sigmaDown,0.0,2.240517341968451e-13,2.2405173419684507e-13
bin1,signal,sigmaDown,1.0,6.052358614283548e-09,6.052358614283548e-09
bin1,signal,sigmaDown,2.0,2.124152706528548e-05,2.1241527065285485e-05
bin1,signal,sigmaDown,3.0,0.009685711935162544,0.009685711935162543
bin1,signal,sigmaDown,4.0,0.5738020539283752,0.5738020539283751
bin1,signal,sigmaDown,5.0,4.4164910316467285,4.4164910316467285
bin1,signal,sigmaDown,6.0,4.4164910316467285,4.4164910316467285
bin1,signal,sigmaDown,7.0,0.5738020539283752,0.5738020539283751
bin1,signal,sigmaDown,8.0,0.009685711935162544,0.009685711935162543
bin1,signal,sigmaDown,9.0,2.124152706528548e-05,2.1241527065285485e-05
bin1,signal,sigmaDown,10.0,0.0,0.0
bin1,signal,sigmaUp,0.0,0.006812803912907839,0.00681280391290784
bin1,signal,sigmaUp,1.0,0.048034943640232086,0.048034943640232086
bin1,signal,sigmaUp,2.0,0.22916190326213837,0.22916190326213834
bin1,signal,sigmaUp,3.0,0.739743709564209,0.739743709564209
bin1,signal,sigmaUp,4.0,1.6157487630844116,1.6157487630844116
bin1,signal,sigmaUp,5.0,2.3879218101501465,2.3879218101501465
bin1,signal,sigmaUp,6.0,2.3879218101501465,2.3879218101501465
bin1,signal,sigmaUp,7.0,1.6157487630844116,1.6157487630844116
bin1,signal,sigmaUp,8.0,0.739743709564209,0.739743709564209
bin1,signal,sigmaUp,9.0,0.22916190326213837,0.22916190326213834
bin1,signal,sigmaUp,10.0,0.0,0.0
2 changes: 1 addition & 1 deletion docs/part2/settinguptheanalysis.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ For each channel, histograms have to be provided for the observed shape and for
- The normalization of the data histogram must correspond to the number of observed events
- The normalization of the expected histograms must match the expected yields

The combine tool can take as input histograms saved as TH1 or as RooAbsHist in a RooFit workspace (an example of how to create a RooFit workspace and save histograms is available in [github](https://github.com/cms-analysis/HiggsAnalysis-CombinedLimit/blob/81x-root606/data/benchmarks/shapes/make_simple_shapes.cxx)).
The combine tool can take as input histograms saved as TH1, as RooAbsHist in a RooFit workspace (an example of how to create a RooFit workspace and save histograms is available in [github](https://github.com/cms-analysis/HiggsAnalysis-CombinedLimit/blob/81x-root606/data/benchmarks/shapes/make_simple_shapes.cxx)), or from a pandas dataframe ([example](https://github.com/cms-analysis/HiggsAnalysis-CombinedLimit/blob/102x/data/tutorials/shapes/simple-shapes-df.txt))

The block of lines defining the mapping (first block in the datacard) contains one or more rows in the form

Expand Down
102 changes: 102 additions & 0 deletions python/DataFrameWrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import numpy as np
import pandas as pd
import ROOT

class DataFrameWrapper(object):
th1_class = "TH1F"
def __init__(self, path, ext, load=True):
"""
path: file system path to the dataframe on disk
ext: file extension used to interpret the file based on pandas IO
load: skip loading the dataframe if False
"""
self.path = path
self.ext = ext

self.read_args = []
self.read_kwargs = {}
if load:
self.df = self.load_dataframe()

def load_dataframe(self):
"""
Use pandas IO tools to load a dataframe from any of the following:
["csv", "json", "html", "pkl", "xlsx", "h5", "parquet"]
"""
# Index all columns apart from the last 2 (taken as sum_w and sum_ww)
# for csv, json, html and xlsx
if self.ext == ".csv":
df = pd.read_csv(self.path, *self.read_args, **self.read_kwargs)
return df.set_index(df.columns.tolist()[:-2])
elif self.ext == ".json":
df = pd.read_json(self.path, *self.read_args, **self.read_kwargs)
return df.set_index(df.columns.tolist()[:-2])
elif self.ext == ".html":
df = pd.read_html(self.path, *self.read_args, **self.read_kwargs)
return df.set_index(df.columns.tolist()[:-2])
elif self.ext == ".pkl":
return pd.read_pickle(self.path, *self.read_args, **self.read_kwargs)
elif self.ext == ".xlsx":
if ":" in self.path:
filepath, sheetname = self.path.split(":")
else:
filepath, sheetname = self.path, 0

# read in columns first
cols = pd.read_excel(
self.path, sheetname, *self.read_args, **self.read_kwargs
).columns.tolist()
return pd.read_excel(
self.path, sheetname, index_col=list(range(len(cols)-2)),
*self.read_args, **self.read_kwargs
)
elif self.ext == ".h5":
filepath, internalpath = self.path.split(":")
return pd.read_hdf(filepath, internalpath, *self.read_args, **self.read_kwargs)
elif self.ext == ".parquet":
return pd.read_parquet(self.path, *self.read_args, **self.read_kwargs)
return None

def Get(self, object_name):
"""
Mimic ROOT file Get function to return a ROOT.TH1.
object_name: dataframe index/columns selection split by ":" (used in .loc)
e.g. "125:bin1:signal:sigmaUp,event_count:event_variance" to select
the index (125, bin1, signal, sigmaUp) and columns (event_count,
event_variance)
"""
if not hasattr(self, 'df'):
raise AttributeError("Dataframe has not been loaded")

index_labels, column_labels = object_name.split(",")

# Try to cast index_labels into self.df.index dtypes. Users can only
# input index_labels as a string, but the dataframe might have other
# dtypes (e.g. int, float, ...)
selection = [
np.array([index_labels.split(":")[idx]]).astype(
self.df.index.get_level_values(idx).dtype
)[0] for idx in range(len(index_labels.split(":")))
]

df_hist = self.df.loc[tuple(selection), column_labels.split(":")]

# index name used for th1 name
df_hist.index.names = [index_labels.replace(":","_")]
return self.convert_to_th1(df_hist, self.th1_class)

@staticmethod
def convert_to_th1(df, th1_class):
"""
Receive a dataframe and convert it to a TH1. Index is taken as the
binning for labelling. Last bin is overflow.
"""
name = df.index.names[0]
nbins = df.shape[0]-1
th1 = getattr(ROOT, th1_class)(name, name, nbins, 0., float(nbins))
for i in range(nbins+1):
sum_w, sum_ww = df.iloc[i]
th1.SetBinContent(i+1, sum_w)
th1.SetBinError(i+1, np.sqrt(sum_ww))
return th1
15 changes: 13 additions & 2 deletions python/ShapeTools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import ROOT
from collections import defaultdict
from math import *
from .DataFrameWrapper import DataFrameWrapper

RooArgSet_add_original = ROOT.RooArgSet.add
def RooArgSet_add_patched(self, obj, *args, **kwargs):
Expand Down Expand Up @@ -34,7 +35,15 @@ def __getitem__(self, fname):
trueFName = fname
if not os.path.exists(trueFName) and not os.path.isabs(trueFName) and os.path.exists(self._basedir+"/"+trueFName):
trueFName = self._basedir+"/"+trueFName;
self._files[fname] = [ ROOT.TFile.Open(trueFName), self._total ]
# interpret file from extension - csv, json, html, pkl, xlsx, h5, parquet
filepath = trueFName.split(":")[0]
filename, ext = os.path.splitext(filepath)
if ext in [".csv", ".json", ".html", ".pkl", ".xlsx", ".h5", ".parquet"]:
filehandle = DataFrameWrapper(trueFName, ext)
else:
# fallback to ROOT file
filehandle = ROOT.TFile.Open(trueFName)
self._files[fname] = [ filehandle, self._total ]
else:
self._files[fname][1] = self._total
self._hits[fname] += 1
Expand Down Expand Up @@ -512,7 +521,9 @@ def getShape(self,channel,process,syst="",_cache={},allowNoSyst=False):
finalNames = [ fn.replace("$%s"%mpname,mpv) for fn in finalNames ]
file = self._fileCache[finalNames[0]]; objname = finalNames[1]
if not file: raise RuntimeError, "Cannot open file %s (from pattern %s)" % (finalNames[0],names[0])
if ":" in objname: # workspace:obj or ttree:xvar or th1::xvar

# follow histogram routine if file is a dataframe and load dataframe as histograms
if ":" in objname and not isinstance(file, DataFrameWrapper): # workspace:obj or ttree:xvar or th1::xvar
(wname, oname) = objname.split(":")
if (file,wname) not in self.wspnames :
self.wspnames[(file,wname)] = file.Get(wname)
Expand Down

0 comments on commit e887c1c

Please sign in to comment.