diff --git a/event_files/delphes/hh.yaml b/event_files/delphes/hh.yaml new file mode 100644 index 0000000..e585d18 --- /dev/null +++ b/event_files/delphes/hh.yaml @@ -0,0 +1,53 @@ +# --------------------------------------------------- +# REQUIRED - INPUTS - List all inputs to SPANet here. +# --------------------------------------------------- +INPUTS: + # ----------------------------------------------------------------------------- + # REQUIRED - SEQUENTIAL - inputs which can have an arbitrary number of vectors. + # ----------------------------------------------------------------------------- + SEQUENTIAL: + Jets: + pt: log_normalize + eta: normalize + sinphi: none + cosphi: none + btag: none + # --------------------------------------------------------------------- + # REQUIRED - GLOBAL - inputs which will have a single vector per event. + # --------------------------------------------------------------------- + GLOBAL: + + +# ---------------------------------------------------------------------- +# REQUIRED - EVENT - Complete list of resonance particles and daughters. +# ---------------------------------------------------------------------- +EVENT: + h1: + - b1: Jets + - b2: Jets + h2: + - b1: Jets + - b2: Jets + +# --------------------------------------------------------- +# REQUIRED KEY - PERMUTATIONS - List of valid permutations. +# --------------------------------------------------------- +PERMUTATIONS: + EVENT: + - [ h1, h2 ] + h1: + - [ b1, b2 ] + h2: + - [ b1, b2 ] + + +# ------------------------------------------------------------------------------ +# REQUIRED - REGRESSIONS - List of desired features to regress from observables. +# ------------------------------------------------------------------------------ +REGRESSIONS: + + +# ----------------------------------------------------------------------------- +# REQUIRED - REGRESSIONS - List of desired classes to predict from observables. +# ----------------------------------------------------------------------------- +CLASSIFICATIONS: diff --git a/event_files/delphes/hh_boosted.yaml b/event_files/delphes/hh_boosted.yaml new file mode 100644 index 0000000..62654bc --- /dev/null +++ b/event_files/delphes/hh_boosted.yaml @@ -0,0 +1,70 @@ +# --------------------------------------------------- +# REQUIRED - INPUTS - List all inputs to SPANet here. +# --------------------------------------------------- +INPUTS: + # ----------------------------------------------------------------------------- + # REQUIRED - SEQUENTIAL - inputs which can have an arbitrary number of vectors. + # ----------------------------------------------------------------------------- + SEQUENTIAL: + Jets: + pt: log_normalize + eta: normalize + sinphi: none + cosphi: none + btag: none + mass: normalize + BoostedJets: + fj_pt: log_normalize + fj_eta: normalize + fj_sinphi: none + fj_cosphi: none + fj_mass: normalize + fj_sdmass: normalize + fj_nsub: normalize + fj_tau21: none + fj_tau32: none + fj_ptd: none + fj_ncharged: normalize + # --------------------------------------------------------------------- + # REQUIRED - GLOBAL - inputs which will have a single vector per event. + # --------------------------------------------------------------------- + GLOBAL: + + +# ---------------------------------------------------------------------- +# REQUIRED - EVENT - Complete list of resonance particles and daughters. +# ---------------------------------------------------------------------- +EVENT: + h1: + - b1: Jets + - b2: Jets + h2: + - b1: Jets + - b2: Jets + bh1: + - bb: BoostedJets + bh2: + - bb: BoostedJets + +# --------------------------------------------------------- +# REQUIRED KEY - PERMUTATIONS - List of valid permutations. +# --------------------------------------------------------- +PERMUTATIONS: + EVENT: + - [[h1, h2], [bh1, bh2]] + h1: + - [ b1, b2 ] + h2: + - [ b1, b2 ] + + +# ------------------------------------------------------------------------------ +# REQUIRED - REGRESSIONS - List of desired features to regress from observables. +# ------------------------------------------------------------------------------ +REGRESSIONS: + + +# ----------------------------------------------------------------------------- +# REQUIRED - REGRESSIONS - List of desired classes to predict from observables. +# ----------------------------------------------------------------------------- +CLASSIFICATIONS: diff --git a/options_files/delphes/hh.json b/options_files/delphes/hh.json new file mode 100644 index 0000000..03868ff --- /dev/null +++ b/options_files/delphes/hh.json @@ -0,0 +1,54 @@ +{ + "hidden_dim": 32, + "transformer_dim": 32, + "initial_embedding_dim": 16, + "position_embedding_dim": 16, + "num_embedding_layers": 10, + "num_encoder_layers": 4, + "num_branch_embedding_layers": 3, + "num_branch_encoder_layers": 3, + "num_jet_embedding_layers": 0, + "num_jet_encoder_layers": 2, + "num_detector_layers": 2, + "num_regression_layers": 3, + "num_classification_layers": 3, + "split_symmetric_attention": true, + "num_attention_heads": 4, + "transformer_activation": "gelu", + "linear_block_type": "GRU", + "transformer_type": "Gated", + "linear_activation": "gelu", + "normalization": "LayerNorm", + "masking": "Filling", + "skip_connections": true, + "initial_embedding_skip_connections": true, + "event_info_file": "event_files/delphes/hh.yaml", + "training_file": "data/hh_training.h5", + "normalize_features": true, + "limit_to_num_jets": 0, + "balance_jets": false, + "partial_events": true, + "balance_particles": true, + "dataset_limit": 1.0, + "train_validation_split": 0.95, + "batch_size": 4096, + "num_dataloader_workers": 4, + "mask_sequence_vectors": true, + "combine_pair_loss": "min", + "optimizer": "AdamW", + "focal_gamma": 0.0, + "learning_rate": 0.0015, + "learning_rate_cycles": 1, + "learning_rate_warmup_epochs": 1.0, + "assignment_loss_scale": 1.0, + "detection_loss_scale": 0.0, + "kl_loss_scale": 0.0, + "regression_loss_scale": 0.0, + "classification_loss_scale": 0.0, + "l2_penalty": 0.0002, + "gradient_clip": 0.0, + "dropout": 0.2, + "epochs": 250, + "num_gpu": 1, + "verbose_output": true +} diff --git a/src/data/cms/convert_to_h5.py b/src/data/cms/convert_to_h5.py index 14ea931..6019d62 100644 --- a/src/data/cms/convert_to_h5.py +++ b/src/data/cms/convert_to_h5.py @@ -14,11 +14,9 @@ logging.basicConfig(level=logging.INFO) N_JETS = 10 -N_FJETS = 3 N_MASSES = 45 MIN_JET_PT = 20 MIN_FJET_PT = 200 -MIN_JETS = 6 MIN_MASS = 50 PROJECT_DIR = Path(__file__).resolve().parents[3] @@ -33,7 +31,7 @@ def get_n_features(name, events, iterator): ) -def get_datasets(events): +def get_datasets(events, n_higgs): # noqa: C901 # small-radius jet info pt = get_n_features("jet{i}Pt", events, range(1, N_JETS + 1)) ptcorr = get_n_features("jet{i}PtCorr", events, range(1, N_JETS + 1)) @@ -50,21 +48,23 @@ def get_datasets(events): mass = get_n_features("mass{i}", events, range(N_MASSES)) # large-radius jet info - fj_pt = get_n_features("fatJet{i}Pt", events, range(1, N_FJETS + 1)) - fj_eta = get_n_features("fatJet{i}Eta", events, range(1, N_FJETS + 1)) - fj_phi = get_n_features("fatJet{i}Phi", events, range(1, N_FJETS + 1)) - fj_mass = get_n_features("fatJet{i}Mass", events, range(1, N_FJETS + 1)) - fj_sdmass = get_n_features("fatJet{i}MassSD", events, range(1, N_FJETS + 1)) - fj_regmass = get_n_features("fatJet{i}MassRegressed", events, range(1, N_FJETS + 1)) - fj_nsub = get_n_features("fatJet{i}NSubJets", events, range(1, N_FJETS + 1)) - fj_tau32 = get_n_features("fatJet{i}Tau3OverTau2", events, range(1, N_FJETS + 1)) - fj_xbb = get_n_features("fatJet{i}PNetXbb", events, range(1, N_FJETS + 1)) - fj_xqq = get_n_features("fatJet{i}PNetXjj", events, range(1, N_FJETS + 1)) - fj_qcd = get_n_features("fatJet{i}PNetQCD", events, range(1, N_FJETS + 1)) - fj_higgs_idx = get_n_features("fatJet{i}HiggsMatchedIndex", events, range(1, N_FJETS + 1)) - - # keep events with >= MIN_JETS small-radius jets - mask = ak.num(pt[pt > MIN_JET_PT]) >= MIN_JETS + n_fjets = n_higgs + fj_pt = get_n_features("fatJet{i}Pt", events, range(1, n_fjets + 1)) + fj_eta = get_n_features("fatJet{i}Eta", events, range(1, n_fjets + 1)) + fj_phi = get_n_features("fatJet{i}Phi", events, range(1, n_fjets + 1)) + fj_mass = get_n_features("fatJet{i}Mass", events, range(1, n_fjets + 1)) + fj_sdmass = get_n_features("fatJet{i}MassSD", events, range(1, n_fjets + 1)) + fj_regmass = get_n_features("fatJet{i}MassRegressed", events, range(1, n_fjets + 1)) + fj_nsub = get_n_features("fatJet{i}NSubJets", events, range(1, n_fjets + 1)) + fj_tau32 = get_n_features("fatJet{i}Tau3OverTau2", events, range(1, n_fjets + 1)) + fj_xbb = get_n_features("fatJet{i}PNetXbb", events, range(1, n_fjets + 1)) + fj_xqq = get_n_features("fatJet{i}PNetXjj", events, range(1, n_fjets + 1)) + fj_qcd = get_n_features("fatJet{i}PNetQCD", events, range(1, n_fjets + 1)) + fj_higgs_idx = get_n_features("fatJet{i}HiggsMatchedIndex", events, range(1, n_fjets + 1)) + + # keep events with >= min_jets small-radius jets + min_jets = 2 * n_higgs + mask = ak.num(pt[pt > MIN_JET_PT]) >= min_jets pt = pt[mask] ptcorr = ptcorr[mask] eta = eta[mask] @@ -105,52 +105,57 @@ def get_datasets(events): # index of small-radius jet if Higgs is reconstructed h1_bs = ak.local_index(higgs_idx)[higgs_idx == 1] h2_bs = ak.local_index(higgs_idx)[higgs_idx == 2] - h3_bs = ak.local_index(higgs_idx)[higgs_idx == 3] + if n_higgs == 3: + h3_bs = ak.local_index(higgs_idx)[higgs_idx == 3] # index of large-radius jet if Higgs is reconstructed h1_bb = ak.local_index(fj_higgs_idx)[fj_higgs_idx == 1] h2_bb = ak.local_index(fj_higgs_idx)[fj_higgs_idx == 2] - h3_bb = ak.local_index(fj_higgs_idx)[fj_higgs_idx == 3] + if n_higgs == 3: + h3_bb = ak.local_index(fj_higgs_idx)[fj_higgs_idx == 3] # check/fix small-radius jet truth (ensure max 2 small-radius jets per higgs) - check = ( - np.unique(ak.count(h1_bs, axis=-1)).to_list() - + np.unique(ak.count(h2_bs, axis=-1)).to_list() - + np.unique(ak.count(h3_bs, axis=-1)).to_list() - ) + check = np.unique(ak.count(h1_bs, axis=-1)).to_list() + np.unique(ak.count(h2_bs, axis=-1)).to_list() + if n_higgs == 3: + check += np.unique(ak.count(h3_bs, axis=-1)).to_list() + if 3 in check: logging.warning("some Higgs bosons match to 3 small-radius jets! Check truth") # check/fix large-radius jet truth (ensure max 1 large-radius jet per higgs) - fj_check = ( - np.unique(ak.count(h1_bb, axis=-1)).to_list() - + np.unique(ak.count(h2_bb, axis=-1)).to_list() - + np.unique(ak.count(h3_bb, axis=-1)).to_list() - ) + fj_check = np.unique(ak.count(h1_bb, axis=-1)).to_list() + np.unique(ak.count(h2_bb, axis=-1)).to_list() + if n_higgs == 3: + fj_check += np.unique(ak.count(h3_bb, axis=-1)).to_list() + if 2 in fj_check: logging.warning("some Higgs bosons match to 2 large-radius jets! Check truth") h1_bs = ak.fill_none(ak.pad_none(h1_bs, 2, clip=True), -1) h2_bs = ak.fill_none(ak.pad_none(h2_bs, 2, clip=True), -1) - h3_bs = ak.fill_none(ak.pad_none(h3_bs, 2, clip=True), -1) + if n_higgs == 3: + h3_bs = ak.fill_none(ak.pad_none(h3_bs, 2, clip=True), -1) h1_bb = ak.fill_none(ak.pad_none(h1_bb, 1, clip=True), -1) h2_bb = ak.fill_none(ak.pad_none(h2_bb, 1, clip=True), -1) - h3_bb = ak.fill_none(ak.pad_none(h3_bb, 1, clip=True), -1) + if n_higgs == 3: + h3_bb = ak.fill_none(ak.pad_none(h3_bb, 1, clip=True), -1) h1_b1, h1_b2 = h1_bs[:, 0], h1_bs[:, 1] h2_b1, h2_b2 = h2_bs[:, 0], h2_bs[:, 1] - h3_b1, h3_b2 = h3_bs[:, 0], h3_bs[:, 1] + if n_higgs == 3: + h3_b1, h3_b2 = h3_bs[:, 0], h3_bs[:, 1] # mask whether Higgs can be reconstructed as 2 small-radius jet h1_mask = ak.all(h1_bs != -1, axis=-1) h2_mask = ak.all(h2_bs != -1, axis=-1) - h3_mask = ak.all(h3_bs != -1, axis=-1) + if n_higgs == 3: + h3_mask = ak.all(h3_bs != -1, axis=-1) # mask whether Higgs can be reconstructed as 1 large-radius jet h1_fj_mask = ak.all(h1_bb != -1, axis=-1) h2_fj_mask = ak.all(h2_bb != -1, axis=-1) - h3_fj_mask = ak.all(h3_bb != -1, axis=-1) + if n_higgs == 3: + h3_fj_mask = ak.all(h3_bb != -1, axis=-1) datasets = {} datasets["INPUTS/Jets/MASK"] = mask.to_numpy() @@ -193,9 +198,10 @@ def get_datasets(events): datasets["TARGETS/h2/b1"] = h2_b1.to_numpy() datasets["TARGETS/h2/b2"] = h2_b2.to_numpy() - datasets["TARGETS/h3/mask"] = h3_mask.to_numpy() - datasets["TARGETS/h3/b1"] = h3_b1.to_numpy() - datasets["TARGETS/h3/b2"] = h3_b2.to_numpy() + if n_higgs == 3: + datasets["TARGETS/h3/mask"] = h3_mask.to_numpy() + datasets["TARGETS/h3/b1"] = h3_b1.to_numpy() + datasets["TARGETS/h3/b2"] = h3_b2.to_numpy() datasets["TARGETS/bh1/mask"] = h1_fj_mask.to_numpy() datasets["TARGETS/bh1/bb"] = h1_bb.to_numpy().reshape(h1_fj_mask.to_numpy().shape) @@ -203,8 +209,9 @@ def get_datasets(events): datasets["TARGETS/bh2/mask"] = h2_fj_mask.to_numpy() datasets["TARGETS/bh2/bb"] = h2_bb.to_numpy().reshape(h2_fj_mask.to_numpy().shape) - datasets["TARGETS/bh3/mask"] = h3_fj_mask.to_numpy() - datasets["TARGETS/bh3/bb"] = h3_bb.to_numpy().reshape(h3_fj_mask.to_numpy().shape) + if n_higgs == 3: + datasets["TARGETS/bh3/mask"] = h3_fj_mask.to_numpy() + datasets["TARGETS/bh3/bb"] = h3_bb.to_numpy().reshape(h3_fj_mask.to_numpy().shape) return datasets @@ -213,7 +220,14 @@ def get_datasets(events): @click.argument("in-files", nargs=-1) @click.option("--out-file", default=f"{PROJECT_DIR}/data/cms/hhh_training.h5", help="Output file.") @click.option("--train-frac", default=0.95, help="Fraction for training.") -def main(in_files, out_file, train_frac): +@click.option( + "--n-higgs", + "n_higgs", + default=3, + type=click.IntRange(2, 3), + help="Number of Higgs bosons per event", +) +def main(in_files, out_file, train_frac, n_higgs): all_datasets = {} for file_name in in_files: with uproot.open(file_name) as in_file: @@ -232,7 +246,7 @@ def main(in_files, out_file, train_frac): schemaclass=BaseSchema, ).events() - datasets = get_datasets(events) + datasets = get_datasets(events, n_higgs) for dataset_name, data in datasets.items(): if dataset_name not in all_datasets: all_datasets[dataset_name] = [] diff --git a/src/data/delphes/convert_to_h5.py b/src/data/delphes/convert_to_h5.py index 5fd3898..3249057 100644 --- a/src/data/delphes/convert_to_h5.py +++ b/src/data/delphes/convert_to_h5.py @@ -21,10 +21,8 @@ logging.basicConfig(level=logging.INFO) N_JETS = 10 -N_FJETS = 3 MIN_JET_PT = 20 MIN_FJET_PT = 200 -MIN_JETS = 6 PROJECT_DIR = Path(__file__).resolve().parents[3] @@ -32,51 +30,51 @@ def to_np_array(ak_array, max_n=10, pad=0): return ak.fill_none(ak.pad_none(ak_array, max_n, clip=True, axis=-1), pad).to_numpy() -def get_datasets(arrays): +def get_datasets(arrays, n_higgs): # noqa: C901 part_pid = arrays["Particle/Particle.PID"] # PDG ID part_m1 = arrays["Particle/Particle.M1"] # note: see some +/-15 PDG ID particles (taus) so h->tautau is turned on - # explicitly mask these events out, just keeping hhh6b events - condition_hhh6b = np.logical_and(np.abs(part_pid) == 5, part_pid[part_m1] == 25) - mask_hhh6b = ak.count(part_pid[condition_hhh6b], axis=-1) == 6 - part_pid = part_pid[mask_hhh6b] - part_pt = arrays["Particle/Particle.PT"][mask_hhh6b] - part_eta = arrays["Particle/Particle.Eta"][mask_hhh6b] - part_phi = arrays["Particle/Particle.Phi"][mask_hhh6b] - part_mass = arrays["Particle/Particle.Mass"][mask_hhh6b] - part_m1 = arrays["Particle/Particle.M1"][mask_hhh6b] - part_d1 = arrays["Particle/Particle.D1"][mask_hhh6b] + # explicitly mask these events out, just keeping hbb events + condition_hbb = np.logical_and(np.abs(part_pid) == 5, part_pid[part_m1] == 25) + mask_hbb = ak.count(part_pid[condition_hbb], axis=-1) == 2 * n_higgs + part_pid = part_pid[mask_hbb] + part_pt = arrays["Particle/Particle.PT"][mask_hbb] + part_eta = arrays["Particle/Particle.Eta"][mask_hbb] + part_phi = arrays["Particle/Particle.Phi"][mask_hbb] + part_mass = arrays["Particle/Particle.Mass"][mask_hbb] + part_m1 = arrays["Particle/Particle.M1"][mask_hbb] + part_d1 = arrays["Particle/Particle.D1"][mask_hbb] # small-radius jet info - pt = arrays["Jet/Jet.PT"][mask_hhh6b] - eta = arrays["Jet/Jet.Eta"][mask_hhh6b] - phi = arrays["Jet/Jet.Phi"][mask_hhh6b] - mass = arrays["Jet/Jet.Mass"][mask_hhh6b] - btag = arrays["Jet/Jet.BTag"][mask_hhh6b] - flavor = arrays["Jet/Jet.Flavor"][mask_hhh6b] + pt = arrays["Jet/Jet.PT"][mask_hbb] + eta = arrays["Jet/Jet.Eta"][mask_hbb] + phi = arrays["Jet/Jet.Phi"][mask_hbb] + mass = arrays["Jet/Jet.Mass"][mask_hbb] + btag = arrays["Jet/Jet.BTag"][mask_hbb] + flavor = arrays["Jet/Jet.Flavor"][mask_hbb] # large-radius jet info - fj_pt = arrays["FatJet/FatJet.PT"][mask_hhh6b] - fj_eta = arrays["FatJet/FatJet.Eta"][mask_hhh6b] - fj_phi = arrays["FatJet/FatJet.Phi"][mask_hhh6b] - fj_mass = arrays["FatJet/FatJet.Mass"][mask_hhh6b] - fj_sdp4 = arrays["FatJet/FatJet.SoftDroppedP4[5]"][mask_hhh6b] + fj_pt = arrays["FatJet/FatJet.PT"][mask_hbb] + fj_eta = arrays["FatJet/FatJet.Eta"][mask_hbb] + fj_phi = arrays["FatJet/FatJet.Phi"][mask_hbb] + fj_mass = arrays["FatJet/FatJet.Mass"][mask_hbb] + fj_sdp4 = arrays["FatJet/FatJet.SoftDroppedP4[5]"][mask_hbb] # first entry (i = 0) is the total SoftDropped Jet 4-momenta # from i = 1 to 4 are the pruned subjets 4-momenta fj_sdmass2 = ( fj_sdp4.fE[..., 0] ** 2 - fj_sdp4.fP.fX[..., 0] ** 2 - fj_sdp4.fP.fY[..., 0] ** 2 - fj_sdp4.fP.fZ[..., 0] ** 2 ) fj_sdmass = np.sqrt(np.maximum(fj_sdmass2, 0)) - fj_taus = arrays["FatJet/FatJet.Tau[5]"][mask_hhh6b] + fj_taus = arrays["FatJet/FatJet.Tau[5]"][mask_hbb] # just saving just tau21 and tau32, can save others if useful fj_tau21 = np.nan_to_num(fj_taus[..., 1] / fj_taus[..., 0], nan=-1) fj_tau32 = np.nan_to_num(fj_taus[..., 2] / fj_taus[..., 1], nan=-1) - fj_charge = arrays["FatJet/FatJet.Charge"][mask_hhh6b] - fj_ehadovereem = arrays["FatJet/FatJet.EhadOverEem"][mask_hhh6b] - fj_neutralenergyfrac = arrays["FatJet/FatJet.NeutralEnergyFraction"][mask_hhh6b] - fj_chargedenergyfrac = arrays["FatJet/FatJet.ChargedEnergyFraction"][mask_hhh6b] - fj_nneutral = arrays["FatJet/FatJet.NNeutrals"][mask_hhh6b] - fj_ncharged = arrays["FatJet/FatJet.NCharged"][mask_hhh6b] + fj_charge = arrays["FatJet/FatJet.Charge"][mask_hbb] + fj_ehadovereem = arrays["FatJet/FatJet.EhadOverEem"][mask_hbb] + fj_neutralenergyfrac = arrays["FatJet/FatJet.NeutralEnergyFraction"][mask_hbb] + fj_chargedenergyfrac = arrays["FatJet/FatJet.ChargedEnergyFraction"][mask_hbb] + fj_nneutral = arrays["FatJet/FatJet.NNeutrals"][mask_hbb] + fj_ncharged = arrays["FatJet/FatJet.NCharged"][mask_hbb] particles = ak.zip( { @@ -124,8 +122,9 @@ def get_datasets(arrays): matched_fj_idx = match_fjet_to_jet(fjets, jets, ak.ArrayBuilder()).snapshot() fj_higgs_idx = match_higgs_to_fjet(higgses, bquarks, fjets, ak.ArrayBuilder()).snapshot() - # keep events with >= MIN_JETS small-radius jets - mask_minjets = ak.num(pt[pt > MIN_JET_PT]) >= MIN_JETS + # keep events with >= min_jets small-radius jets + min_jets = 2 * n_higgs + mask_minjets = ak.num(pt[pt > MIN_JET_PT]) >= min_jets # sort by btag first, then pt sorted_by_pt = ak.argsort(pt, ascending=False, axis=-1) sorted = ak.concatenate([sorted_by_pt[btag == 1], sorted_by_pt[btag == 0]], axis=-1) @@ -165,21 +164,22 @@ def get_datasets(arrays): fj_ncharged = fj_ncharged[sorted_by_fj_pt][mask_minjets] fj_higgs_idx = fj_higgs_idx[sorted_by_fj_pt][mask_minjets] - # keep only top N_FJETS - fj_pt = fj_pt[:, :N_FJETS] - fj_eta = fj_eta[:, :N_FJETS] - fj_phi = fj_phi[:, :N_FJETS] - fj_mass = fj_mass[:, :N_FJETS] - fj_sdmass = fj_sdmass[:, :N_FJETS] - fj_tau21 = fj_tau21[:, :N_FJETS] - fj_tau32 = fj_tau32[:, :N_FJETS] - fj_charge = fj_charge[:, :N_FJETS] - fj_ehadovereem = fj_ehadovereem[:, :N_FJETS] - fj_neutralenergyfrac = fj_neutralenergyfrac[:, :N_FJETS] - fj_chargedenergyfrac = fj_chargedenergyfrac[:, :N_FJETS] - fj_nneutral = fj_nneutral[:, :N_FJETS] - fj_ncharged = fj_ncharged[:, :N_FJETS] - fj_higgs_idx = fj_higgs_idx[:, :N_FJETS] + # keep only top n_fjets + n_fjets = n_higgs + fj_pt = fj_pt[:, :n_fjets] + fj_eta = fj_eta[:, :n_fjets] + fj_phi = fj_phi[:, :n_fjets] + fj_mass = fj_mass[:, :n_fjets] + fj_sdmass = fj_sdmass[:, :n_fjets] + fj_tau21 = fj_tau21[:, :n_fjets] + fj_tau32 = fj_tau32[:, :n_fjets] + fj_charge = fj_charge[:, :n_fjets] + fj_ehadovereem = fj_ehadovereem[:, :n_fjets] + fj_neutralenergyfrac = fj_neutralenergyfrac[:, :n_fjets] + fj_chargedenergyfrac = fj_chargedenergyfrac[:, :n_fjets] + fj_nneutral = fj_nneutral[:, :n_fjets] + fj_ncharged = fj_ncharged[:, :n_fjets] + fj_higgs_idx = fj_higgs_idx[:, :n_fjets] # mask to define zero-padded small-radius jets mask = pt > MIN_JET_PT @@ -190,52 +190,57 @@ def get_datasets(arrays): # index of small-radius jet if Higgs is reconstructed h1_bs = ak.local_index(higgs_idx)[higgs_idx == 1] h2_bs = ak.local_index(higgs_idx)[higgs_idx == 2] - h3_bs = ak.local_index(higgs_idx)[higgs_idx == 3] + if n_higgs == 3: + h3_bs = ak.local_index(higgs_idx)[higgs_idx == 3] # index of large-radius jet if Higgs is reconstructed h1_bb = ak.local_index(fj_higgs_idx)[fj_higgs_idx == 1] h2_bb = ak.local_index(fj_higgs_idx)[fj_higgs_idx == 2] - h3_bb = ak.local_index(fj_higgs_idx)[fj_higgs_idx == 3] + if n_higgs == 3: + h3_bb = ak.local_index(fj_higgs_idx)[fj_higgs_idx == 3] # check/fix small-radius jet truth (ensure max 2 small-radius jets per higgs) - check = ( - np.unique(ak.count(h1_bs, axis=-1)).to_list() - + np.unique(ak.count(h2_bs, axis=-1)).to_list() - + np.unique(ak.count(h3_bs, axis=-1)).to_list() - ) + check = np.unique(ak.count(h1_bs, axis=-1)).to_list() + np.unique(ak.count(h2_bs, axis=-1)).to_list() + if n_higgs == 3: + check += np.unique(ak.count(h3_bs, axis=-1)).to_list() + if 3 in check: logging.warning("some Higgs bosons match to 3 small-radius jets! Check truth") # check/fix large-radius jet truth (ensure max 1 large-radius jet per higgs) - fj_check = ( - np.unique(ak.count(h1_bb, axis=-1)).to_list() - + np.unique(ak.count(h2_bb, axis=-1)).to_list() - + np.unique(ak.count(h3_bb, axis=-1)).to_list() - ) + fj_check = np.unique(ak.count(h1_bb, axis=-1)).to_list() + np.unique(ak.count(h2_bb, axis=-1)).to_list() + if n_higgs == 3: + fj_check += np.unique(ak.count(h3_bb, axis=-1)).to_list() + if 2 in fj_check: logging.warning("some Higgs bosons match to 2 large-radius jets! Check truth") h1_bs = ak.fill_none(ak.pad_none(h1_bs, 2, clip=True), -1) h2_bs = ak.fill_none(ak.pad_none(h2_bs, 2, clip=True), -1) - h3_bs = ak.fill_none(ak.pad_none(h3_bs, 2, clip=True), -1) + if n_higgs == 3: + h3_bs = ak.fill_none(ak.pad_none(h3_bs, 2, clip=True), -1) h1_bb = ak.fill_none(ak.pad_none(h1_bb, 1, clip=True), -1) h2_bb = ak.fill_none(ak.pad_none(h2_bb, 1, clip=True), -1) - h3_bb = ak.fill_none(ak.pad_none(h3_bb, 1, clip=True), -1) + if n_higgs == 3: + h3_bb = ak.fill_none(ak.pad_none(h3_bb, 1, clip=True), -1) h1_b1, h1_b2 = h1_bs[:, 0], h1_bs[:, 1] h2_b1, h2_b2 = h2_bs[:, 0], h2_bs[:, 1] - h3_b1, h3_b2 = h3_bs[:, 0], h3_bs[:, 1] + if n_higgs == 3: + h3_b1, h3_b2 = h3_bs[:, 0], h3_bs[:, 1] # mask whether Higgs can be reconstructed as 2 small-radius jet h1_mask = ak.all(h1_bs != -1, axis=-1) h2_mask = ak.all(h2_bs != -1, axis=-1) - h3_mask = ak.all(h3_bs != -1, axis=-1) + if n_higgs == 3: + h3_mask = ak.all(h3_bs != -1, axis=-1) # mask whether Higgs can be reconstructed as 1 large-radius jet h1_fj_mask = ak.all(h1_bb != -1, axis=-1) h2_fj_mask = ak.all(h2_bb != -1, axis=-1) - h3_fj_mask = ak.all(h3_bb != -1, axis=-1) + if n_higgs == 3: + h3_fj_mask = ak.all(h3_bb != -1, axis=-1) datasets = {} datasets["INPUTS/Jets/MASK"] = to_np_array(mask, max_n=N_JETS).astype("bool") @@ -249,22 +254,22 @@ def get_datasets(arrays): datasets["INPUTS/Jets/flavor"] = to_np_array(flavor, max_n=N_JETS).astype("float32") datasets["INPUTS/Jets/matchedfj"] = to_np_array(matched_fj_idx, max_n=N_JETS).astype("int32") - datasets["INPUTS/BoostedJets/MASK"] = to_np_array(fj_mask, max_n=N_FJETS).astype("bool") - datasets["INPUTS/BoostedJets/fj_pt"] = to_np_array(fj_pt, max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_eta"] = to_np_array(fj_eta, max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_phi"] = to_np_array(fj_phi, max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_sinphi"] = to_np_array(np.sin(fj_phi), max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_cosphi"] = to_np_array(np.cos(fj_phi), max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_mass"] = to_np_array(fj_mass, max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_sdmass"] = to_np_array(fj_sdmass, max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_tau21"] = to_np_array(fj_tau21, max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_tau32"] = to_np_array(fj_tau32, max_n=N_FJETS).astype("float32") - datasets["INPUTS/BoostedJets/fj_charge"] = to_np_array(fj_charge, max_n=N_FJETS) - datasets["INPUTS/BoostedJets/fj_ehadovereem"] = to_np_array(fj_ehadovereem, max_n=N_FJETS) - datasets["INPUTS/BoostedJets/fj_neutralenergyfrac"] = to_np_array(fj_neutralenergyfrac, max_n=N_FJETS) - datasets["INPUTS/BoostedJets/fj_chargedenergyfrac"] = to_np_array(fj_chargedenergyfrac, max_n=N_FJETS) - datasets["INPUTS/BoostedJets/fj_nneutral"] = to_np_array(fj_nneutral, max_n=N_FJETS) - datasets["INPUTS/BoostedJets/fj_ncharged"] = to_np_array(fj_ncharged, max_n=N_FJETS) + datasets["INPUTS/BoostedJets/MASK"] = to_np_array(fj_mask, max_n=n_fjets).astype("bool") + datasets["INPUTS/BoostedJets/fj_pt"] = to_np_array(fj_pt, max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_eta"] = to_np_array(fj_eta, max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_phi"] = to_np_array(fj_phi, max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_sinphi"] = to_np_array(np.sin(fj_phi), max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_cosphi"] = to_np_array(np.cos(fj_phi), max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_mass"] = to_np_array(fj_mass, max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_sdmass"] = to_np_array(fj_sdmass, max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_tau21"] = to_np_array(fj_tau21, max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_tau32"] = to_np_array(fj_tau32, max_n=n_fjets).astype("float32") + datasets["INPUTS/BoostedJets/fj_charge"] = to_np_array(fj_charge, max_n=n_fjets) + datasets["INPUTS/BoostedJets/fj_ehadovereem"] = to_np_array(fj_ehadovereem, max_n=n_fjets) + datasets["INPUTS/BoostedJets/fj_neutralenergyfrac"] = to_np_array(fj_neutralenergyfrac, max_n=n_fjets) + datasets["INPUTS/BoostedJets/fj_chargedenergyfrac"] = to_np_array(fj_chargedenergyfrac, max_n=n_fjets) + datasets["INPUTS/BoostedJets/fj_nneutral"] = to_np_array(fj_nneutral, max_n=n_fjets) + datasets["INPUTS/BoostedJets/fj_ncharged"] = to_np_array(fj_ncharged, max_n=n_fjets) datasets["TARGETS/h1/mask"] = h1_mask.to_numpy() datasets["TARGETS/h1/b1"] = h1_b1.to_numpy() @@ -274,9 +279,10 @@ def get_datasets(arrays): datasets["TARGETS/h2/b1"] = h2_b1.to_numpy() datasets["TARGETS/h2/b2"] = h2_b2.to_numpy() - datasets["TARGETS/h3/mask"] = h3_mask.to_numpy() - datasets["TARGETS/h3/b1"] = h3_b1.to_numpy() - datasets["TARGETS/h3/b2"] = h3_b2.to_numpy() + if n_higgs == 3: + datasets["TARGETS/h3/mask"] = h3_mask.to_numpy() + datasets["TARGETS/h3/b1"] = h3_b1.to_numpy() + datasets["TARGETS/h3/b2"] = h3_b2.to_numpy() datasets["TARGETS/bh1/mask"] = h1_fj_mask.to_numpy() datasets["TARGETS/bh1/bb"] = h1_bb.to_numpy().reshape(h1_fj_mask.to_numpy().shape) @@ -284,17 +290,29 @@ def get_datasets(arrays): datasets["TARGETS/bh2/mask"] = h2_fj_mask.to_numpy() datasets["TARGETS/bh2/bb"] = h2_bb.to_numpy().reshape(h2_fj_mask.to_numpy().shape) - datasets["TARGETS/bh3/mask"] = h3_fj_mask.to_numpy() - datasets["TARGETS/bh3/bb"] = h3_bb.to_numpy().reshape(h3_fj_mask.to_numpy().shape) + if n_higgs == 3: + datasets["TARGETS/bh3/mask"] = h3_fj_mask.to_numpy() + datasets["TARGETS/bh3/bb"] = h3_bb.to_numpy().reshape(h3_fj_mask.to_numpy().shape) return datasets @click.command() @click.argument("in-files", nargs=-1) -@click.option("--out-file", default=f"{PROJECT_DIR}/data/delphes/hhh_training.h5", help="Output file.") +@click.option( + "--out-file", + default=f"{PROJECT_DIR}/data/delphes/hhh_training.h5", + help="Output file.", +) @click.option("--train-frac", default=0.95, help="Fraction for training.") -def main(in_files, out_file, train_frac): +@click.option( + "--n-higgs", + "n_higgs", + default=3, + type=click.IntRange(2, 3), + help="Number of Higgs bosons per event", +) +def main(in_files, out_file, train_frac, n_higgs): all_datasets = {} for file_name in in_files: with uproot.open(file_name) as in_file: @@ -313,7 +331,7 @@ def main(in_files, out_file, train_frac): + [key for key in events.keys() if "FatJet/FatJet." in key and "fBits" not in key] ) arrays = events.arrays(keys, entry_start=entry_start, entry_stop=entry_stop) - datasets = get_datasets(arrays) + datasets = get_datasets(arrays, n_higgs) for dataset_name, data in datasets.items(): if dataset_name not in all_datasets: all_datasets[dataset_name] = [] diff --git a/src/models/test_baseline.py b/src/models/test_baseline.py index 14be37b..4dd7599 100644 --- a/src/models/test_baseline.py +++ b/src/models/test_baseline.py @@ -9,7 +9,7 @@ import vector from spanet.test import display_table, evaluate_predictions -from src.data.cms.convert_to_h5 import MIN_JETS, N_JETS +from src.data.cms.convert_to_h5 import N_JETS vector.register_awkward() @@ -17,18 +17,37 @@ HIGGS_MASS = 125.0 PROJECT_DIR = Path(__file__).resolve().parents[2] -# precompute possible jet assignments lookup table -JET_ASSIGNMENTS = {} -for nj in range(MIN_JETS, N_JETS + 1): - a = list(itertools.combinations(range(nj), 2)) - b = np.array([(i, j, k) for i, j, k in itertools.combinations(a, 3) if len(set(i + j + k)) == MIN_JETS]) - JET_ASSIGNMENTS[nj] = b @click.command() @click.option("--test-file", default=f"{PROJECT_DIR}/data/hhh_testing.h5", help="File for testing") @click.option("--event-file", default=f"{PROJECT_DIR}/event_files/cms/hhh.yaml", help="Event file") -def main(test_file, event_file): +@click.option( + "--n-higgs", + "n_higgs", + default=3, + type=click.IntRange(2, 3), + help="Number of Higgs bosons per event", +) +@click.option( + "--method", + default="standard", + type=click.Choice(["standard", "agnostic"]), + help="Baseline method to be tested", +) +def main(test_file, event_file, n_higgs, method): + # Checks to see if click flags are valid + if method == "agnostic" and n_higgs == 3: + raise ValueError("Invalid baseline method selected.") + + MIN_JETS = 2 * n_higgs + # compute possible jet assignments lookup table + JET_ASSIGNMENTS = {} + for nj in range(MIN_JETS, N_JETS + 1): + a = list(itertools.combinations(range(nj), 2)) + b = np.array([(i, j, k) for i, j, k in itertools.combinations(a, 3) if len(set(i + j + k)) == MIN_JETS]) + JET_ASSIGNMENTS[nj] = b + in_file = h5py.File(test_file) pt = ak.Array(in_file["INPUTS"]["Jets"]["pt"]) @@ -55,10 +74,43 @@ def main(test_file, event_file): ) # just consider top-6 jets - nj = 6 - mjj = (jets[:, JET_ASSIGNMENTS[nj][:, :, 0]] + jets[:, JET_ASSIGNMENTS[nj][:, :, 1]]).mass - chi2 = ak.sum(np.square(mjj - HIGGS_MASS), axis=-1) - chi2_argmin = ak.argmin(chi2, axis=-1) + nj = 2 * n_higgs + if method == "standard": + mjj = (jets[:, JET_ASSIGNMENTS[nj][:, :, 0]] + jets[:, JET_ASSIGNMENTS[nj][:, :, 1]]).mass + chi2 = ak.sum(np.square(mjj - HIGGS_MASS), axis=-1) + chi2_argmin = ak.argmin(chi2, axis=-1) + elif method == "agnostic": + k = 125 / 120 + + # implement algorithm from p.6 of https://cds.cern.ch/record/2771912/files/HIG-20-005-pas.pdf + # get array of dijets for each possible higgs combination + jj = jets[:, JET_ASSIGNMENTS[nj][:, :, 0]] + jets[:, JET_ASSIGNMENTS[nj][:, :, 1]] + mjj = jj.mass + mjj_sorted = ak.sort(mjj, ascending=False) + + # compute \delta d as defined in paper above + # and sort based on distance between first and second \delta d + delta_d = np.absolute(mjj_sorted[:, :, 0] - k * mjj_sorted[:, :, 1]) / (1 + k**2) + d_sorted = ak.sort(delta_d, ascending=False) + d_sep_mask = d_sorted[:, 0] - d_sorted[:, 1] > 30 + chi2_argmin = [] + + # get array of sum of pt of dijets in their own event CoM frame + com_pt = jj[:, :, 0].boostCM_of(jj[:, :, 0] + jj[:, :, 1]).pt + jj[:, :, 1].boostCM_of(jj[:, :, 0] + jj[:, :, 1]).pt + + # if \delta d separation is large, take event with smallest \delta d + # otherwise, take dijet combination with highest sum pt in their CoM frame + # that isn't the lowest \delta d separation + for i in range(len(d_sep_mask)): + if d_sep_mask[i]: + chi2_argmin.append(ak.argmin(delta_d[i], axis=-1)) + else: + if ak.argmin(delta_d[i], axis=-1) == ak.argmax(com_pt[i], axis=-1): + temp_arr = ak.to_numpy(com_pt[i]) + temp_arr[ak.argmax(com_pt[i])] = 0 + chi2_argmin.append(ak.argmax(temp_arr)) + else: + chi2_argmin.append(ak.argmax(com_pt[i], axis=-1)) h1_bs = np.concatenate( ( @@ -74,29 +126,45 @@ def main(test_file, event_file): ), axis=-1, ) - h3_bs = np.concatenate( - ( - np.array(in_file["TARGETS"]["h3"]["b1"])[:, np.newaxis], - np.array(in_file["TARGETS"]["h3"]["b2"])[:, np.newaxis], - ), - axis=-1, - ) - targets = [h1_bs, h2_bs, h3_bs] - - masks = np.concatenate( - ( - np.array(in_file["TARGETS"]["h1"]["mask"])[np.newaxis, :], - np.array(in_file["TARGETS"]["h2"]["mask"])[np.newaxis, :], - np.array(in_file["TARGETS"]["h3"]["mask"])[np.newaxis, :], - ), - axis=0, - ) - - predictions = [ - JET_ASSIGNMENTS[nj][chi2_argmin][:, 0, :], - JET_ASSIGNMENTS[nj][chi2_argmin][:, 1, :], - JET_ASSIGNMENTS[nj][chi2_argmin][:, 2, :], - ] + if n_higgs == 3: + h3_bs = np.concatenate( + ( + np.array(in_file["TARGETS"]["h3"]["b1"])[:, np.newaxis], + np.array(in_file["TARGETS"]["h3"]["b2"])[:, np.newaxis], + ), + axis=-1, + ) + targets = [h1_bs, h2_bs, h3_bs] + + masks = np.concatenate( + ( + np.array(in_file["TARGETS"]["h1"]["mask"])[np.newaxis, :], + np.array(in_file["TARGETS"]["h2"]["mask"])[np.newaxis, :], + np.array(in_file["TARGETS"]["h3"]["mask"])[np.newaxis, :], + ), + axis=0, + ) + + predictions = [ + JET_ASSIGNMENTS[nj][chi2_argmin][:, 0, :], + JET_ASSIGNMENTS[nj][chi2_argmin][:, 1, :], + JET_ASSIGNMENTS[nj][chi2_argmin][:, 2, :], + ] + else: + targets = [h1_bs, h2_bs] + + masks = np.concatenate( + ( + np.array(in_file["TARGETS"]["h1"]["mask"])[np.newaxis, :], + np.array(in_file["TARGETS"]["h2"]["mask"])[np.newaxis, :], + ), + axis=0, + ) + + predictions = [ + JET_ASSIGNMENTS[nj][chi2_argmin][:, 0, :], + JET_ASSIGNMENTS[nj][chi2_argmin][:, 1, :], + ] num_vectors = np.sum(mask, axis=-1).to_numpy() lines = 2