# Source code for nnfwtbn.toydata

"""
This module implements method to generate a deterministic, physics-inspired
toy dataset. The dataset is intended for documentations and examples. The
module does not rely on external random number generators (seeding numpy
might break user code).
"""

import math

import numpy as np
from scipy.integrate import quad
from scipy.interpolate import interp1d
import pandas as pd
import dask.dataframe as dd

from pylorentz import Momentum4

def draw(rng, pdf, size=1, lower=0, upper=1, N=100):
    """
    Draw a size-shaped random sample from the given PDF.

    The PDF must be normalized to unity within the given limits. Sampling
    uses inverse transform sampling: the CDF is evaluated numerically on a
    grid of N points between lower and upper, and then inverted by linear
    interpolation.

    Parameters
    ----------
    rng : numpy.random.RandomState
        Source of uniform random numbers.
    pdf : callable
        One-dimensional probability density function.
    size : int or tuple of int
        Shape of the returned sample.
    lower, upper : float
        Support of the PDF.
    N : int
        Number of grid points used to tabulate the CDF.
    """
    grid = np.linspace(lower, upper, N)
    cdf = np.array([quad(pdf, lower, x)[0] for x in grid])
    # Bug fix: out-of-range CDF values must clamp to the actual integration
    # limits. The previous fill_value=(0, 1) was only correct for the
    # default lower=0, upper=1.
    inv_cdf = interp1d(cdf, grid, bounds_error=False,
                       fill_value=(lower, upper))
    ys = rng.uniform(size=size)
    return inv_cdf(ys)
# Module-wide random number generator with a fixed seed so that the
# generated toy dataset is deterministic across runs without touching
# numpy's global random state.
rng = np.random.RandomState(2221006818) # random seed
def augment(point):
    """
    Augment a raw 15-component phase-space point with derived quantities.

    The input tuple holds jet, MET, tau and lepton kinematics plus a random
    value (see vbfh_pdf for the ordering). The returned tuple repeats the
    inputs and appends: Higgs-candidate pt/eta/phi/mass (tau + lep + met),
    the dijet invariant mass, the lepton and tau centralities, and the
    random value.
    """
    jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi, met_phi, \
        met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta, lep_pt, \
        random_value = point

    # Massless four-vector hypotheses for the jets and MET; fixed masses of
    # 1.8 for the tau and 0.1 for the lepton -- presumably GeV, TODO confirm.
    jet_1 = Momentum4.m_eta_phi_pt(0, jet_1_eta, jet_1_phi, jet_1_pt)
    jet_2 = Momentum4.m_eta_phi_pt(0, jet_2_eta, jet_2_phi, jet_2_pt)

    tau = Momentum4.m_eta_phi_pt(1.8, tau_eta, tau_phi, tau_pt)
    lep = Momentum4.m_eta_phi_pt(0.1, lep_eta, lep_phi, lep_pt)
    met = Momentum4.m_eta_phi_pt(0.0, 0, met_phi, met_pt)

    # NOTE(review): the conventional lepton centrality uses the *difference*
    # (lep_eta - (jet_1_eta + jet_2_eta) / 2); here the lepton eta is
    # *multiplied* by the dijet midpoint -- confirm this is intended.
    lep_centrality = np.exp(- 4 / (jet_1_eta - jet_2_eta)**2
                            * (lep_eta * (jet_1_eta + jet_2_eta)/2)**2)
    # Same hedged note applies to the tau centrality below.
    tau_centrality = np.exp(- 4 / (jet_1_eta - jet_2_eta)**2
                            * (tau_eta * (jet_1_eta + jet_2_eta)/2)**2)

    # Higgs candidate is reconstructed from the visible tau/lep plus MET.
    higgs = tau + lep + met

    return jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi, met_phi, \
        met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta, lep_pt, \
        higgs.p_t, higgs.eta, higgs.phi, higgs.m, (jet_1 + jet_2).m, \
        lep_centrality, tau_centrality, random_value
def generate(total, vbfh_frac=0.2, shuffle=True):
    """
    Create a toy dataset with the given total number of events.

    A fraction vbfh_frac of the events is drawn from the VBF Higgs density
    (fpid=1), the remainder from the Z->tautau density (fpid=0). All events
    carry unit weight. If shuffle is True, signal and background rows are
    interleaved randomly; the index is reset in any case.
    """
    n_signal = math.ceil(vbfh_frac * total)
    # Order matters: the signal chain is run before the background chain so
    # that the shared module-level rng is consumed in a fixed sequence.
    frames = []
    for count, pdf, label in ((n_signal, vbfh_pdf, 1),
                              (total - n_signal, ztt_pdf, 0)):
        frame = mcmc(count, pdf)
        frame['fpid'] = label
        frame['weight'] = 1
        frames.append(frame)

    df = pd.concat(frames)
    if shuffle:
        df = df.sample(frac=1, random_state=rng)
    df.reset_index(drop=True, inplace=True)
    return df
def vbfh_pdf(point):
    """
    Return the relative probability density of the VBF Higgs process at
    the given point. The function is not properly normalized.

    The outer dimension of the point contains the following values:

     - jet_1_pt
     - jet_1_eta
     - jet_1_phi
     - jet_2_pt
     - jet_2_eta
     - jet_2_phi
     - met_phi
     - met_pt
     - tau_phi
     - tau_eta
     - tau_pt
     - lep_phi
     - lep_eta
     - lep_pt
     - random value
    """
    (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi, met_phi,
     met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta, lep_pt,
     higgs_pt, higgs_eta, higgs_phi, higgs_m, m_jj, lep_centrality,
     tau_centrality, random_value) = augment(point)

    factors = [
        # jet system: pt-ordered, forward jets, minimum-pt cuts, soft spectra
        jet_1_pt > jet_2_pt,
        abs(jet_1_eta - jet_2_eta),
        abs(jet_1_eta) * np.exp(-abs(jet_1_eta)**2 / 9),
        abs(jet_2_eta) * np.exp(-abs(jet_2_eta)**2 / 9),
        jet_1_pt > 25,
        jet_2_pt > 25,
        1 / (jet_1_pt),
        1 / (jet_2_pt),
        # Higgs system: mass peak at 125, sigmoid suppression of large m_jj
        np.exp(-0.5 * (higgs_m - 125)**2 / 15**2),
        # NOTE(review): this sigmoid factor appears twice in the original,
        # squaring the suppression -- kept verbatim to preserve behaviour.
        1 / (1 + np.exp((m_jj-1000) / 500)),
        1 / (1 + np.exp((m_jj-1000) / 500)),
        met_pt > 0,
        lep_pt > 10,
        tau_pt > 20,
        np.exp(-lep_pt / 180),
        np.exp(-tau_pt / 180),
        np.exp(-met_pt / 50),
        # centralities: restricted to [0, 1), mildly favoring central leptons
        lep_centrality >= 0,
        lep_centrality < 1,
        lep_centrality / 2 + 0.5,
        tau_centrality >= 0,
        tau_centrality < 1,
        tau_centrality / 2 + 0.5,
    ]

    density = 1.
    for factor in factors:
        density *= factor
    return density
def ztt_pdf(point):
    """
    Return the relative probability density of the Z->tautau background at
    the given point. The function is not properly normalized; the point
    layout is the same as for vbfh_pdf.
    """
    (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi, met_phi,
     met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta, lep_pt,
     higgs_pt, higgs_eta, higgs_phi, higgs_m, m_jj, lep_centrality,
     tau_centrality, random_value) = augment(point)

    factors = [
        # jet system: pt-ordered, central jets, minimum-pt cuts, falling pt
        jet_1_pt > jet_2_pt,
        np.exp(-abs(jet_1_eta)**2 / 9),
        np.exp(-abs(jet_2_eta)**2 / 9),
        jet_1_pt > 25,
        jet_2_pt > 25,
        np.exp(-jet_1_pt / 100),
        np.exp(-jet_2_pt / 60),
        # Z system: di-tau mass peak near the Z mass
        np.exp(-0.5 * (higgs_m - 90)**2 / 10**2),
        met_pt > 0,
        lep_pt > 10,
        tau_pt > 20,
        np.exp(-lep_pt / 150),
        np.exp(-tau_pt / 150),
        np.exp(-met_pt / 50),
        # centralities: restricted to [0, 1), mildly disfavoring central
        lep_centrality >= 0,
        lep_centrality < 1,
        1 - lep_centrality / 2,
        tau_centrality >= 0,
        tau_centrality < 1,
        1 - tau_centrality / 2,
        # falling dijet mass spectrum
        np.exp(-m_jj / 200),
    ]

    density = 1.
    for factor in factors:
        density *= factor
    return density
def proposal(point):
    """
    Draw a new candidate point for the Markov chain from the current one.

    Transverse momenta are smeared with Gaussians of width 10, the
    pseudorapidities with width 0.2 (with an optional global sign flip that
    mirrors the whole event), and the azimuthal angles with width 0.1 after
    a common random rotation. The random value is redrawn uniformly.
    """
    (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi, met_phi,
     met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta, lep_pt,
     random_value) = point

    # Smear all transverse momenta independently.
    jet_1_pt = rng.normal(jet_1_pt, 10)
    jet_2_pt = rng.normal(jet_2_pt, 10)
    tau_pt = rng.normal(tau_pt, 10)
    lep_pt = rng.normal(lep_pt, 10)
    met_pt = rng.normal(met_pt, 10)

    # Smear the pseudorapidities; the common sign flip exploits the
    # forward/backward symmetry of the event.
    sign = rng.choice([1, -1])
    jet_1_eta = sign * rng.normal(jet_1_eta, 0.2)
    jet_2_eta = sign * rng.normal(jet_2_eta, 0.2)
    tau_eta = sign * rng.normal(tau_eta, 0.2)
    lep_eta = sign * rng.normal(lep_eta, 0.2)

    # Smear the azimuthal angles; the common rotation exploits the
    # rotational symmetry of the event around the beam axis.
    full_turn = 2 * np.pi
    rotation = rng.uniform(0, full_turn)
    jet_1_phi = rng.normal(jet_1_phi + rotation, 0.1) % full_turn
    jet_2_phi = rng.normal(jet_2_phi + rotation, 0.1) % full_turn
    tau_phi = rng.normal(tau_phi + rotation, 0.1) % full_turn
    lep_phi = rng.normal(lep_phi + rotation, 0.1) % full_turn
    met_phi = rng.normal(met_phi + rotation, 0.1) % full_turn

    random_value = rng.random()

    return (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi,
            met_phi, met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta,
            lep_pt, random_value)
def mcmc_step(x, pdf):
    """
    Advance the Markov chain by one accepted Metropolis step.

    Candidate points are drawn from proposal() until one is accepted with
    probability pdf(candidate) / pdf(x); the accepted point is returned.
    """
    # pdf is deterministic, so the density at the current point can be
    # evaluated once outside the rejection loop.
    current_density = pdf(x)
    while True:
        candidate = proposal(x)
        acceptance = pdf(candidate) / current_density
        if rng.random() <= acceptance:
            return candidate
def mcmc(length, pdf):
    """
    Sample length-many points from pdf with Markov chain Monte Carlo.

    The chain starts from a fixed reference point and is burned in for
    1000 steps before any point is recorded. Every recorded point is
    augmented with the derived quantities and returned as a row of the
    resulting dataframe.
    """
    state = 100, 1, 1, 50, 2, 1, 1, 1, 1, 1, 40, 1, 1, 30, 0

    # Burn-in: let the chain forget the arbitrary starting point.
    for _ in range(1000):
        state = mcmc_step(state, pdf)

    rows = []
    for _ in range(length):
        state = mcmc_step(state, pdf)
        rows.append(augment(state))

    return pd.DataFrame(rows,
                        columns=["jet_1_pt", "jet_1_eta", "jet_1_phi",
                                 "jet_2_pt", "jet_2_eta", "jet_2_phi",
                                 "met_phi", "met_pt", "tau_phi", "tau_eta",
                                 "tau_pt", "lep_phi", "lep_eta", "lep_pt",
                                 "higgs_pt", "higgs_eta", "higgs_phi",
                                 "higgs_m", "m_jj", "lep_centrality",
                                 "tau_centrality", "random"])
def get():
    """
    Return the toy dataset as a dask dataframe, creating it on first use.

    The dataset is cached in the HDF5 file .toydata.h5 in the current
    working directory; any failure to read the cache triggers a full
    regeneration of 10000 events.
    """
    cache_path = ".toydata.h5"
    key = "main"
    try:
        return dd.read_hdf(cache_path, key)
    except Exception:
        print("Cannot find cached data. Recreating toy data. This might take "
              "some time...")
        frame = generate(10000)
        frame.to_hdf(cache_path, key, format="table")
        return dd.read_hdf(cache_path, key)