"""
This module implements method to generate a deterministic, physics-inspired
toy dataset. The dataset is intended for documentations and examples. The
module does not rely on external random number generators (seeding numpy
might break user code).
"""
import math
import numpy as np
from scipy.integrate import quad
from scipy.interpolate import interp1d
import pandas as pd
import dask.dataframe as dd
from pylorentz import Momentum4
def draw(rng, pdf, size=1, lower=0, upper=1, N=100):
    """
    Draw a size-shaped random sample from the given PDF via inverse
    transform sampling.

    The PDF must be normalized to unity within the given limits.

    Parameters
    ----------
    rng : random generator exposing ``uniform(size=...)``
    pdf : callable evaluated at scalars in ``[lower, upper]``
    size : int or tuple of int, shape of the returned sample
    lower, upper : float, support boundaries of the PDF
    N : int, number of grid points used to tabulate the CDF

    Returns
    -------
    numpy.ndarray of shape ``size`` with values in ``[lower, upper]``
    """
    grid = np.linspace(lower, upper, N)
    # Tabulate the CDF on the grid by numerical integration.
    cdf = np.array([quad(pdf, lower, x)[0] for x in grid])
    # Invert the CDF by interpolation.  Probabilities falling outside the
    # tabulated CDF range (possible through small quadrature errors) are
    # clamped to the support boundaries -- previously this was hard-coded
    # to (0, 1), which is wrong whenever lower/upper differ from 0/1.
    inv_cdf = interp1d(cdf, grid, bounds_error=False,
                       fill_value=(lower, upper))
    ys = rng.uniform(size=size)
    return inv_cdf(ys)
# Module-level RandomState with a fixed seed: keeps the generated toy data
# deterministic across runs without touching numpy's global random state
# (see module docstring).  Do not change the seed -- it defines the dataset.
rng = np.random.RandomState(2221006818) # random seed
def augment(point):
    """
    Extend a raw 15-component phase-space point with derived quantities.

    The incoming values are passed through unchanged; appended are the
    Higgs-candidate kinematics (built from tau + lepton + MET), the
    di-jet invariant mass and the lepton/tau centralities.  The trailing
    random value stays last.
    """
    (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi,
     met_phi, met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta,
     lep_pt, random_value) = point

    # Four-vectors; masses fixed at 0 (jets), 1.8 (tau), 0.1 (lepton).
    jet_1 = Momentum4.m_eta_phi_pt(0, jet_1_eta, jet_1_phi, jet_1_pt)
    jet_2 = Momentum4.m_eta_phi_pt(0, jet_2_eta, jet_2_phi, jet_2_pt)
    tau = Momentum4.m_eta_phi_pt(1.8, tau_eta, tau_phi, tau_pt)
    lep = Momentum4.m_eta_phi_pt(0.1, lep_eta, lep_phi, lep_pt)
    met = Momentum4.m_eta_phi_pt(0.0, 0, met_phi, met_pt)

    def centrality(eta):
        # NOTE(review): the inner term multiplies eta by the jet-eta
        # midpoint; the conventional centrality definition subtracts it,
        # i.e. (eta - (eta_1 + eta_2) / 2).  Confirm this is intentional
        # before "fixing" -- changing it alters the deterministic dataset.
        return np.exp(- 4 / (jet_1_eta - jet_2_eta)**2
                      * (eta * (jet_1_eta + jet_2_eta)/2)**2)

    lep_centrality = centrality(lep_eta)
    tau_centrality = centrality(tau_eta)
    higgs = tau + lep + met
    dijet = jet_1 + jet_2
    return (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi,
            met_phi, met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta,
            lep_pt, higgs.p_t, higgs.eta, higgs.phi, higgs.m, dijet.m,
            lep_centrality, tau_centrality, random_value)
def generate(total, vbfh_frac=0.2, shuffle=True):
    """
    Produce a toy dataset of ``total`` events as a pandas DataFrame.

    A fraction ``vbfh_frac`` (rounded up) of the events is sampled from
    the VBFH density (fpid=1), the rest from the Ztt density (fpid=0);
    every event carries unit weight.  With ``shuffle`` the two samples
    are randomly interleaved using the module-level RandomState.
    """
    n_signal = math.ceil(vbfh_frac * total)
    # The signal chain runs first: keep this order, it fixes the
    # deterministic random stream consumed by the sampler.
    signal = mcmc(n_signal, vbfh_pdf)
    signal['fpid'] = 1
    signal['weight'] = 1
    background = mcmc(total - n_signal, ztt_pdf)
    background['fpid'] = 0
    background['weight'] = 1
    combined = pd.concat([signal, background])
    if shuffle:
        combined = combined.sample(frac=1, random_state=rng)
    combined.reset_index(drop=True, inplace=True)
    return combined
def vbfh_pdf(point):
    """
    Returns the relative probability density of the VBF Higgs toy process
    at the given point. The function is not properly normalized. The
    outer dimension of the point contains the following values:
    - jet_1_pt
    - jet_1_eta
    - jet_1_phi
    - jet_2_pt
    - jet_2_eta
    - jet_2_phi
    - met_phi
    - met_pt
    - tau_phi
    - tau_eta
    - tau_pt
    - lep_phi
    - lep_eta
    - lep_pt
    - random value
    """
    (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi,
     met_phi, met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta,
     lep_pt, higgs_pt, higgs_eta, higgs_phi, higgs_m, m_jj,
     lep_centrality, tau_centrality, random_value) = augment(point)
    factors = [
        # jet system: ordered, forward jets with falling pt spectra
        (jet_1_pt > jet_2_pt),
        abs(jet_1_eta - jet_2_eta),
        abs(jet_1_eta) * np.exp(-abs(jet_1_eta)**2 / 9),
        abs(jet_2_eta) * np.exp(-abs(jet_2_eta)**2 / 9),
        (jet_1_pt > 25),
        (jet_2_pt > 25),
        1 / (jet_1_pt),
        1 / (jet_2_pt),
        # Higgs system: mass peak at 125 with width 15
        np.exp(-0.5 * (higgs_m - 125)**2 / 15**2),
        # NOTE(review): this m_jj sigmoid appears twice in a row, i.e. it
        # enters squared -- possibly a copy-paste.  Left as-is because
        # removing one factor would change the deterministic dataset.
        1 / (1 + np.exp((m_jj-1000) / 500)),
        1 / (1 + np.exp((m_jj-1000) / 500)),
        # object-level thresholds and falling pt / MET spectra
        (met_pt > 0),
        (lep_pt > 10),
        (tau_pt > 20),
        np.exp(-lep_pt / 180),
        np.exp(-tau_pt / 180),
        np.exp(-met_pt / 50),
        # centralities restricted to [0, 1), mildly favoring central values
        (lep_centrality >= 0),
        (lep_centrality < 1),
        lep_centrality / 2 + 0.5,
        (tau_centrality >= 0),
        (tau_centrality < 1),
        tau_centrality / 2 + 0.5,
    ]
    # Sequential product, same left-to-right order as the original chain.
    density = 1.
    for factor in factors:
        density *= factor
    return density
def ztt_pdf(point):
    """
    Returns the relative probability density of the Z->tautau toy
    background at the given point.  Not normalized; the point layout is
    the same 15-entry tuple documented in ``vbfh_pdf``.
    """
    (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi,
     met_phi, met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta,
     lep_pt, higgs_pt, higgs_eta, higgs_phi, higgs_m, m_jj,
     lep_centrality, tau_centrality, random_value) = augment(point)
    factors = [
        # jet system: central jets with softer pt spectra than the signal
        (jet_1_pt > jet_2_pt),
        np.exp(-abs(jet_1_eta)**2 / 9),
        np.exp(-abs(jet_2_eta)**2 / 9),
        (jet_1_pt > 25),
        (jet_2_pt > 25),
        np.exp(-jet_1_pt / 100),
        np.exp(-jet_2_pt / 60),
        # Z system: mass peak at 90 with width 10
        np.exp(-0.5 * (higgs_m - 90)**2 / 10**2),
        # object-level thresholds and falling pt / MET spectra
        (met_pt > 0),
        (lep_pt > 10),
        (tau_pt > 20),
        np.exp(-lep_pt / 150),
        np.exp(-tau_pt / 150),
        np.exp(-met_pt / 50),
        # centralities restricted to [0, 1), mildly disfavoring central values
        (lep_centrality >= 0),
        (lep_centrality < 1),
        (1 - lep_centrality / 2),
        (tau_centrality >= 0),
        (tau_centrality < 1),
        (1 - tau_centrality / 2),
        # falling di-jet mass spectrum
        np.exp(-m_jj / 200),
    ]
    # Sequential product, same left-to-right order as the original chain.
    density = 1.
    for factor in factors:
        density *= factor
    return density
def proposal(point):
    """
    Draw a candidate point for the Metropolis sampler.

    Transverse momenta receive independent Gaussian jitter; all
    pseudorapidities are jittered and then flipped by one common random
    sign; all azimuthal angles are jittered around a common random
    rotation (exploiting the symmetries of the target densities) and
    wrapped into [0, 2*pi).  The trailing random value is redrawn.

    The exact sequence of ``rng`` calls is preserved -- it defines the
    deterministic dataset.
    """
    (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta, jet_2_phi,
     met_phi, met_pt, tau_phi, tau_eta, tau_pt, lep_phi, lep_eta,
     lep_pt, random_value) = point
    # transverse momenta: independent Gaussian steps (unpacking the
    # generator evaluates left to right, keeping the rng call order)
    jet_1_pt, jet_2_pt, tau_pt, lep_pt, met_pt = (
        rng.normal(pt, 10)
        for pt in (jet_1_pt, jet_2_pt, tau_pt, lep_pt, met_pt))
    # pseudorapidities: Gaussian step plus one shared sign flip
    flip = rng.choice([1, -1])
    jet_1_eta, jet_2_eta, tau_eta, lep_eta = (
        rng.normal(eta, 0.2) * flip
        for eta in (jet_1_eta, jet_2_eta, tau_eta, lep_eta))
    # azimuthal angles: Gaussian step plus one shared rotation
    rot = rng.uniform(0, 2 * np.pi)
    jet_1_phi, jet_2_phi, tau_phi, lep_phi, met_phi = (
        rng.normal(phi + rot, 0.1) % (2 * np.pi)
        for phi in (jet_1_phi, jet_2_phi, tau_phi, lep_phi, met_phi))
    random_value = rng.random()
    return (jet_1_pt, jet_1_eta, jet_1_phi, jet_2_pt, jet_2_eta,
            jet_2_phi, met_phi, met_pt, tau_phi, tau_eta, tau_pt,
            lep_phi, lep_eta, lep_pt, random_value)
def mcmc_step(x, pdf):
    """
    Perform a single step of Markov chain Monte Carlo.

    Proposes candidates until one is accepted and returns it.  The
    proposal is symmetric, so the Metropolis acceptance ratio reduces to
    pdf(candidate) / pdf(current).
    """
    # pdf(x) is invariant while we search for an accepted candidate, so
    # evaluate it once instead of once per rejected proposal (the pdfs
    # are deterministic, so this cannot change the result).
    p_x = pdf(x)
    while True:
        new_x = proposal(x)
        alpha = pdf(new_x) / p_x
        u = rng.random()
        if u <= alpha:
            return new_x
def mcmc(length, pdf):
    """
    Generate ``length`` samples from ``pdf`` with a Metropolis sampler.

    The chain starts from a fixed point, discards 1000 burn-in steps to
    forget it, then records one augmented point per step into a pandas
    DataFrame.
    """
    point = (100, 1, 1, 50, 2, 1, 1, 1, 1, 1, 40, 1, 1, 30, 0)
    # burn-in: walk the chain away from the arbitrary starting point
    for _ in range(1000):
        point = mcmc_step(point, pdf)
    rows = []
    for _ in range(length):
        point = mcmc_step(point, pdf)
        rows.append(augment(point))
    columns = ["jet_1_pt", "jet_1_eta", "jet_1_phi",
               "jet_2_pt", "jet_2_eta", "jet_2_phi",
               "met_phi", "met_pt",
               "tau_phi", "tau_eta", "tau_pt",
               "lep_phi", "lep_eta", "lep_pt",
               "higgs_pt", "higgs_eta", "higgs_phi", "higgs_m",
               "m_jj", "lep_centrality", "tau_centrality", "random"]
    return pd.DataFrame(rows, columns=columns)
def get():
    """
    Return the toy dataset as a dask DataFrame.

    Reads the cached HDF5 file ``.toydata.h5`` when present; otherwise
    generates 10000 events, writes the cache and re-reads it through
    dask so both paths return the same kind of object.
    """
    try:
        return dd.read_hdf(".toydata.h5", "main")
    except Exception:
        # Best-effort cache: any read failure triggers regeneration.
        print("Cannot find cached data. Recreating toy data. This might take "
              "some time...")
        frame = generate(10000)
        frame.to_hdf(".toydata.h5", "main", format="table")
        return dd.read_hdf(".toydata.h5", "main")