Source code for nnfwtbn.stack


import numpy as np
import dask.array as da
import dask.dataframe as dd

from nnfwtbn.variable import Variable
from nnfwtbn.cut import Cut
import nnfwtbn.error as err

[docs]def not_none_or_default(value, default): """ Returns the value if it is not None. Otherwise returns the default. """ if value is not None: return value return default
[docs]class Stack: """ This class represents a collection of Prcesses drawn as a stack in histograms created with hist(). The Stack class stores information about the plotting style (e.g. markersize, linestyle), the histogram type (step, stepfilled, points), the color wheel, and the method to compute the total uncertainty of the stack. A stack is not tied to a specific plot. It can be reused for plot with different binning, different variables or different selections. """
[docs] def __init__(self, *processes, histtype='stepfilled', data_uncertainty=False, palette=None, **aux): """ Creates a new stack and sets its default properties. If a process is added (via add_process()) to the stack without specifying a custom style, the defaults are used. The object is initialized with the processes passed to the method. """ self.processes = list(processes) self.histtypes = [None for _ in processes] self.data_uncertainties = [None for _ in processes] self.aux = [{} for _ in processes] self.default_aux = aux self.default_histtype = histtype self.default_data_uncertainty = data_uncertainty self.palette = palette
[docs] def add_process(self, process, histtype=None, data_uncertainty=None, **aux): """ Adds a new process to the stack. Arguments passed to this method the precedence over the default passed to the constructor. The process argument must be a Process object with information about the selection and the label in the legend. The histtype argument can take the value 'step', 'stepfilled', 'line', 'points'. This argument controls the type of the histogram. If the data_uncertainty is set to True, then the get_total_uncertainty() will return sqrt(get_total()). This method is useful when plotting Asimov data. If the option is False, the weights are used to compute the uncertainty. Additional keyword arguments are stored internally for the plotting method to be forwarded to matplotlib. """ self.processes.append(process) self.histtypes.append(histtype) self.data_uncertainties.append(data_uncertainty) self.aux.append(aux)
[docs] def get_hist(self, df, i, bins, variable, weight, include_outside=False): """ Returns the yields per bin for the i-th process in the stack. The bins argument specifies the bin edges. """ process = self.processes[i] df = process(df) variable = variable(df) weight = weight(df) if include_outside: bins = np.array(bins) low, high = dd.compute(variable.min(), variable.max()) bins[0] = min(low, bins[0]) bins[-1] = max(high, bins[-1]) if hasattr(variable, "to_dask_array"): # Assume Dask DataFrame variable = variable.to_dask_array() weight = weight.to_dask_array() func = da.histogram else: func = np.histogram total, _ = func(variable, bins=bins, weights=weight) return total
[docs] def get_total(self, df, bins, variable, weight, include_outside=False): """ Returns the sum of yields per bin of all processes. The bins argument specifies the bin edges. """ total = 0 for i in range(len(self.processes)): total += self.get_hist(df, i, bins, variable, weight, include_outside=include_outside) return total
[docs] def get_uncertainty(self, df, i, bins, variable, weight, include_outside=False): """ Returns the uncertainty of the total yield per bin. The bins argument specifies the bin edges. """ process = self.processes[i] df = process(df) weight_2 = Variable("$w^2$", lambda d: weight(d)**2) uncertainty_2 = 0 if self.is_data_uncertainty(i): uncertainty_2 += self.get_hist(df, i, bins, variable, weight, include_outside=include_outside) else: uncertainty_2 += self.get_hist(df, i, bins, variable, weight=weight_2, include_outside=include_outside) return np.sqrt(uncertainty_2)
[docs] def get_total_uncertainty(self, df, bins, variable, weight, include_outside=False): """ Returns the uncertainty of the total yield per bin. The bins argument specifies the bin edges. """ weight_2 = Variable("$w^2$", lambda d: weight(d)**2) uncertainty_2 = 0 for i in range(len(self.processes)): if self.is_data_uncertainty(i): uncertainty_2 += self.get_hist(df, i, bins, variable, weight, include_outside=include_outside) else: uncertainty_2 += self.get_hist(df, i, bins, variable, weight=weight_2, include_outside=include_outside) return np.sqrt(uncertainty_2)
[docs] def get_histtype(self, i): """ Return the histtype of process i. """ return not_none_or_default(self.histtypes[i], self.default_histtype)
[docs] def get_aux(self, i): """ Returns the auxiliary keyword arguments. The returned dict is a mix of the default keyword arguments updated by the ones used when adding a process. """ aux = {} aux.update(self.default_aux) aux.update(self.aux[i]) if self.palette is not None: aux.setdefault("color", self.palette[i % len(self.palette)]) return aux
[docs] def is_data_uncertainty(self, i): """ Returns True if process i uses data_uncertainties. """ return not_none_or_default(self.data_uncertainties[i], self.default_data_uncertainty)
[docs] def __len__(self): """ Returns the number of processes in this stack. """ return len(self.processes)
DEFAULT_VARIATION_VAR = 'variation' DEFAULT_NOMINAL = 'NOMINAL'
[docs]class SystStack(Stack):
[docs] def __init__(self, df, *args, **kwds): super().__init__(df, *args, **kwds) self.variation_var = DEFAULT_VARIATION_VAR self.skip_variations = {DEFAULT_NOMINAL} self.nominal = DEFAULT_NOMINAL
[docs] def get_total(self, df, *args, **kwds): nom_idx = (df[self.variation_var] == self.nominal) return super().get_total(df[nom_idx], *args, **kwds)
[docs] def get_hist(self, df, *args, **kwds): nom_idx = (df[self.variation_var] == self.nominal) return super().get_hist(df[nom_idx], *args, **kwds)
[docs] def get_stat_uncertainty(self, df, *args, **kwds): nom_idx = (df[self.variation_var] == self.nominal) return super().get_total_uncertainty(df[nom_idx], *args, **kwds)
[docs] def get_syst_uncertainty(self, df, bins, variable, weight, include_outside=False): nominal_hist = self.get_total(df, bins, variable, weight, include_outside=include_outside) envelop = [] for variation in df[self.variation_var].unique(): if variation in self.skip_variations: continue variation_hist = 0 # Can't use super().get_total because it falls back to this get_hist() for i in range(len(self.processes)): var_idx = (df[self.variation_var] == variation) variation_hist += super().get_hist(df[var_idx], i, bins, variable, weight, include_outside) variation_hist -= nominal_hist envelop.append(variation_hist) # envelop[variation] = variation_hist envelop = da.asarray(envelop) return da.sqrt((envelop**2).sum(axis=0))
[docs] def get_total_uncertainty(self, *args, **kwds): return da.sqrt( self.get_stat_uncertainty(*args, **kwds)**2 + self.get_syst_uncertainty(*args, **kwds)**2 )
[docs]class TruthStack(Stack):
[docs] def get_total_uncertainty(self, df, bins, *args, **kwds): return bins[:-1] * 0
[docs]class McStack(Stack): """ Short-hand class for a Stack with only Monte-Carlo-like processes. """
[docs] def __init__(self, *args, **kwds): if 'data_uncertainty' in kwds: raise TypeError("Cannot pass 'data_uncertainty' to McStack. " "Use generic Stack instead.") kwds['data_uncertainty'] = False kwds.setdefault("histtype", "stepfilled") super().__init__(*args, **kwds)
[docs] def add_process(self, *args, **kwds): if 'data_uncertainty' in kwds: raise TypeError("Cannot pass 'data_uncertainty' to McStack. " "Use generic Stack instead.") super().add_process(*args, **kwds)
[docs]class DataStack(Stack): """ Short-hand class for a Stack with only data-like processes. """
[docs] def __init__(self, *args, **kwds): if 'data_uncertainty' in kwds: raise TypeError("Cannot pass 'data_uncertainty' to DataStack. " "Use generic Stack instead.") kwds.setdefault("color", "black") kwds.setdefault("histtype", "points") kwds['data_uncertainty'] = True super().__init__(*args, **kwds)
[docs] def add_process(self, *args, **kwds): if 'data_uncertainty' in kwds: raise TypeError("Cannot pass 'data_uncertainty' to DataStack. " "Use generic Stack instead.") super().add_process(*args, **kwds)