Source code for nnfwtbn.stack


import numpy as np
import dask.array as da
import dask.dataframe as dd

from nnfwtbn.variable import Variable
from nnfwtbn.cut import Cut
import nnfwtbn.error as err

[docs]def not_none_or_default(value, default):
    """
    Returns the value if it is not None. Otherwise returns the default.
    """
    if value is not None:
        return value
    return default

[docs]class Stack:
    """
    This class represents a collection of Prcesses drawn as a stack in
    histograms created with hist(). The Stack class stores information about
    the plotting style (e.g. markersize, linestyle), the histogram type
    (step, stepfilled, points), the color wheel, and the method to compute
    the total uncertainty of the stack.

    A stack is not tied to a specific plot. It can be reused for plot with
    different binning, different variables or different selections.
    """

[docs]    def __init__(self, *processes, histtype='stepfilled',
                 data_uncertainty=False, palette=None, **aux):
        """
        Creates a new stack and sets its default properties. If a process is
        added (via add_process()) to the stack without specifying a custom style, the
        defaults are used.

        The object is initialized with the processes passed to the method.
        """
        self.processes = list(processes)
        self.histtypes = [None for _ in processes]
        self.data_uncertainties = [None for _ in processes]
        self.aux = [{} for _ in processes]

        self.default_aux = aux
        self.default_histtype = histtype
        self.default_data_uncertainty = data_uncertainty
        self.palette = palette

[docs]    def add_process(self, process, histtype=None, data_uncertainty=None,
            **aux):
        """
        Adds a new process to the stack. Arguments passed to this method the
        precedence over the default passed to the constructor.
        
        The process argument must be a Process object with information about
        the selection and the label in the legend. The histtype argument can
        take the value 'step', 'stepfilled', 'line', 'points'. This argument
        controls the type of the histogram. If the data_uncertainty is set to
        True, then the get_total_uncertainty() will return sqrt(get_total()).
        This method is useful when plotting Asimov data. If the option is
        False, the weights are used to compute the uncertainty.

        Additional keyword arguments are stored internally for the plotting
        method to be forwarded to matplotlib.
        """
        self.processes.append(process)
        self.histtypes.append(histtype)
        self.data_uncertainties.append(data_uncertainty)
        self.aux.append(aux)

[docs]    def get_hist(self, df, i, bins, variable, weight, include_outside=False):
        """
        Returns the yields per bin for the i-th process in the stack. The bins
        argument specifies the bin edges.
        """
        process = self.processes[i]
        df = process(df)

        variable = variable(df)
        weight = weight(df)

        if include_outside:
            bins = np.array(bins)
            low, high = dd.compute(variable.min(), variable.max())
            bins[0] = min(low, bins[0])
            bins[-1] = max(high, bins[-1])

        if hasattr(variable, "to_dask_array"):
            # Assume Dask DataFrame
            variable = variable.to_dask_array()
            weight = weight.to_dask_array()
            func = da.histogram
        else:
            func = np.histogram

        total, _ = func(variable, bins=bins, weights=weight)
        return total

[docs]    def get_total(self, df, bins, variable, weight, include_outside=False):
        """
        Returns the sum of yields per bin of all processes. The bins argument
        specifies the bin edges.
        """
        total = 0
        for i in range(len(self.processes)):
            total += self.get_hist(df, i, bins, variable, weight,
                                   include_outside=include_outside)

        return total

[docs]    def get_uncertainty(self, df, i, bins, variable, weight,
                              include_outside=False):
        """
        Returns the uncertainty of the total yield per bin. The bins argument
        specifies the bin edges.
        """
        process = self.processes[i]
        df = process(df)

        weight_2 = Variable("$w^2$", lambda d: weight(d)**2)
        uncertainty_2 = 0

        if self.is_data_uncertainty(i):
            uncertainty_2 += self.get_hist(df, i, bins, variable, weight,
                                           include_outside=include_outside)
        else:
            uncertainty_2 += self.get_hist(df, i, bins, variable,
                                           weight=weight_2,
                                           include_outside=include_outside)

        return np.sqrt(uncertainty_2)

[docs]    def get_total_uncertainty(self, df, bins, variable, weight,
                              include_outside=False):
        """
        Returns the uncertainty of the total yield per bin. The bins argument
        specifies the bin edges.
        """
        weight_2 = Variable("$w^2$", lambda d: weight(d)**2)
        uncertainty_2 = 0

        for i in range(len(self.processes)):
            if self.is_data_uncertainty(i):
                uncertainty_2 += self.get_hist(df, i, bins, variable, weight,
                                               include_outside=include_outside)
            else:
                uncertainty_2 += self.get_hist(df, i, bins, variable,
                                               weight=weight_2,
                                               include_outside=include_outside)

        return np.sqrt(uncertainty_2)


[docs]    def get_histtype(self, i):
        """
        Return the histtype of process i.
        """
        return not_none_or_default(self.histtypes[i], self.default_histtype)

[docs]    def get_aux(self, i):
        """
        Returns the auxiliary keyword arguments. The returned dict is a mix of
        the default keyword arguments updated by the ones used when adding a
        process.
        """
        aux = {}
        aux.update(self.default_aux)
        aux.update(self.aux[i])

        if self.palette is not None:
            aux.setdefault("color", self.palette[i % len(self.palette)])

        return aux

[docs]    def is_data_uncertainty(self, i):
        """
        Returns True if process i uses data_uncertainties.
        """
        return not_none_or_default(self.data_uncertainties[i],
                                   self.default_data_uncertainty)

[docs]    def __len__(self):
        """
        Returns the number of processes in this stack.
        """
        return len(self.processes)


DEFAULT_VARIATION_VAR = 'variation'
DEFAULT_NOMINAL = 'NOMINAL'

[docs]class SystStack(Stack):
[docs]    def __init__(self, df, *args, **kwds):
        super().__init__(df, *args, **kwds)
        self.variation_var = DEFAULT_VARIATION_VAR
        self.skip_variations = {DEFAULT_NOMINAL}
        self.nominal = DEFAULT_NOMINAL
        
[docs]    def get_total(self, df, *args, **kwds):
        nom_idx = (df[self.variation_var] == self.nominal)
        return super().get_total(df[nom_idx], *args, **kwds)
    
[docs]    def get_hist(self, df, *args, **kwds):
        nom_idx = (df[self.variation_var] == self.nominal)
        return super().get_hist(df[nom_idx], *args, **kwds)
    
[docs]    def get_stat_uncertainty(self, df, *args, **kwds):
        nom_idx = (df[self.variation_var] == self.nominal)
        return super().get_total_uncertainty(df[nom_idx], *args, **kwds)
    
[docs]    def get_syst_uncertainty(self, df, bins, variable, weight,
                             include_outside=False):
        nominal_hist = self.get_total(df, bins, variable, weight,
                                      include_outside=include_outside)
        
        envelop = []
        for variation in df[self.variation_var].unique():
            if variation in self.skip_variations:
                continue
                
            variation_hist = 0
            
            # Can't use super().get_total because it falls back to this get_hist()
            for i in range(len(self.processes)):
                var_idx = (df[self.variation_var] == variation)
                variation_hist += super().get_hist(df[var_idx],
                                                   i, bins, variable, weight,
                                                   include_outside)
            variation_hist -= nominal_hist
            envelop.append(variation_hist)
            # envelop[variation] = variation_hist
            
        envelop = da.asarray(envelop)
        return da.sqrt((envelop**2).sum(axis=0))

[docs]    def get_total_uncertainty(self, *args, **kwds):
        return da.sqrt(
            self.get_stat_uncertainty(*args, **kwds)**2 +
            self.get_syst_uncertainty(*args, **kwds)**2
        )
    

[docs]class TruthStack(Stack):
    
[docs]    def get_total_uncertainty(self, df, bins, *args, **kwds):
        return bins[:-1] * 0

[docs]class McStack(Stack):
    """
    Short-hand class for a Stack with only Monte-Carlo-like processes.
    """

[docs]    def __init__(self, *args, **kwds):
        if 'data_uncertainty' in kwds:
            raise TypeError("Cannot pass 'data_uncertainty' to McStack. "
                            "Use generic Stack instead.")

        kwds['data_uncertainty'] = False
        kwds.setdefault("histtype", "stepfilled")
        super().__init__(*args, **kwds)

[docs]    def add_process(self, *args, **kwds):
        if 'data_uncertainty' in kwds:
            raise TypeError("Cannot pass 'data_uncertainty' to McStack. "
                            "Use generic Stack instead.")

        super().add_process(*args, **kwds)

[docs]class DataStack(Stack):
    """
    Short-hand class for a Stack with only data-like processes.
    """

[docs]    def __init__(self, *args, **kwds):
        if 'data_uncertainty' in kwds:
            raise TypeError("Cannot pass 'data_uncertainty' to DataStack. "
                            "Use generic Stack instead.")

        kwds.setdefault("color", "black")
        kwds.setdefault("histtype", "points")
        kwds['data_uncertainty'] = True
        super().__init__(*args, **kwds)

[docs]    def add_process(self, *args, **kwds):
        if 'data_uncertainty' in kwds:
            raise TypeError("Cannot pass 'data_uncertainty' to DataStack. "
                            "Use generic Stack instead.")

        super().add_process(*args, **kwds)