import os
import tempfile
import unittest
import math
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from nnfwtbn.model import CrossValidator, ClassicalCV, MixedCV, \
Normalizer, EstimatorNormalizer, \
normalize_category_weights, BinaryCV, HepNet, \
NoTestCV
from nnfwtbn.variable import Variable
from nnfwtbn.cut import Cut
import nnfwtbn.toydata as toydata
class StubCrossValidator(CrossValidator):
    """
    Minimal subclass of CrossValidator used by the tests in this module.

    All four selection methods raise NotImplementedError. The stub is only
    instantiated to exercise the constructor and the equality operator.
    """

    def select_slice(self, df, slice_i):
        """Not provided by the stub; always raises NotImplementedError."""
        raise NotImplementedError()

    def select_training(self, df, fold_i):
        """Not provided by the stub; always raises NotImplementedError."""
        raise NotImplementedError()

    def select_validation(self, df, fold_i):
        """Not provided by the stub; always raises NotImplementedError."""
        raise NotImplementedError()

    def select_test(self, df, fold_i):
        """Not provided by the stub; always raises NotImplementedError."""
        raise NotImplementedError()
[docs]class CrossValidatorTestCase(unittest.TestCase):
"""
Test the non-abstract parts of CrossValidator class.
"""
[docs] def test_init_store(self):
"""
Check that the constructor stores all variables passed to it.
"""
stub_cv = StubCrossValidator(5, mod_var="event_number")
self.assertEqual(stub_cv.k, 5)
self.assertIsInstance(stub_cv.variable, Variable)
self.assertTrue(stub_cv.mod_mode)
stub_cv = StubCrossValidator(5, frac_var="mva_random_number")
self.assertEqual(stub_cv.k, 5)
self.assertIsInstance(stub_cv.variable, Variable)
self.assertFalse(stub_cv.mod_mode)
[docs] def test_init_no_variable(self):
"""
Check that an error is raised if no variable object is passed to the
constructor.
"""
self.assertRaises(TypeError, StubCrossValidator)
[docs] def test_init_both_methods(self):
"""
Check that an error is raised if both selection methods are used
constructor.
"""
self.assertRaises(TypeError, StubCrossValidator,
mod_var="event_number", frac_var="mva_random_number")
[docs] def test_equal_same_values(self):
"""
Check the equal operator for cross validators which are created with the same values.
"""
cv1 = StubCrossValidator(4, mod_var="number")
cv2 = StubCrossValidator(4, mod_var="number")
self.assertTrue(cv1 == cv2)
self.assertTrue(cv2 == cv1)
[docs] def test_equal_different_k(self):
"""
Check the equal operator for cross validators with different values for k.
"""
cv1 = StubCrossValidator(4, mod_var="number")
cv2 = StubCrossValidator(5, mod_var="number")
self.assertFalse(cv1 == cv2)
self.assertFalse(cv2 == cv1)
[docs] def test_equal_different_variables(self):
"""
Check the equal operator for cross validators with different variable names.
"""
cv1 = StubCrossValidator(4, mod_var="number1")
cv2 = StubCrossValidator(4, mod_var="number2")
self.assertFalse(cv1 == cv2)
self.assertFalse(cv2 == cv1)
[docs] def test_equal_different_mode(self):
"""
Check the equal operator for cross validators with mod modes.
"""
cv1 = StubCrossValidator(4, mod_var="number")
cv2 = StubCrossValidator(4, frac_var="number")
self.assertFalse(cv1 == cv2)
self.assertFalse(cv2 == cv1)
[docs] def test_equal_different_class(self):
"""
Check the equal operator for cross validators with different types.
"""
cv1 = ClassicalCV(4, mod_var="number")
cv2 = MixedCV(4, mod_var="number")
self.assertFalse(cv1 == cv2)
self.assertFalse(cv2 == cv1)
[docs]class BinaryCVTestCase(unittest.TestCase):
"""
Test the implementation of binary cross validation.
"""
[docs] def generate_df(self):
"""
Generates a toy dataframe.
"""
return pd.DataFrame({
"slice": [0, 0, 0, 0, 1, 1, 1, 1, 0],
"rand": [0.1, 0.125, 0.374, 0.375,
0.550, 32.625, 0.750, 0.999, 0.400],
"number": [0, 0, 0, 0, 1, 1, 1, 1, 0],
})
[docs] def test_slice_mod(self):
"""
Check that all events are sorted into the correct slice.
"""
binary_cv = BinaryCV(mod_var="number")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_slice(df, 0)),
[True, True, True, True, False, False, False, False, True])
self.assertEqual(list(binary_cv.select_slice(df, 1)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_slice_frac(self):
"""
Check that all events are sorted into the correct slice.
"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_slice(df, 0)),
[True, True, True, True, False, False, False, False, True])
self.assertEqual(list(binary_cv.select_slice(df, 1)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_training_mod(self):
"""
Check that only the training events of each slice are returned.
"""
binary_cv = BinaryCV(mod_var="number")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_training(df, 0)),
[True, True, True, True, False, False, False, False, True])
self.assertEqual(list(binary_cv.select_training(df, 1)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_training_frac(self):
"""
Check that only the training events of each slice are returned.
"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_training(df, 0)),
[True, True, True, True, False, False, False, False, True])
self.assertEqual(list(binary_cv.select_training(df, 1)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_validation_mod(self):
"""
Check that only the validation events of each slice are returned.
"""
binary_cv = BinaryCV(mod_var="number")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_validation(df, 0)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(binary_cv.select_validation(df, 1)),
[True, True, True, True, False, False, False, False, True])
[docs] def test_validation_frac(self):
"""
Check that only the validation events of each slice are returned.
"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_validation(df, 0)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(binary_cv.select_validation(df, 1)),
[True, True, True, True, False, False, False, False, True])
[docs] def test_test_mod(self):
"""
Check that only the test events of each slice are returned.
"""
binary_cv = BinaryCV(mod_var="number")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_test(df, 0)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(binary_cv.select_test(df, 1)),
[True, True, True, True, False, False, False, False, True])
[docs] def test_test_frac(self):
"""
Check that only the test events of each slice are returned.
"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_test(df, 0)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(binary_cv.select_test(df, 1)),
[True, True, True, True, False, False, False, False, True])
[docs] def test_saving_and_loading(self):
"""
Test that saving and loading a cross validator doesn't change its configuration.
"""
cv1 = BinaryCV(frac_var="rand")
fd, path = tempfile.mkstemp()
try:
cv1.save_to_h5(path, "cv")
cv2 = CrossValidator.load_from_h5(path, "cv")
finally:
# close file descriptor and delete file
os.close(fd)
os.remove(path)
self.assertTrue(cv1 == cv2)
[docs] def test_select_cv_set_training(self):
"""
Check that only the training events of each slice are returned.
"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_cv_set(df, "train", 0)),
[True, True, True, True, False, False, False, False, True])
self.assertEqual(list(binary_cv.select_cv_set(df, "train", 1)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_select_cv_set_validation(self):
"""
Check that only the validation events of each slice are returned.
"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_cv_set(df, "val", 0)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(binary_cv.select_cv_set(df, "val", 1)),
[True, True, True, True, False, False, False, False, True])
[docs] def test_select_cv_set_test(self):
"""
Check that only the test events of each slice are returned.
"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.select_cv_set(df, "test", 0)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(binary_cv.select_cv_set(df, "test", 1)),
[True, True, True, True, False, False, False, False, True])
[docs] def test_fold_info_training(self):
"""Check that fold info indices are are correct"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.retrieve_fold_info(df, "train")),
[0, 0, 0, 0, 1, 1, 1, 1, 0])
[docs] def test_fold_info_validation(self):
"""Check that fold info indices are are correct"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.retrieve_fold_info(df, "val")),
[1, 1, 1, 1, 0, 0, 0, 0, 1])
[docs] def test_fold_info_test(self):
"""Check that fold info indices are are correct"""
binary_cv = BinaryCV(frac_var="rand")
df = self.generate_df()
self.assertEqual(list(binary_cv.retrieve_fold_info(df, "test")),
[1, 1, 1, 1, 0, 0, 0, 0, 1])
[docs]class ClassicalCVTestCase(unittest.TestCase):
"""
Test the implementation of classical cross validation with k=4.
"""
[docs] def generate_df(self):
"""
Generates a toy dataframe.
"""
return pd.DataFrame({
"slice": [0, 1, 2, 3, 4, 5, 6, 7, 3],
"rand": [0.1, 0.125, 0.374, 0.375,
0.550, 32.625, 0.750, 0.999, 0.400],
"number": [8, 9, 2, 27, 20, 5, 30, 31, 11],
})
[docs] def test_slice_mod(self):
"""
Check that all events are sorted into the correct slice.
"""
classical_cv = ClassicalCV(4, mod_var="number")
df = self.generate_df()
self.assertEqual(list(classical_cv.select_slice(df, 0)),
[True, False, False, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 1)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 2)),
[False, False, True, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 3)),
[False, False, False, True, False, False, False, False, True])
self.assertEqual(list(classical_cv.select_slice(df, 4)),
[False, False, False, False, True, False, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 5)),
[False, False, False, False, False, True, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 6)),
[False, False, False, False, False, False, True, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 7)),
[False, False, False, False, False, False, False, True, False])
[docs] def test_slice_frac(self):
"""
Check that all events are sorted into the correct slice.
"""
classical_cv = ClassicalCV(4, frac_var="rand")
df = self.generate_df()
self.assertEqual(list(classical_cv.select_slice(df, 0)),
[True, False, False, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 1)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 2)),
[False, False, True, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 3)),
[False, False, False, True, False, False, False, False, True])
self.assertEqual(list(classical_cv.select_slice(df, 4)),
[False, False, False, False, True, False, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 5)),
[False, False, False, False, False, True, False, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 6)),
[False, False, False, False, False, False, True, False, False])
self.assertEqual(list(classical_cv.select_slice(df, 7)),
[False, False, False, False, False, False, False, True, False])
[docs] def test_training_mod(self):
"""
Check that only the training events of each slice are returned.
"""
classical_cv = ClassicalCV(4, mod_var="number")
df = self.generate_df()
self.assertEqual(list(classical_cv.select_training(df, 0)),
[True, True, True, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_training(df, 1)),
[True, True, False, True, False, False, False, False, True])
self.assertEqual(list(classical_cv.select_training(df, 2)),
[True, False, True, True, False, False, False, False, True])
self.assertEqual(list(classical_cv.select_training(df, 3)),
[False, True, True, True, False, False, False, False, True])
[docs] def test_training_frac(self):
"""
Check that only the training events of each slice are returned.
"""
classical_cv = ClassicalCV(4, frac_var="rand")
df = self.generate_df()
self.assertEqual(list(classical_cv.select_training(df, 0)),
[True, True, True, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_training(df, 1)),
[True, True, False, True, False, False, False, False, True])
self.assertEqual(list(classical_cv.select_training(df, 2)),
[True, False, True, True, False, False, False, False, True])
self.assertEqual(list(classical_cv.select_training(df, 3)),
[False, True, True, True, False, False, False, False, True])
[docs] def test_validation_mod(self):
"""
Check that only the validation events of each slice are returned.
"""
classical_cv = ClassicalCV(4, mod_var="number")
df = self.generate_df()
self.assertEqual(list(classical_cv.select_validation(df, 0)),
[False, False, False, True, False, False, False, False, True])
self.assertEqual(list(classical_cv.select_validation(df, 1)),
[False, False, True, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_validation(df, 2)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_validation(df, 3)),
[True, False, False, False, False, False, False, False, False])
[docs] def test_validation_frac(self):
"""
Check that only the validation events of each slice are returned.
"""
classical_cv = ClassicalCV(4, frac_var="rand")
df = self.generate_df()
self.assertEqual(list(classical_cv.select_validation(df, 0)),
[False, False, False, True, False, False, False, False, True])
self.assertEqual(list(classical_cv.select_validation(df, 1)),
[False, False, True, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_validation(df, 2)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(classical_cv.select_validation(df, 3)),
[True, False, False, False, False, False, False, False, False])
[docs] def test_test_mod(self):
"""
Check that only the test events of each slice are returned.
"""
classical_cv = ClassicalCV(4, mod_var="number")
df = self.generate_df()
self.assertEqual(list(classical_cv.select_test(df, 0)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(classical_cv.select_test(df, 1)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(classical_cv.select_test(df, 2)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(classical_cv.select_test(df, 3)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_test_frac(self):
"""
Check that only the test events of each slice are returned.
"""
classical_cv = ClassicalCV(4, frac_var="rand")
df = self.generate_df()
self.assertEqual(list(classical_cv.select_test(df, 0)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(classical_cv.select_test(df, 1)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(classical_cv.select_test(df, 2)),
[False, False, False, False, True, True, True, True, False])
self.assertEqual(list(classical_cv.select_test(df, 3)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_saving_and_loading(self):
"""
Test that saving and loading a cross validator doesn't change its configuration.
"""
cv1 = ClassicalCV(4, frac_var="rand")
fd, path = tempfile.mkstemp()
try:
cv1.save_to_h5(path, "cv")
cv2 = CrossValidator.load_from_h5(path, "cv")
finally:
# close file descriptor and delete file
os.close(fd)
os.remove(path)
self.assertTrue(cv1 == cv2)
[docs]class MixedCVTestCase(unittest.TestCase):
"""
Test the implementation of mixed cross validation with k=8.
"""
[docs] def generate_df(self):
"""
Generates a toy dataframe.
"""
return pd.DataFrame({
"slice": [0, 1, 2, 3, 4, 5, 6, 7, 2],
"rand": [0.1, 0.125, 0.374, 0.375,
0.550, 32.625, 0.750, 0.999, 0.350],
"number": [8, 9, 2, 27, 20, 5, 30, 31, 10],
})
[docs] def test_slice_mod(self):
"""
Check that all events are sorted into the correct slice.
"""
mixed_cv = MixedCV(8, mod_var="number")
df = self.generate_df()
self.assertEqual(list(mixed_cv.select_slice(df, 0)),
[True, False, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 1)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 2)),
[False, False, True, False, False, False, False, False, True])
self.assertEqual(list(mixed_cv.select_slice(df, 3)),
[False, False, False, True, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 4)),
[False, False, False, False, True, False, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 5)),
[False, False, False, False, False, True, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 6)),
[False, False, False, False, False, False, True, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 7)),
[False, False, False, False, False, False, False, True, False])
[docs] def test_slice_frac(self):
"""
Check that all events are sorted into the correct slice.
"""
mixed_cv = MixedCV(8, frac_var="rand")
df = self.generate_df()
self.assertEqual(list(mixed_cv.select_slice(df, 0)),
[True, False, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 1)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 2)),
[False, False, True, False, False, False, False, False, True])
self.assertEqual(list(mixed_cv.select_slice(df, 3)),
[False, False, False, True, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 4)),
[False, False, False, False, True, False, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 5)),
[False, False, False, False, False, True, False, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 6)),
[False, False, False, False, False, False, True, False, False])
self.assertEqual(list(mixed_cv.select_slice(df, 7)),
[False, False, False, False, False, False, False, True, False])
[docs] def test_training_mod(self):
"""
Check that only the training events of each slice are returned.
"""
mixed_cv = MixedCV(8, mod_var="number")
df = self.generate_df()
self.assertEqual(list(mixed_cv.select_training(df, 0)),
[True, True, True, True, True, True, False, False, True])
self.assertEqual(list(mixed_cv.select_training(df, 1)),
[True, True, True, True, True, False, False, True, True])
self.assertEqual(list(mixed_cv.select_training(df, 2)),
[True, True, True, True, False, False, True, True, True])
self.assertEqual(list(mixed_cv.select_training(df, 3)),
[True, True, True, False, False, True, True, True, True])
self.assertEqual(list(mixed_cv.select_training(df, 4)),
[True, True, False, False, True, True, True, True, False])
self.assertEqual(list(mixed_cv.select_training(df, 5)),
[True, False, False, True, True, True, True, True, False])
self.assertEqual(list(mixed_cv.select_training(df, 6)),
[False, False, True, True, True, True, True, True, True])
self.assertEqual(list(mixed_cv.select_training(df, 7)),
[False, True, True, True, True, True, True, False, True])
[docs] def test_training_frac(self):
"""
Check that only the training events of each slice are returned.
"""
mixed_cv = MixedCV(8, frac_var="rand")
df = self.generate_df()
self.assertEqual(list(mixed_cv.select_training(df, 0)),
[True, True, True, True, True, True, False, False, True])
self.assertEqual(list(mixed_cv.select_training(df, 1)),
[True, True, True, True, True, False, False, True, True])
self.assertEqual(list(mixed_cv.select_training(df, 2)),
[True, True, True, True, False, False, True, True, True])
self.assertEqual(list(mixed_cv.select_training(df, 3)),
[True, True, True, False, False, True, True, True, True])
self.assertEqual(list(mixed_cv.select_training(df, 4)),
[True, True, False, False, True, True, True, True, False])
self.assertEqual(list(mixed_cv.select_training(df, 5)),
[True, False, False, True, True, True, True, True, False])
self.assertEqual(list(mixed_cv.select_training(df, 6)),
[False, False, True, True, True, True, True, True, True])
self.assertEqual(list(mixed_cv.select_training(df, 7)),
[False, True, True, True, True, True, True, False, True])
[docs] def test_validation_mod(self):
"""
Check that only the validation events of each slice are returned.
"""
mixed_cv = MixedCV(8, mod_var="number")
df = self.generate_df()
self.assertEqual(list(mixed_cv.select_validation(df, 0)),
[False, False, False, False, False, False, False, True, False])
self.assertEqual(list(mixed_cv.select_validation(df, 1)),
[False, False, False, False, False, False, True, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 2)),
[False, False, False, False, False, True, False, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 3)),
[False, False, False, False, True, False, False, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 4)),
[False, False, False, True, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 5)),
[False, False, True, False, False, False, False, False, True])
self.assertEqual(list(mixed_cv.select_validation(df, 6)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 7)),
[True, False, False, False, False, False, False, False, False])
[docs] def test_validation_frac(self):
"""
Check that only the validation events of each slice are returned.
"""
mixed_cv = MixedCV(8, frac_var="rand")
df = self.generate_df()
self.assertEqual(list(mixed_cv.select_validation(df, 0)),
[False, False, False, False, False, False, False, True, False])
self.assertEqual(list(mixed_cv.select_validation(df, 1)),
[False, False, False, False, False, False, True, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 2)),
[False, False, False, False, False, True, False, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 3)),
[False, False, False, False, True, False, False, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 4)),
[False, False, False, True, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 5)),
[False, False, True, False, False, False, False, False, True])
self.assertEqual(list(mixed_cv.select_validation(df, 6)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_validation(df, 7)),
[True, False, False, False, False, False, False, False, False])
[docs] def test_test_mod(self):
"""
Check that only the test events of each slice are returned.
"""
mixed_cv = MixedCV(8, mod_var="number")
df = self.generate_df()
self.assertEqual(list(mixed_cv.select_test(df, 0)),
[False, False, False, False, False, False, True, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 1)),
[False, False, False, False, False, True, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 2)),
[False, False, False, False, True, False, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 3)),
[False, False, False, True, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 4)),
[False, False, True, False, False, False, False, False, True])
self.assertEqual(list(mixed_cv.select_test(df, 5)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 6)),
[True, False, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 7)),
[False, False, False, False, False, False, False, True, False])
[docs] def test_test_frac(self):
"""
Check that only the test events of each slice are returned.
"""
mixed_cv = MixedCV(8, frac_var="rand")
df = self.generate_df()
self.assertEqual(list(mixed_cv.select_test(df, 0)),
[False, False, False, False, False, False, True, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 1)),
[False, False, False, False, False, True, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 2)),
[False, False, False, False, True, False, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 3)),
[False, False, False, True, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 4)),
[False, False, True, False, False, False, False, False, True])
self.assertEqual(list(mixed_cv.select_test(df, 5)),
[False, True, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 6)),
[True, False, False, False, False, False, False, False, False])
self.assertEqual(list(mixed_cv.select_test(df, 7)),
[False, False, False, False, False, False, False, True, False])
[docs] def test_saving_and_loading(self):
"""
Test that saving and loading a cross validator doesn't change its configuration.
"""
cv1 = MixedCV(8, frac_var="rand")
fd, path = tempfile.mkstemp()
try:
cv1.save_to_h5(path, "cv")
cv2 = CrossValidator.load_from_h5(path, "cv")
finally:
# close file descriptor and delete file
os.close(fd)
os.remove(path)
self.assertTrue(cv1 == cv2)
[docs]class EstimatorNormalizerTestCase(unittest.TestCase):
"""
Test the implementation of EstimatorNormalizer.
"""
[docs] def generate_df(self):
"""
Generate toy dataframe.
"""
return pd.DataFrame({
"x": [9, 10, 10, 12, 12, 13],
"y": [0, 1, 1, 1, 1, 2],
"z": [0, 0, 0, 0, 0, 0], # Column with zero width
})
[docs] def generate_test_df(self):
"""
Generate toy dataframe used to test the normalization.
"""
return pd.DataFrame({
"x": [6, 11, 16],
"y": [-1, 1, 3],
"z": [-1, 0, 1],
})
[docs] def test_init(self):
"""
Check that the constructor computes the normalization moments of all
columns in the given dataframe if no input_list is given.
"""
df = self.generate_df()
norm = EstimatorNormalizer(df)
self.assertEqual(len(norm.center), 3)
self.assertAlmostEqual(norm.center["x"], 11)
self.assertAlmostEqual(norm.center["y"], 1)
self.assertAlmostEqual(norm.center["z"], 0)
self.assertEqual(len(norm.width), 3)
self.assertAlmostEqual(norm.width["x"], math.sqrt(12 / 5))
self.assertAlmostEqual(norm.width["y"], math.sqrt(2 / 5))
self.assertAlmostEqual(norm.width["z"], 1)
[docs] def test_call(self):
"""
Check that the normalization moments are applied to the given
dataframe.
"""
df = self.generate_df()
norm = EstimatorNormalizer(df)
normed = norm(self.generate_test_df())
self.assertEqual(list(normed.x),
[-5 / math.sqrt(12 / 5), 0, 5 / math.sqrt(12 / 5)])
self.assertEqual(list(normed.y),
[-2 / math.sqrt(2 / 5), 0, 2 / math.sqrt(2 / 5)])
self.assertEqual(list(normed.z),
[-1, 0, 1])
[docs] def test_call_other_vars(self):
"""
Check that columns in the dataframe are left untouched if moments are
missing.
"""
df = self.generate_df()
norm = EstimatorNormalizer(df, input_list=["x", "z"])
normed = norm(self.generate_test_df())
self.assertEqual(list(normed.x),
[-5 / math.sqrt(12 / 5), 0, 5 / math.sqrt(12 / 5)])
self.assertEqual(list(normed.y),
[-1, 1, 3])
self.assertEqual(list(normed.z),
[-1, 0, 1])
[docs] def test_equal_same_values(self):
df = self.generate_df()
norm1 = EstimatorNormalizer(df)
norm2 = EstimatorNormalizer(df)
self.assertTrue(norm1 == norm2)
[docs] def test_saving_and_loading(self):
"""
Test that saving and loading a estimator normalizer doesn't change its configuration.
"""
df = self.generate_df()
norm1 = EstimatorNormalizer(df)
fd, path = tempfile.mkstemp()
try:
norm1.save_to_h5(path, "norm")
norm2 = Normalizer.load_from_h5(path, "norm")
finally:
# close file descriptor and delete file
os.close(fd)
os.remove(path)
self.assertTrue(norm1 == norm2)
[docs]class CategoricalWeightNormalizerTestCase(unittest.TestCase):
"""
Test the implementation of normalize_category_weights.
"""
[docs] def generate_df(self):
"""
Generate toy dataframe.
"""
return pd.DataFrame({
"x": [9, 10, 10, 12, 12, 13],
"weight": [0.1, 0.2, 0.3, 1.4, 1.8, 1],
"alt_weight": [1.1, 1.2, 1.3, 2.4, 2.8, 2],
"fpid": [1, 2, 1, 2, 1, 3],
})
[docs] def test_alternative_weight(self):
"""
Check that the constructor normalized the classes using an alternative
weight variables.
"""
df = self.generate_df()
categories = [Cut(lambda d: d.fpid == 1),
Cut(lambda d: d.fpid == 2),
Cut(lambda d: d.fpid == 3)]
df = normalize_category_weights(df, categories,
weight='alt_weight')
c1, c2, c3 = categories
self.assertAlmostEqual(df.alt_weight.sum(), len(df))
self.assertAlmostEqual(c1(df).alt_weight.sum(), 2)
self.assertAlmostEqual(c2(df).alt_weight.sum(), 2)
self.assertAlmostEqual(c3(df).alt_weight.sum(), 2)
[docs] def test_main(self):
"""
Check that the constructor normalized the classes.
"""
df = self.generate_df()
categories = [Cut(lambda d: d.fpid == 1),
Cut(lambda d: d.fpid == 2),
Cut(lambda d: d.fpid == 3)]
df = normalize_category_weights(df, categories)
c1, c2, c3 = categories
self.assertAlmostEqual(df.weight.sum(), len(df))
self.assertAlmostEqual(c1(df).weight.sum(), 2)
self.assertAlmostEqual(c2(df).weight.sum(), 2)
self.assertAlmostEqual(c3(df).weight.sum(), 2)
self.assertAlmostEqual(df.weight[2] / df.weight[0], 3)
[docs]class HepNetTestCase(unittest.TestCase):
[docs] def test_saving_and_loading(self):
"""
Test that saving and loading a neural network doesn't change its configuration.
"""
input_var = ['m_jj', 'higgs_pt', 'jet_2_pt', 'jet_1_eta', 'jet_2_eta',
'tau_eta']
output_var = ['is_sig', 'is_ztt']
def model():
m = Sequential()
m.add(Dense(units=12, activation='relu',
input_dim=len(input_var)))
m.add(Dense(units=6, activation='relu'))
m.add(Dense(units=len(output_var), activation='softmax'))
m.compile(loss='categorical_crossentropy',
optimizer=SGD(lr=0.1),
metrics=['categorical_accuracy'])
return m
cv = ClassicalCV(k=3, frac_var="random")
net = HepNet(model, cv, EstimatorNormalizer, input_var, output_var)
df = toydata.get()
df["is_sig"] = (df.fpid == 1)
df["is_ztt"] = (df.fpid == 0)
net.fit(df.compute(), epochs=5, verbose=0,
weight=Variable("weight", "weight"))
fd, path = tempfile.mkstemp()
try:
net.save(path)
net_loaded = HepNet.load(path)
finally:
# close file descriptor and delete file
os.close(fd)
os.remove(path)
self.assertTrue("epoch" in net_loaded.history)
self.assertTrue("loss" in net_loaded.history)
self.assertTrue("val_loss" in net_loaded.history)
self.assertTrue("categorical_accuracy" in net_loaded.history)
self.assertTrue("val_categorical_accuracy" in net_loaded.history)
self.assertTrue(net == net_loaded)
[docs]class NoTestCVTestCase(unittest.TestCase):
"""
Test the implementation of no-test cross validation.
"""
[docs] def generate_df(self):
"""
Generates a toy dataframe.
"""
return pd.DataFrame({
"slice": [0, 0, 0, 0, 1, 1, 1, 1, 0],
"rand": [0.1, 0.125, 0.374, 0.375,
0.550, 32.625, 0.750, 0.999, 0.400],
"number": [0, 0, 0, 0, 1, 1, 1, 3, 0],
})
[docs] def test_slice_mod(self):
"""
Check that all events are sorted into the correct slice.
"""
binary_cv = NoTestCV(mod_var="number", k=5)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_slice(df, 0)),
[True, True, True, True, False, False, False, False, True])
self.assertEqual(list(binary_cv.select_slice(df, 1)),
[False, False, False, False, True, True, True, False, False])
self.assertEqual(list(binary_cv.select_slice(df, 3)),
[False, False, False, False, False, False, False, True, False])
[docs] def test_slice_frac(self):
"""
Check that all events are sorted into the correct slice.
"""
binary_cv = NoTestCV(frac_var="rand", k=5)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_slice(df, 0)),
[True, True, False, False, False, False, False, False, False])
self.assertEqual(list(binary_cv.select_slice(df, 1)),
[False, False, True, True, False, False, False, False, False])
self.assertEqual(list(binary_cv.select_slice(df, 4)),
[False, False, False, False, False, False, False, True, False])
[docs] def test_slice_mod_k2(self):
"""
Check that all events are sorted into the correct slice.
"""
binary_cv = NoTestCV(mod_var="number", k=2)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_slice(df, 0)),
[True, True, True, True, False, False, False, False, True])
self.assertEqual(list(binary_cv.select_slice(df, 1)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_slice_frac_k2(self):
"""
Check that all events are sorted into the correct slice.
"""
binary_cv = NoTestCV(frac_var="rand", k=2)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_slice(df, 0)),
[True, True, True, True, False, False, False, False, True])
self.assertEqual(list(binary_cv.select_slice(df, 1)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_training_mod(self):
"""
Check that only the training events of each slice are returned.
"""
binary_cv = NoTestCV(mod_var="number", k=5)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_training(df, 0)),
[False, False, False, False, True, True, True, True, False])
[docs] def test_training_frac(self):
"""
Check that only the training events of each slice are returned.
"""
binary_cv = NoTestCV(frac_var="rand", k=5)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_training(df, 0)),
[False, False, True, True, True, True, True, True, True])
[docs] def test_validation_mod(self):
"""
Check that only the validation events of each slice are returned.
"""
binary_cv = NoTestCV(mod_var="number", k=5)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_validation(df, 0)),
[True, True, True, True, False, False, False, False, True])
[docs] def test_validation_frac(self):
"""
Check that only the validation events of each slice are returned.
"""
binary_cv = NoTestCV(frac_var="rand", k=5)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_validation(df, 0)),
[True, True, False, False, False, False, False, False, False])
[docs] def test_test_mod(self):
"""
Check that only the test events of each slice are returned.
"""
binary_cv = NoTestCV(mod_var="number", k=5)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_test(df, 0)),
[False, False, False, False, False, False, False, False, False])
[docs] def test_test_frac(self):
"""
Check that only the test events of each slice are returned.
"""
binary_cv = NoTestCV(frac_var="rand", k=5)
df = self.generate_df()
self.assertEqual(list(binary_cv.select_test(df, 0)),
[False, False, False, False, False, False, False, False, False])
[docs] def test_saving_and_loading(self):
"""
Test that saving and loading a cross validator doesn't change its configuration.
"""
cv1 = NoTestCV(frac_var="rand")
fd, path = tempfile.mkstemp()
try:
cv1.save_to_h5(path, "cv")
cv2 = CrossValidator.load_from_h5(path, "cv")
finally:
# close file descriptor and delete file
os.close(fd)
os.remove(path)
self.assertTrue(cv1 == cv2)