Commit 9ae872e7 authored by Benjamin Schwenker's avatar Benjamin Schwenker
Browse files

Merge branch '31-include-and-test-new-sextuple-magnet-pvs' into 'master'

added function to remove noisy optics PVs

Closes #31

See merge request !40
parents 8003d6b1 cd8e0457
......@@ -74,7 +74,7 @@ def get_tensors_no_scaling(
return (x, y)
def cleanup_mask(df, obs, current_threshold):
def cleanup_mask(df, obs, current_threshold, holdout_times=[]):
"""
Returns a binary mask for rows in DataFrame df usable for training of the
lumi model.
......@@ -114,6 +114,9 @@ def cleanup_mask(df, obs, current_threshold):
# Remove data where lumi too low
mask = mask & (df["ECL_LUM_MON_ACC"] > 100)
# Remove data from holdout time intervals
mask = mask & bgnet.mask_masked_times(df, holdout_times)
return mask
......@@ -229,6 +232,7 @@ def load_data_from_path(
data,
obs=config['obs'],
current_threshold=config['preparation']['current_threshold'],
holdout_times=holdout_times,
)
data = data[mask]
......@@ -246,7 +250,6 @@ def load_data_from_path(
obs = config['obs']
pvs = config['preparation']['lumi_pvs']
# Build bgnet tensors from data
tensors_from_data = get_tensors_no_scaling(
data,
......@@ -294,6 +297,26 @@ def load_data(
holdout_times=holdout_times)
def remove_bad_features(x, index, config, optic_pvs, min_const_length=48, noise_fraction=0.2):
    """
    Remove constant and noisy optics features from the input array x.

    Builds a DataFrame from x using the PV names in
    config['preparation']['lumi_pvs'], drops columns that are constant over
    the whole range, and additionally drops optics PVs that are stuck at a
    constant value for long stretches (more than noise_fraction of the rows
    fall inside constant runs of at least min_const_length samples, as
    flagged by bgnet.mask_long_const). The surviving PV names are written
    back into config['preparation']['lumi_pvs'] (config is mutated in place).

    :param x: 2D array of feature values, one column per PV
    :param index: row index (timestamps) for x
    :param config: training config; config['preparation']['lumi_pvs'] lists
        the column names of x and is updated in place
    :param optic_pvs: PV names that are optics PVs and should be checked for
        long constant stretches
    :param min_const_length: minimum run length (samples) counted as a
        "long" constant stretch (default 48, as before)
    :param noise_fraction: maximum tolerated fraction of rows inside long
        constant stretches before an optics PV is dropped (default 0.2)
    :return: numpy array with the remaining feature columns
    """
    df = pd.DataFrame(data=x, index=index, columns=config['preparation']['lumi_pvs'])
    # Drop columns that never change; compute nunique only once.
    nunique = df.nunique()
    const_pvs = set(df.columns[nunique <= 1])
    df = df.loc[:, nunique > 1]
    config['preparation']['lumi_pvs'] = [
        pv for pv in config['preparation']['lumi_pvs'] if pv not in const_pvs
    ]
    # Drop optics PVs that are stuck at one value for too much of the range.
    noisy_optics_pvs = []
    for pv in config['preparation']['lumi_pvs']:
        if pv not in optic_pvs:
            continue
        long_const_mask = bgnet.mask_long_const(
            df,
            column=pv,
            min_length=min_const_length,
            extend_left=0,
            extend_right=0)
        if np.count_nonzero(long_const_mask) / len(long_const_mask) > noise_fraction:
            noisy_optics_pvs.append(pv)
    config['preparation']['lumi_pvs'] = [
        pv for pv in config['preparation']['lumi_pvs'] if pv not in noisy_optics_pvs
    ]
    df = df[config['preparation']['lumi_pvs']]
    return df.to_numpy()
def train_scalers_on_preprocessed_data(x, y, q1=10.0, q2=90.0):
"""
Returns of fitted scalers from tuple of inputs x and target y.
......
%% Cell type:code id: tags:
``` python
# BGNet (Background prediction with neural nets for Belle II at SuperKEKB)
# Author: The BGNet developers
#
# See git log for contributors and copyright holders.
# This file is licensed under MIT licence, see LICENSE.md.
# Luminosity: Jupyter notebook for Belle II luminosity studies
#
# This notebooks trains a model that predicts the luminosity and computes feature attributions
# that give insight into the reasons for changes of the beam life
#
# This notebook is part of bg_net project: https://gitlab.desy.de/benjamin.schwenker/bg_net.git
# Access to data can be provided by author on reasonable request.
#
# This notebook uses the path_explain project (https://github.com/suinleelab/path_explain) to compute feature
# attributions and plot them.
```
%% Cell type:markdown id: tags:
# Imports
%% Cell type:code id: tags:
``` python
import yaml
import os
from path_explain import PathExplainerTF, summary_plot
from sklearn.model_selection import train_test_split
import bg_net.lumi as lumi
import bg_net.bgnet as bgnet
import bg_net.run_training as run_training
import bg_net.report_utils
from bg_net.archiver.indexer_utils import str_to_jst_datetime
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import shutil
from pathlib import Path
from bg_net.archiver.skb_mr_optic_pvs import SKB2_MR_OPTICS_HER_PVS as SKB2_MR_OPTICS_HER_PVS
from bg_net.archiver.skb_mr_optic_pvs import SKB2_MR_OPTICS_LER_PVS as SKB2_MR_OPTICS_LER_PVS
```
%% Cell type:markdown id: tags:
# Loading the Data
%% Cell type:code id: tags:
``` python
# Set a single observable for optimization
obs = lumi.OBS
obs_unit = lumi.OBS_UNIT
# Set data path
data_path = os.path.expandvars("${B2_ARCHIVER}")
# Time interval to look at (the merged version uses March 2022; the stale
# April range left over from the diff has been removed)
start = str_to_jst_datetime("{:04d}-{:02d}-{:02d} 00:00:00".format(2022, 3, 1))
end = str_to_jst_datetime("{:04d}-{:02d}-{:02d} 23:59:59".format(2022, 3, 31))
# Get a default config file
with open('configs/training_lumi.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
# Make format conversions (str-> timestamps)
config = lumi.convert_config(config)
# Setup config for training; build it once, with the time interval passed in
config = run_training.create_train_config(config, obs=obs, data_path=data_path, times=[start, end], local_dir='./lumi_results')
# Add optic PVs to config (archiver PV names use '_' instead of ':')
optic_pvs = [PV.replace(':', '_') for PV in SKB2_MR_OPTICS_HER_PVS] + [PV.replace(':', '_') for PV in SKB2_MR_OPTICS_LER_PVS]
config['preparation']['lumi_pvs'] = config['preparation']['lumi_pvs'] + optic_pvs
```
%% Cell type:code id: tags:
``` python
# Load the data
x, y, index = lumi.load_data(config, start_time=start, end_time=end)
```
%% Cell type:code id: tags:
``` python
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
```
%% Cell type:code id: tags:
``` python
scaler_x, scaler_y = lumi.train_scalers_on_preprocessed_data(x_train, y_train)
```
%% Cell type:code id: tags:
``` python
config['scalers']=(scaler_x, scaler_y)
```
%% Cell type:code id: tags:
``` python
# Apply the scalers (fitted on the training split) to train, test and full data.
x_train_scaled = scaler_x.transform(x_train)
y_train_scaled = scaler_y.transform(y_train)
x_test_scaled = scaler_x.transform(x_test)
y_test_scaled = scaler_y.transform(y_test)
x_scaled = scaler_x.transform(x)
y_scaled = scaler_y.transform(y)
```
%% Cell type:markdown id: tags:
# Create the Model
%% Cell type:code id: tags:
``` python
model = lumi.get_model(config)
```
%% Cell type:markdown id: tags:
# Training the Model
%% Cell type:code id: tags:
``` python
# Build the default training callbacks for this config via bgnet.
callbacks = bgnet.setup_default_callbacks(config)
# Run training procedure on the scaled training split
model = lumi.run_training(
model,
config=config,
callbacks=callbacks,
x=x_train_scaled,
y=y_train_scaled,
verbose=2
)
```
%% Cell type:markdown id: tags:
# Validate the training
%% Cell type:code id: tags:
``` python
train_loss, train_mse, train_mae = model.evaluate(x_train_scaled, y_train_scaled, batch_size=64, verbose=0)
test_loss, test_mse, test_mae = model.evaluate(x_test_scaled, y_test_scaled, batch_size=64, verbose=0)
```
%% Cell type:code id: tags:
``` python
print('Train MSE: {:.4f}\tTrain MAE: {:.4f}'.format(train_mse, train_mae))
print('Test MSE: {:.4f}\tTest MAE: {:.4f}'.format(test_mse, test_mae))
```
%% Cell type:code id: tags:
``` python
y_test_pred_scaled = model.predict(x_test_scaled)
y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled)
```
%% Cell type:code id: tags:
``` python
import seaborn as sns
ax = sns.regplot(x=y_test_pred, y=y_test)
ax.set_xlabel("Predicted {} on test data ".format(obs), fontsize = 10)
ax.set_ylabel("Measured {} on test data ".format(obs), fontsize = 10)
```
%% Cell type:markdown id: tags:
# Predict and plot on all data
%% Cell type:code id: tags:
``` python
y_pred_scaled = model.predict(x_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
```
%% Cell type:code id: tags:
``` python
fig = go.Figure()
fig.add_trace(go.Scatter(x=index, y=y[:, 0], name='Observed', mode='markers'))
fig.add_trace(go.Scatter(x=index, y=y_pred[:, 0], name='Predicted', mode='markers'))
fig.show()
```
%% Cell type:markdown id: tags:
# Interpreting the Model
%% Cell type:markdown id: tags:
Path explain API requires a test and a reference dataset. It then calculates an attribution value for each
input at each point in the test dataset. These attribution values reflect how important that input is for the
difference between that point and the reference dataset.
%% Cell type:code id: tags:
``` python
# Define reference and test times for path explain. Each dataset requires at least one timeframe, more are possible.
reference_data_times = (
['2022-04-04 20:10:00+0900', '2022-04-04 21:10:00+0900'],
['2022-03-06 22:00:00+0900', '2022-03-07 22:00:00+0900'],
)
test_data_times = (
['2022-04-04 21:12:00+0900', '2022-04-04 22:18:00+0900'],
['2022-03-07 23:00:00+0900', '2022-03-08 23:00:00+0900'],
)
```
%% Cell type:code id: tags:
``` python
reference_dataset, test_dataset = lumi.get_pathexplain_datasets(
x,
index,
reference_data_times,
test_data_times
)
reference_dataset_scaled = scaler_x.transform(reference_dataset)
test_dataset_scaled = scaler_x.transform(test_dataset)
```
%% Cell type:code id: tags:
``` python
feature_names = lumi.get_feature_names(config, obs=obs)
```
%% Cell type:markdown id: tags:
The next step is to define our explainer object.
%% Cell type:code id: tags:
``` python
explainer = PathExplainerTF(model)
```
%% Cell type:markdown id: tags:
Now we actually have to run the attribution function. Here we used expected gradients because the argument use_expectation is equal to true and the baseline is set to be the reference data. However, we could set use_expectation and the baseline to be the zeros vector to use integrated gradients instead.
%% Cell type:code id: tags:
``` python
# Attribute the model output for each test point against the reference
# dataset as baseline. use_expectation=True selects expected gradients;
# output_indices=0 attributes the first (here: only) model output.
attributions = explainer.attributions(inputs=test_dataset_scaled.astype('float32'),
baseline=reference_dataset_scaled.astype('float32'),
batch_size=100,
num_samples=200,
use_expectation=True,
output_indices=0,
verbose=True)
# To have attributions in unit of y
attributions = scaler_y.inverse_transform(attributions)
```
%% Cell type:code id: tags:
``` python
plot_top_k = 20
if len(feature_names) < plot_top_k:
plot_top_k = len(feature_names)
summary_plot(attributions,
test_dataset,
feature_names=feature_names,
plot_top_k=plot_top_k,
figsize=[8,4]
)
#plt.savefig("./lumi_plots/random_attribution_no_emitt.pdf", bbox_inches='tight')
```
%% Cell type:markdown id: tags:
# Reporting
%% Cell type:code id: tags:
``` python
# Recreate the plot output directory from scratch, then render the default
# report plots. The scraped diff had left both `look_back=2,` and
# `look_back=7,` in this call — a repeated keyword argument is a
# SyntaxError in Python; only the merged value (7) is kept.
shutil.rmtree("./lumi_plots", ignore_errors=True)
Path("./lumi_plots").mkdir(parents=True, exist_ok=True)
bg_net.report_utils.make_default_plots(
    obs=config['obs'],
    index=index,
    x=x,
    y=y,
    attributions=attributions,
    feature_names=config['preparation']['lumi_pvs'],
    reference_data_times=reference_data_times,
    test_data_times=test_data_times,
    save_path="./lumi_plots",
    y_pred=y_pred,
    test_dataset=test_dataset,
    obs_unit=obs_unit,
    show=True,
    look_back=7,
    freq=config['preparation']['freq'],
)
```
%% Cell type:code id: tags:
``` python
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment