from pathlib import Path
import pandas as pd
import numpy as np
import tensorflow.keras as keras
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import gm_classifier as gm
# Root of the gm_classifier checkout on this machine.
# (An earlier local checkout lived at /Users/Clus/code/work/gm_classifier/.)
base_dir = Path("/home/cbs51//code/gm_classifier/")

# --- Active shallow records: features and labels, both indexed by record_id ---
shallow_feature_ffp = base_dir / "data/records/ObservedGroundMotions_GMC_features.csv"
shallow_label_ffp = base_dir / "data/records/ObservedGroundMotions_GMC_labels.csv"
shallow_feature_df, shallow_label_df = (
    pd.read_csv(ffp, index_col="record_id")
    for ffp in (shallow_feature_ffp, shallow_label_ffp)
)
print(
    "Active shallow feature & label DF shapes",
    shallow_feature_df.shape,
    shallow_label_df.shape,
    sep="\n",
)

# --- Subduction records: same layout, separate files ---
sub_feature_ffp = base_dir / "data/records/sub_features.csv"
sub_label_ffp = base_dir / "data/records/sub_labels.csv"
sub_feature_df, sub_label_df = (
    pd.read_csv(ffp, index_col="record_id")
    for ffp in (sub_feature_ffp, sub_label_ffp)
)
# Record ids of the subduction feature table, as strings.
sub_record_ids = sub_feature_df.index.values.astype(str)
print(
    "\nSubduction feature & label DF shapes",
    sub_feature_df.shape,
    sub_label_df.shape,
    sep="\n",
)

# --- Combined tables: shallow rows followed by subduction rows ---
feature_df = pd.concat([shallow_feature_df, sub_feature_df])
# The "source_id" and "event" columns are removed from the combined labels.
label_df = pd.concat([shallow_label_df, sub_label_df]).drop(
    columns=["source_id", "event"]
)
print("\nCombined shapes", feature_df.shape, label_df.shape, sep="\n")
# Train on active shallow records only and see how the model performs on subduction records.
# Settings dictionary passed to the training run below.
config = {
    # Network definition.
    "model": {
        "units": [15, 15],  # presumably one entry per hidden layer -- TODO confirm
        "act_funcs": "relu",
        "output_act_func": "sigmoid",
    },
    # Optimisation settings.
    "training": {
        "optimizer": "Adam",
        "loss": "binary_crossentropy",
        "n_epochs": 60,
        "batch_size": 32,
        "dropout": 0.5,
    },
    # Feature preprocessing options; the "deskew" identifier is the same
    # string later passed to gm.pre.apply for the subduction features.
    "preprocessing": {
        "deskew": "canterbury_wellington",
        "standardise": True,
        "whiten": True,
    },
}
# Directory handed to the training run; model.h5 and the mu/sigma/W arrays
# are read back from this directory further down.
output_dir = base_dir / "gm_classifier/my_stuff/sub/shallow_only"

# Fit using the active shallow features/labels only; the subduction tables
# are not passed in here.
train_df, history, train_data, val_data = gm.training.run_trainining(
    output_dir,
    shallow_feature_df,
    shallow_label_df,
    config,
    val_split=0.1,
    score_th=(0.51, 0.51),
    record_weight_fn=gm.training.score_weighting,
    verbose=0,
)

# Loss curves, then the best (lowest) training and validation loss values.
gm.validation.plot_loss(history, figsize=(16, 12))
min_loss = np.min(history["loss"])
min_val_loss = np.min(history["val_loss"])
print(f"Min-loss: {min_loss:.5f}, Min-val-loss: {min_val_loss:.5f}")
# --- Evaluate the shallow-only model on its own data and on subduction data ---

# Load the saved model from the training output directory
# (presumably written there by the training run -- confirm against gm.training).
model = keras.models.load_model(output_dir / "model.h5")

# F1 score on the training and validation splits returned by the training run.
# Index 0 of each split is fed to the model, index 1 is used as ground truth.
y_train_est = gm.validation.label_from_prob(model.predict(train_data[0]).reshape(-1))
y_val_est = gm.validation.label_from_prob(model.predict(val_data[0]).reshape(-1))
train_f1 = metrics.f1_score(train_data[1], y_train_est)
val_f1 = metrics.f1_score(val_data[1], y_val_est)
print(f"Training F1: {train_f1},\n"
      f"Validation F1: {val_f1}")
print()

# Only records present in BOTH the subduction feature and label tables are used.
sub_ids = np.intersect1d(
    sub_feature_df.index.values.astype(str),
    sub_label_df.index.values.astype(str),
)

# Apply preprocessing to the subduction features using the mu/sigma/W arrays
# stored in output_dir. The deskew identifier is the same string as
# config["preprocessing"]["deskew"].
X_sub = gm.pre.apply(
    sub_feature_df.loc[sub_ids, gm.features.FEATURE_NAMES].values,
    "canterbury_wellington",
    np.load(output_dir / "mu.npy"),
    np.load(output_dir / "sigma.npy"),
    np.load(output_dir / "W.npy"),
)

# Same 0.51 thresholds as the score_th=(0.51, 0.51) used for training.
y_sub, mask = gm.pre.get_label_from_score(
    sub_label_df.loc[sub_ids].score.values, 0.51, 0.51
)
# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# which would let the evaluation continue without this check.
if not np.all(mask):
    n_false = np.size(mask) - np.count_nonzero(mask)
    raise ValueError(
        "Expected every subduction record to pass the label mask returned by "
        f"get_label_from_score, but {n_false} of {np.size(mask)} entries are False"
    )

# Model probabilities and derived labels for the subduction records.
sub_est = model.predict(X_sub).reshape(-1)
y_sub_est = gm.validation.label_from_prob(sub_est)
print(f"Subduction F1: {metrics.f1_score(y_sub, y_sub_est)}")

gm.validation.plot_confusion_matrix(y_sub, y_sub_est, title="Sub only - CM")
plt.show()
# Two histograms of the subduction model probabilities: first for all records,
# then only for records whose estimated label differs from the true label.
for probs, hist_title in (
    (sub_est, "Histogram of model probabilities"),
    (
        sub_est[y_sub_est != y_sub],
        "Histogram of model probabilities for misclassified records",
    ),
):
    plt.figure(figsize=(12, 6))
    plt.hist(probs, bins=20)
    plt.title(hist_title)
    plt.show()