In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import tensorflow.keras as keras
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

import gm_classifier as gm
In [2]:
# base_dir = Path("/Users/Clus/code/work/gm_classifier/")
base_dir = Path("/home/cbs51//code/gm_classifier/")

shallow_feature_ffp = base_dir / "data/records/ObservedGroundMotions_GMC_features.csv"
shallow_label_ffp = base_dir / "data/records/ObservedGroundMotions_GMC_labels.csv"

shallow_feature_df = pd.read_csv(shallow_feature_ffp, index_col="record_id")
shallow_label_df = pd.read_csv(shallow_label_ffp, index_col="record_id")

print(f"Active shallow feature & label DF shapes")
print(shallow_feature_df.shape)
print(shallow_label_df.shape)

sub_feature_ffp = base_dir / "data/records/sub_features.csv" 
sub_label_ffp = base_dir / "data/records/sub_labels.csv"

sub_feature_df = pd.read_csv(sub_feature_ffp, index_col="record_id")
sub_label_df = pd.read_csv(sub_label_ffp, index_col="record_id")
sub_record_ids = sub_feature_df.index.values.astype(str)

print("\nSubduction feature & label DF shapes")
print(sub_feature_df.shape)
print(sub_label_df.shape)

feature_df = pd.concat([shallow_feature_df, sub_feature_df])
label_df = pd.concat([shallow_label_df, sub_label_df])
label_df.drop(columns=["source_id", "event"], inplace=True)

print("\nCombined shapes")
print(feature_df.shape)
print(label_df.shape)
Active shallow feature & label DF shapes
(12816, 22)
(8467, 3)

Subduction feature & label DF shapes
(125, 22)
(126, 3)

Combined shapes
(12941, 22)
(8593, 2)

Active shallow only

Train on the active shallow records only and evaluate how the model performs on the subduction records

In [3]:
config = {
  "model": {
    "units": [15, 15],
    "act_funcs": "relu",
    "output_act_func": "sigmoid"
  },
  "training": {
    "optimizer": "Adam",
    "loss": "binary_crossentropy",
    "n_epochs": 60,
    "batch_size": 32,
    "dropout": 0.5
  },
  "preprocessing":{
    "deskew": "canterbury_wellington",
    "standardise": True,
    "whiten": True
  }
}

output_dir = base_dir / "gm_classifier/my_stuff/sub/shallow_only"

train_df, history, train_data, val_data = gm.training.run_trainining(
    output_dir, shallow_feature_df, shallow_label_df, config, val_split=0.1, 
    score_th=(0.51, 0.51), record_weight_fn=gm.training.score_weighting, verbose=0)

gm.validation.plot_loss(history, figsize=(16, 12))
print(f"Min-loss: {np.min(history['loss']):.5f}, Min-val-loss: {np.min(history['val_loss']):.5f}")

model = keras.models.load_model(output_dir / "model.h5")
Number of high quality records 4362, low quality records 4050
Number of total labelled records - 8412
Labelled data split into 7570 training and 842 validation samples
Min-loss: 0.10520, Min-val-loss: 0.14612
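
The training run writes the preprocessing parameters (mu.npy, sigma.npy, W.npy) to output_dir; the next cell reloads them so the subduction features are mapped into the same space as the shallow training data. Below is a minimal sketch of the standardise-and-whiten part of gm.pre.apply under that assumption; the deskew step is omitted and the exact order of operations inside the package is not verified here.

import numpy as np

def standardise_and_whiten(X, mu, sigma, W):
    """Hypothetical sketch: map raw features into the training feature space.

    mu, sigma -- per-feature mean and standard deviation from the training set
    W         -- whitening matrix estimated on the standardised training features
    """
    X_std = (X - mu) / sigma   # standardise with training-set statistics
    return X_std @ W           # decorrelate features with the whitening matrix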
In [5]:
sub_ids = np.intersect1d(sub_feature_df.index.values.astype(str), sub_label_df.index.values.astype(str))

y_train_est = gm.validation.label_from_prob(model.predict(train_data[0]).reshape(-1))
y_val_est = gm.validation.label_from_prob(model.predict(val_data[0]).reshape(-1))
print(f"Training F1: {metrics.f1_score(train_data[1], y_train_est)},\n"
      f"Validation F1: {metrics.f1_score(val_data[1], y_val_est)}")
print()

X_sub = gm.pre.apply(sub_feature_df.loc[sub_ids, gm.features.FEATURE_NAMES].values, "canterbury_wellington",
                     np.load(output_dir / "mu.npy"), np.load(output_dir / "sigma.npy"),
                     np.load(output_dir / "W.npy"))
y_sub, mask = gm.pre.get_label_from_score(sub_label_df.loc[sub_ids].score.values, 0.51, 0.51)
assert np.all(mask)

sub_est = model.predict(X_sub).reshape(-1) 
y_sub_est = gm.validation.label_from_prob(sub_est)
print(f"Subduction F1: {metrics.f1_score(y_sub, y_sub_est)}")

gm.validation.plot_confusion_matrix(y_sub, y_sub_est, title="Sub only - CM")
plt.show()

plt.figure(figsize=(12, 6))
plt.hist(sub_est, bins=20)
plt.title("Histogram of model probabilities")
plt.show()

plt.figure(figsize=(12, 6))
plt.hist(sub_est[y_sub_est != y_sub], bins=20)
plt.title("Histogram of model probabilities for misclassified records")
plt.show()
Training F1: 0.9490657793703609,
Validation F1: 0.9429824561403509

Applying canterbury_wellington deskew
Standardising input data
Whitening input data
Subduction F1: 0.6878980891719745
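
The F1 scores above are computed from hard labels, so gm.validation.label_from_prob presumably just thresholds the sigmoid output. A minimal sketch under that assumption (the 0.5 cut-off is a guess, not taken from the package):

def label_from_prob(probs, threshold=0.5):
    # Hypothetical: convert model probabilities to hard 0/1 quality labels
    return (probs >= threshold).astype(int)

Read together with the histograms above, this shows whether the misclassified subduction records sit close to the decision boundary or are assigned confidently wrong probabilities.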