# app.py
import streamlit as st
from streamlit_extras.let_it_rain import rain
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
)
import matplotlib.pyplot as plt
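
# PyPI packages behind the imports above: streamlit, streamlit-extras,
# pandas, numpy, scikit-learn, imbalanced-learn, matplotlib.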
# Initialize Streamlit page
st.set_page_config(page_title="ExplainablePredictor", layout="wide")
rain(emoji="💊", font_size=54, falling_speed=5, animation_length=0.5)
# Sidebar for data upload
st.sidebar.title("⚕️ Explainable Drug Safety Predictor")
st.sidebar.caption("Upload Data for Training")
uploaded_file = st.sidebar.file_uploader("Upload Data", type=["csv", "json"])
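# Expected schema (inferred from the preprocessing below): categorical
# "Ethnicity" and "Gender" columns, the seven side-effect/severity target
# columns listed in `target_labels`, and any remaining columns as numeric
# features.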
# Helper Functions
def preprocess_data(data, targets):
    """Encode the categorical columns and split off the feature matrix."""
    encoder = LabelEncoder()
    # fit_transform refits the encoder, so one instance serves both columns
    data["Ethnicity"] = encoder.fit_transform(data["Ethnicity"])
    data["Gender"] = encoder.fit_transform(data["Gender"])
    features = data.drop(columns=targets)
    return features
def train_model(features, target, params):
    """Train a RandomForest on a stratified split, oversampling only the
    training portion."""
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, stratify=target, test_size=0.3, random_state=42
    )
    # Oversample after splitting so duplicated minority-class rows cannot
    # leak into the test set and inflate the reported metrics.
    ros = RandomOverSampler(random_state=42)
    X_train, y_train = ros.fit_resample(X_train, y_train)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    model = RandomForestClassifier(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    return model, X_test, y_test, y_pred, y_proba
def display_results(tab, y_test, y_pred, y_proba):
    """Display model performance metrics."""
    with tab:
        st.write(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
        # roc_auc_score expects class probabilities, not hard predictions:
        # use the positive-class column for binary targets and one-vs-rest
        # averaging for multiclass targets.
        if y_proba.shape[1] == 2:
            auc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
        st.write(f"ROC AUC: {auc:.2f}")
        st.write(f"F1 Score: {f1_score(y_test, y_pred, average='macro'):.2f}")
        st.write(f"Precision: {precision_score(y_test, y_pred, average='macro'):.2f}")
        st.write(f"Recall: {recall_score(y_test, y_pred, average='macro'):.2f}")
def display_feature_importance(tab, model, feature_names):
    """Display feature importances in a ranked bar chart."""
    with tab:
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]
        st.write("Feature ranking:")
        for i in range(min(5, len(indices))):
            st.write(f"{i + 1}. {feature_names[indices[i]]} ({importances[indices[i]]:.4f})")
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.bar(range(len(feature_names)), importances[indices], align="center")
        # Label the bars with feature names rather than bare column indices
        ax.set_xticks(range(len(feature_names)))
        ax.set_xticklabels([feature_names[i] for i in indices], rotation=90)
        ax.set_xlabel("Feature")
        ax.set_ylabel("Importance Score")
        st.pyplot(fig)
# Main logic
if uploaded_file is not None:
    # Read the upload as CSV or JSON, matching the uploader's accepted types
    if uploaded_file.name.endswith(".json"):
        data = pd.read_json(uploaded_file)
    else:
        data = pd.read_csv(uploaded_file)
    st.write("DATA:")
    st.write(data)
    target_labels = ["Dizziness", "Fatigue", "Hypoglycemia", "Palpitations",
                     "Confusion", "Fainting", "Severity"]
    features = preprocess_data(data, target_labels)
    # Parameters for RandomForestClassifier, one set per target
    rf_params = [
        {"criterion": "entropy", "max_depth": 20, "n_estimators": 628},
        {"criterion": "gini", "max_depth": 21, "n_estimators": 554},
        {"criterion": "gini", "max_depth": 16, "n_estimators": 788},
        {"criterion": "gini", "max_depth": 26, "n_estimators": 589},
        {"criterion": "gini", "max_depth": 25, "n_estimators": 775},
        {"criterion": "gini", "max_depth": 54, "n_estimators": 276},
        {"criterion": "entropy", "max_depth": 21, "n_estimators": 998},
    ]
    columns = st.columns(2)
    for i, target in enumerate(target_labels):
        col = columns[i % 2]
        col.header(target)
        tab_result, tab_explain = col.tabs(["RESULTS", "EXPLANATIONS"])
        # Train one model per target column
        model, X_test, y_test, y_pred, y_proba = train_model(
            features, data[target], rf_params[i]
        )
        # Display results and explanations
        display_results(tab_result, y_test, y_pred, y_proba)
        display_feature_importance(tab_explain, model, features.columns)
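# Gentle hint while nothing has been uploaded yet.
else:
    st.info("Upload a CSV or JSON file in the sidebar to train the models.")

# Launch the app with:
#   streamlit run app.py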