# Create a SimpleImputer object to impute missing values with the mean
imputer = SimpleImputer(strategy='median') # mean

# Impute missing values in the 'age' and 'blood_pressure' columns
penguins_df[['bill_length_mm', 'bill_depth_mm','flipper_length_mm','body_mass_g']] = imputer.fit_transform(penguins_df[['bill_length_mm', 'bill_depth_mm','flipper_length_mm','body_mass_g']])

# Check for missing values after imputation
print("Number of NA's After\n",penguins_df.isnull().sum())


# Create a SimpleImputer object to impute missing values with the most frequent value
imputer2 = SimpleImputer(strategy='most_frequent')
# Impute missing values in the 'sex' column
penguins_df['sex'] = imputer2.fit_transform(penguins_df[['sex']])
# Check for missing values after imputation
print("Number of NA's After\n", penguins_df.isnull().sum())


## Z-score method outlier removal
from sklearn.datasets import load_diabetes
from scipy import stats
import numpy as np

# load the dataset
diabetes = load_diabetes()
# detect the outliers using z-score method
z_scores = stats.zscore(diabetes.data)
abs_z_scores = np.abs(z_scores)
filtered_enteries = (abs_z_scores < 3).all(axis=1)

num_outliers = diabetes.target.shape[0]- sum(filtered_enteries)

diabetes.data = diabetes.data[filtered_enteries]
diabetes.target = diabetes.target[filtered_enteries]

print("Number of outliers filtered:", num_outliers)
print("Size of cleaned dataset:",diabetes.target.shape)


from sklearn.datasets import load_diabetes
from scipy import stats
import numpy as np

diabetes = load_diabetes()

db_df = pd.DataFrame(diabetes.data,
                     columns=diabetes.feature_names)
db_df.head()

db_df.s1.plot(kind='box');


# Z-score outlier Removal 
diabetes = load_diabetes()
# Detect the outliers using z-score method
z_scores = stats.zscore(diabetes.data)
abs_z_scores = np.abs(z_scores)
filtered_enteries = (abs_z_scores < 3).all(axis=1)
num_outliers = diabetes.target.shape[0] - sum(filtered_enteries)
diabetes.data = diabetes.data[filtered_enteries]
diabetes.target = diabetes.target[filtered_enteries]


print("number of outliers filtered:", num_outliers)
print("Size of cleaned dataset:", diabetes.target.shape)


The Z-score method involves computing the standard score (Z-score) for each feature in the dataset. The Z-score represents the number of standard deviations that a data point is from the mean. Any data points with a Z-score greater than a specified threshold (typically 3 or 4) are considered outliers and can be removed from the dataset.


On the other hand, IsolationForest is a machine learning algorithm that works by identifying anomalies in the data, which are considered to be outliers. IsolationForest works by randomly selecting features and then randomly selecting split points between the maximum and minimum values of each selected feature. This process is repeated recursively until each data point is isolated into its own leaf node. Data points that require few splits to be isolated are considered to be outliers:

# Isolatin Forest outliers removal
heart_array = heart_data.values # extract the arrays from df
isolation_forest = IsolationForest(n_estimators=100, contamination=0.01)
isolation_forest.fit(heart_array)
outlier_indexes = isolation_forest.predict(heart_array)== -1
heart_data_no_outliers = heart_data[~outlier_indexes]
# Print the number of outliers detected and the size of the cleaned dataset
print("Number of outliers detected:", sum(outlier_indexes))
print("Size of cleaned dataset:", heart_data_no_outliers.shape)


# normalization and standardazation

import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the diabetes dataset from sklearn
diabetes_data = load_diabetes()
diabetes_df = pd.DataFrame(diabetes_data['data'], columns=diabetes_data['feature_names'])

# Perform z-score normalization on the 'age' column
z_scaler = StandardScaler()
diabetes_df['age_z'] = z_scaler.fit_transform(diabetes_df[['age']])

# Perform min-max scaling on the 'bmi' column
mm_scaler = MinMaxScaler()
diabetes_df['bmi_mm'] = mm_scaler.fit_transform(diabetes_df[['bmi']])

# Print the normalized and scaled data
print(diabetes_df[['age', 'age_z', 'bmi', 'bmi_mm']].head())


import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the diabetes dataset from sklearn
diabetes_data = load_diabetes()
diabetes_df = pd.DataFrame(diabetes_data['data'], columns=diabetes_data['feature_names'])

# Perform z-score normalization on the 'age' column
z_scaler = StandardScaler()
diabetes_df['age_z'] = z_scaler.fit_transform(diabetes_df[['age']])

# Perform min-max scaling on the 'bmi' column with range of -1,1
mm_scaler = MinMaxScaler(feature_range=(-1,1))
diabetes_df['bmi_mm'] = mm_scaler.fit_transform(diabetes_df[['bmi']])

# Print the normalized and scaled data
print(diabetes_df[['age', 'age_z', 'bmi', 'bmi_mm']].head())


#Label econder

import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
from scipy.sparse import SparseEfficiencyWarning
warnings.simplefilter('ignore', SparseEfficiencyWarning)

penguins_df = sns.load_dataset('penguins')
penguins_df.head()

le = LabelEncoder()
penguins_df['sex_encod'] = le.fit_transform(penguins_df.sex)
penguins_df.head()


import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
from scipy.sparse import SparseEfficiencyWarning
warnings.simplefilter('ignore', SparseEfficiencyWarning)

# Load the penguins dataset from Seaborn
penguins_df = sns.load_dataset('penguins')

# Perform label encoding on the 'sex' column
label_encoder = LabelEncoder()
penguins_df['sex_encoded'] = label_encoder.fit_transform(penguins_df['sex'])

# Perform one-hot encoding on the 'island' column
one_hot_encoder = OneHotEncoder()
island_encoded = one_hot_encoder.fit_transform(penguins_df[['island']])
island_encoded_df = pd.DataFrame(island_encoded.toarray(), columns=[f'island {i}' for i in range(island_encoded.shape[1])])
penguins_df = pd.concat([penguins_df, island_encoded_df], axis=1)

# Print the encoded data
print(penguins_df[['sex', 'sex_encoded', 'island'] + [f'island {i}' for i in range(island_encoded.shape[1])]].head())


from sklearn.model_selection import train_test_split
df = pd.read_csv('data/heart_disease.csv')
df.head()


X = df.drop('target',axis=1)
y = df.target
# train and test split
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    test_size=0.2,
                                                    random_state=42)


from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
scores = cross_val_score(dt, X, y, cv=10)
print(scores)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=9000)
lr.fit(X_train, Y_train) #Training phase
Y_pred=lr.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y_test, Y_pred)
print('accuracy',accuracy)


 from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter= 9000)
lr.fit(X_train, y_train) # training phase
y_pred = lr.predict(X_test)

from sklearn.metrics import accuracy_score
accuray = accuracy_score(y_test, y_pred)
print('accuray', accuray)

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import f1_score, confusion_matrix 
from sklearn.metrics import classification_report,recall_score

# load the datast
breast = load_breast_cancer()
# split the dataset into training and test sets

X_train, X_test, y_train, y_test = train_test_split(breast.data, 
                                                    breast.target,
                                                    train_size=0.8,
                                                   random_state=42)

# create a Logistic Regression model
model = LogisticRegression(max_iter=9000)

# Train and the model on training dataset
model.fit(X_train, y_train)
# make predictions on the test dataset
y_pred = model.predict(X_test)

# calaculate the accuraty of the out model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

breast = load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(breast.data,
                                                breast.target,
                                                test_size=0.2,
                                                random_state=42)
# Train LR model
model = LogisticRegression(max_iter=9000)
model.fit(X_train,y_train)
# Test LR model
y_pred = model.predict(X_test)
# accuracy
accuracy = accuracy_score(y_pred, y_test)
print("Accuracy", accuracy)
# precision
precision = precision_score(y_pred,y_test)
print("Precision", precision)
# Recall
recall = recall_score(y_test, y_pred)
print("Recall", recall)
# F1-score
f1 = f1_score(y_test, y_pred)
print('f1',f1)

# Confusion matrix
cm = confusion_matrix(y_pred, y_test)
print("confusion matrix:")
print(cm)

    
    # calculate and print the classification report of the model
report = classification_report(y_test, y_pred)
print(report)

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_curve, auc
import matplotlib.pyplot as plt

# Create a list of classification models to evaluate
models = [LogisticRegression(max_iter=9000), DecisionTreeClassifier(), RandomForestClassifier(), SVC()]

# Train and evaluate each model
for model in models:
    # Train the model on the training set
    model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = model.predict(X_test)

    # Calculate and print the classification report of the model
    report = classification_report(y_test, y_pred)
    print(model.__class__.__name__)
    print(report)
    print('-----------------------------------')
    

from sklearn.metrics import classification_report, roc_curve, auc
import matplotlib.pyplot as plt
# Train and evaluate each model
for model in models:
    # Train the model on the training set
    model.fit(X_train, y_train)
    report = classification_report(y_test, y_pred)
    print(model.__class__.__name__)
    print(report)
    y_score = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='{} (AUC = {:.4f})'.format(model.__class__.__name__, roc_auc))

    print('-----------------------------------')


from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

models = [ LogisticRegression(max_iter=9000),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          SVC(probability=True)      
         ]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(model.__class__.__name__)
    print(report)
    y_score = model.predict_proba(X_test)[:,1]
    fpr, tpr,_ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='{} (AUC= {:.2f})'.format(
        model.__class__.__name__, roc_auc))
    print('-------------------------------------')
plt.plot([0,1],[0,1],'k--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC AUC Curve')
plt.legend(loc='lower right')
plt.show()


# learning curve
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve

# Load the dataset
data = load_breast_cancer()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

# Create a list of classification models to evaluate
models = [LogisticRegression(max_iter=8000), DecisionTreeClassifier(), RandomForestClassifier(), SVC(probability=True)]

# Plot the learning curves for all models in separate subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
axes = axes.flatten()

for model, ax in zip(models, axes):
    # Calculate learning curve
    train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)

    # Calculate mean and standard deviation of training and testing scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot learning curve
    ax.plot(train_sizes, train_mean, label='Training Score')
    ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2)
    ax.plot(train_sizes, test_mean, label='Validation Score')
    ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2)

    ax.set_title('{} Learning Curve'.format(model.__class__.__name__))
    ax.set_xlabel('Training examples')
    ax.set_ylabel('Score')
    ax.legend(loc='best')

plt.tight_layout()
plt.show()


#imbalanced data
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.pipeline import make_pipeline as make_imb_pipline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import geometric_mean_score

X,y = make_classification(n_classes=2, class_sep=2,
                          weights=[0.9,0.1], n_informative=3,
                          n_redundant=1, flip_y=0, n_features=20,
                          n_clusters_per_class=1, n_samples=1000,
                          random_state=42)


# define the models
models = [make_imb_pipline(RandomOverSampler(), 
                           RandomUnderSampler(),
                           StandardScaler(), SVC()),
          make_pipeline(StandardScaler(),
                        RandomForestClassifier()),
          make_pipeline(StandardScaler(),
                        GradientBoostingClassifier())
            ]
scoring_metric = 'f1_macro'
for model in models:
    scores = cross_val_score(model, X,y, scoring=scoring_metric,
                             cv=5)
    print(f"{type(model[-1]).__name__} model performance:")
    print(f"Mean {scoring_metric} score:{scores.mean()}")
    print(f"Standard deviation of {scoring_metric} scores: {scores.std()}\n")

# option2 for imbalaced data

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import geometric_mean_score

# Generate imbalanced classification data
X, y = make_classification(n_classes=2, class_sep=2,
                           weights=[0.9, 0.1], n_informative=3,
                           n_redundant=1, flip_y=0, n_features=20,
                           n_clusters_per_class=1, n_samples=1000,
                           random_state=42)

# Define the models
models = [
    make_imb_pipeline(RandomOverSampler(), RandomUnderSampler(),
                      StandardScaler(), SVC()),
    make_imb_pipeline(RandomOverSampler(), RandomUnderSampler(),
                      StandardScaler(), RandomForestClassifier()),
    make_imb_pipeline(RandomOverSampler(), RandomUnderSampler(),
                      StandardScaler(), GradientBoostingClassifier())
]

# Define the evaluation metric
scoring_metric = 'f1_macro'

# Evaluate each model using cross-validation
for model in models:
    scores = cross_val_score(model, X, y, scoring=scoring_metric, cv=5)
    print(f"{type(model[-1]).__name__} model performance:")  # get the name of the last estimator in the pipeline
    print(f"Mean {scoring_metric} score: {scores.mean()}")
    print(f"Standard deviation of {scoring_metric} scores: {scores.std()}\n")


## Regression models
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
diabetes = load_diabetes()

X = diabetes.data
y = diabetes.target


imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)


X_train, X_test, y_train, y_test=train_test_split(X,y,
                                                test_size=0.2,
                                                random_state=0)
models = [
    LinearRegression(), Lasso(alpha=0.1), Ridge(0.1),
    RandomForestRegressor(n_estimators=100), SVR()
]
for model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_pred, y_test)
    print(f'{model.__class__.__name__} MSE: {mse:.4f}')
    
    
    ## after target normalazation
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Load the diabetes dataset
diabetes = load_diabetes()

# Prepare the data
X = diabetes.data
y = diabetes.target

# Impute missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

# Normalize the features
scaler_X = StandardScaler()
X = scaler_X.fit_transform(X)

# Normalize the target variable
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define regression models
models = [
    LinearRegression(), Lasso(alpha=0.1), Ridge(alpha=0.1),
    RandomForestRegressor(n_estimators=100), SVR()
]

# Train and evaluate each model
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    # Inverse transform predictions to get them back to the original scale
    y_pred_original_scale = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
    mse_original_scale = mean_squared_error(scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten(), y_pred_original_scale)
    
    print(f'{model.__class__.__name__} MSE (Normalized): {mse:.4f}')
    print(f'{model.__class__.__name__} MSE (Original Scale): {mse_original_scale:.4f}\n')


	1. Linear Regression: This is a basic linear regression model that tries to fit a linear relationship between the input features and the target variable.
2. Lasso: This is a type of linear regression model that adds an L1 regularization term to the cost function to reduce overfitting. The regularization term penalizes the model for having too many non-zero coefficients and can help with feature selection.
3. Ridge: This is another type of linear regression model that adds an L2 regularization term to the cost function to reduce overfitting. The regularization term penalizes the model for having large coefficients and can help with feature selection.


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
## loading the datasets
diabetes = load_diabetes()
# split the train and test prop.
X = diabetes.data
y = diabetes.target
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.2,
                                                  random_state=42)
models = [make_pipeline(StandardScaler(), GradientBoostingRegressor()),
          make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=100)),
          make_pipeline(StandardScaler(), XGBRegressor(n_estimators=100))]

for model in models:
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(type(model).__name__)
    print("R2 score:", r2_score(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test,y_pred))
    

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target
model = RandomForestClassifier(n_estimators=50)
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X,y,scoring='accuracy', cv=cv, n_jobs=1)
print('Mean Accuracy : %.3f  (%.3f)' % (np.mean(n_scores),np.std(n_scores)))


## Deep learning
# ANN
import tensorflow as tf

x_data = [1.0, 2.0, 3.0]
y_data = [2.0, 4.0, 6.0]

w = tf.Variable(1.0, trainable=True, dtype=tf.float32)  # Any random value

# our model forward pass
def forward(x):
    return x * w

# Loss function
def loss(x, y):
    y_pred = forward(x)
    return tf.square(y_pred - y)

# Before training
print("predict (before training)", 4, forward(4).numpy())

# Training
learning_rate = 0.01
optimizer = tf.optimizers.SGD(learning_rate)

for epoch in range(100):
    for x_val, y_val in zip(x_data, y_data):
        with tf.GradientTape() as tape:
            l = loss(x_val, y_val)
        gradients = tape.gradient(l, [w])
        optimizer.apply_gradients(zip(gradients, [w]))

# After training
print("predict (after training)", 4, forward(4).numpy())


import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow import keras

df = load_breast_cancer()
X = df.data
y = df.target
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train,X_test, y_train, y_test = train_test_split(X,y, 
                                                   test_size=0.2, 
                                                   random_state=0)
# define ANN
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))
# compile model
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuray'])

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow import keras

data  = load_breast_cancer()
X = data.data
y = data.target

# Standrdize the features

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,
                                                   random_state=42)
y_train, y_test = to_categorical(y_train),to_categorical(y_test)

# Define an artificial neural network model

model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1],activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))

# compile model

model.compile(loss='categorical_crossentropy',
              optimizer='adam',metrics=['accuracy'])


print(model.summary())

model.fit(X_train,y_train, epochs=50,
          batch_size = 32,
          validation_data=(X_test,y_test))

y_pred_cat = np.argmax(y_pred, axis=1)
y_pred_cat


y_test_cat = np.argmax(y_test, axis=1)
y_test_cat


print(classification_report(y_test_cat,
                            y_pred_cat,target_names=data.target_names))
                       

history = model.fit(X_train,y_train, epochs=50,
          batch_size = 32,
          validation_data=(X_test,y_test))

import matplotlib.pyplot as plt
train_loss = history.history['loss']
val_loss = history.history['val_loss']
epoch = range(1, len(train_loss)+1)
plt.plot(epoch, train_loss, label='Training Loss')
plt.plot(epoch, val_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()     
                            
                            
# Visualize the clustering results
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis',
            edgecolors='k', s=50)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red',
            marker='X', s=200, label='Centroids')
plt.title('K-Means Clustering of Diabetes Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
iris = load_iris()
X,y = iris.data, iris.target
pca = PCA(n_components=3)
X_iris = pca.fit_transform(X)
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X)
fig, axis = plt.subplots(1, 2, figsize=(8, 5))
sns.scatterplot(x=X_iris[:, 0], y=X_iris[:,1], hue=y, ax=axis[0])
sns.scatterplot(x=X_tsne[:,0], y= X_tsne[:,1], hue=y, ax=axis[1])
axis[0].set_title("PCA")
axis[1].set_title("t-SNE")
plt.show()


%writefile apps.py
# step 1: loading libraries
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 2 define how to load the dataset
def load_data():
    data = pd.read_csv('data/heart_disease.csv')
    return data
# step 3: building a machine learning model or DL model
heart_data = load_data()
X = heart_data.drop('target', axis=1)
y = heart_data['target']

X_train,X_test, y_train,y_test = train_test_split(X,y, test_size=0.3, 
                                                  random_state=42)
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train,y_train)

st.title("Heart Disease Prediction")

st.title("Heart Disease Prediction")

age = st.slider('Age',0,100,25)
sex = st.selectbox('Sex',['Male','Female'])
cp = st.selectbox("Chest pain type",[0,1,2,3])
trestbps = st.slider('Resting Blood Pressure(mmHg)', 0, 200, 120)
chol = st.slider("Serum Cholesterol (mm/dl)",0, 600, 200)
fbs = st.selectbox('Fasting Blood Suger > 120 mg//dl', [0,1])
restecg = st.selectbox("Resting Electrocardiogrphic Result", [0,1,2])
thalach = st.slider("Maximum Heart rate achieved", 0, 300, 150)
exang = st.selectbox("Exercise induced angina",[ 0, 1])
oldpeak = st.slider("ST Depression induced by execise relative to rest",
                   0.0, 6.2,3.1,0.1)
slope = st.selectbox("Slope of the peak exercise ST segment", [0, 1,2])
ca = st.selectbox("Number, of major vessels colored by flouroscopy",
                 [0, 1, 2,3])
thal = st.selectbox("Thalassemia", [0,1,2,3])

input_data = {
    'age': age,
    'sex': 1 if sex== 'Male' else 0,
    'cp' : cp,
    'trestbps': trestbps,
    'chol': chol,
    'fbs': fbs,
    'restecg': restecg,
    'thalach':thalach,
    'exang': exang,
    'oldpeak': oldpeak,
    'slope': slope,
    'ca': ca,
    'thal': thal
    
}
new_data = np.array(list(input_data.values())).reshape(1,-1)

st.subheader("Feature Importance")
feat_importance = pd.Series(rfc.feature_importances_, index=X.columns)
feat_importance = feat_importance.sort_values(ascending=False)
st.bar_chart(feat_importance)

st.subheader("Exploratory Data Analysis")
if st.checkbox('Show Data Summary'):
    st.write(heart_data.describe())
    
if st.checkbox('Show Data'):
    st.write(heart_data)
    
if st.checkbox("Show Correlation Heatmap"):
    corr_matrix = heart_data.corr()
    fig,ax = plt.subplots(figsize=(12,12))
    sns.heatmap(corr_matrix,annot=True, cmap='coolwarm')
    st.pyplot(fig)
    
if st.button("Predict"):
    ypred = rfc.predict(new_data)
    if ypred[0] == 1:
        st.write("Prediction Result : Has Heart Disease")
    else:
        st.write("Prediction Result: No Heart Disease")

# Define sidebar content
st.sidebar.title("EPHI")
#st.sidebar.image('logo.jpeg', width=150)
st.sidebar.write("Made by Mesfin Diro")
st.sidebar.write("Addis Ababa University")
st.sidebar.write("Computational Data Science")


%%writefile apps.py
# step 1: loading libraries
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 2 define how to load the dataset
def load_data():
    data = pd.read_csv('data/heart_disease.csv')
    return data
# step 3: building a machine learning model or DL model
heart_data = load_data()
X = heart_data.drop('target', axis=1)
y = heart_data['target']

X_train,X_test, y_train,y_test = train_test_split(X,y, test_size=0.3, 
                                                  random_state=42)
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train,y_train)

st.title("Heart Disease Prediction")

age = st.slider('Age', 0, 100, 25)
sex = st.selectbox('Sex', ['Male', 'Female'])
cp = st.selectbox("Chest Pain Type", [0, 1, 2, 3])
trestbps = st.slider("Resting Blood Pressure(mmHg)", 0, 200, 120)
chol = st.slider("Serum Cholesterol (mg/dl)", 0, 600, 200)
fbs = st.selectbox("Fasting Blood Suger > 120 mg/dl", [0,1])
restecg = st.selectbox("Resting Electrocardiographic Results", [0,1,2])
thalach = st.slider("Maximum Heart Rate Achieved", 0,300,150)
exang = st.selectbox("Exercise Induction Angina", [0,1])
oldpeak = st.slider("ST depression induced", 0.0, 6.2,3.1,0.1)
slope = st.selectbox("Slope of the peak Ex. ST Segment", [0,1,2])
ca = st.selectbox("major vessels color by Flouroscopy",[0,1,2,3])
thal = st.selectbox("Thalassemia", [0,1,2,3])

input_data = { 'age': age,
               'sex': 1 if sex=='Male' else 0,
               'cp': cp,
               'trestbps': trestbps,
               'chol': chol,
               'fbs': fbs,
               'restecg': restecg,
               'thalach': thalach,
               'exang': exang,
               'oldpeak': oldpeak,
               'slope': slope, 
               'ca': ca,
               'thal': thal
              
             }
new_data = np.array(list(input_data.values())).reshape(1,-1)
prediction = rfc.predict(new_data)
st.write('Prediction', prediction[0])
# define slidebar content
st.sidebar.title("EPHI")
st.sidebar.write("Developed by collaburation effort team")
st.sidebar.write("NDMC Department")
about_part= st.sidebar.expander("EPHI", expanded=False)
with about_part:
    st.write('''
    ### About
    NDMC is blalala
    ### Thanks to ...
    - [Anwar Taju](https;//mesfind.github.io)
    ''')
st.subheader("Explanatory Analysis")
if st.checkbox("Show Data Summary"):
    st.write(heart_data.describe())

if st.checkbox("Show data"):
    st.write(heart_data)
if st.checkbox('Show correlation Heatmap'):
    corr_matrix = heart_data.corr()
    fig,ax = plt.subplots(figsize=(12,12))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    st.pyplot(fig)
if st.checkbox("Feature Importance"):
    feat_importance = pd.Series(rfc.feature_importances_, index=X.columns)
    feat_importance = feat_importance.sort_values(ascending=False)
    st.bar_chart(feat_importance)