```python
# Create a SimpleImputer object to impute missing values with the median
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

# Impute missing values in the numeric penguin measurement columns
penguins_df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']] = imputer.fit_transform(
    penguins_df[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
)

# Check for missing values after imputation
print("Number of NA's After\n", penguins_df.isnull().sum())
```

```python
# Create a SimpleImputer object to impute missing values with the most frequent value
imputer2 = SimpleImputer(strategy='most_frequent')

# Impute missing values in the 'sex' column
penguins_df['sex'] = imputer2.fit_transform(penguins_df[['sex']])

# Check for missing values after imputation
print("Number of NA's After\n", penguins_df.isnull().sum())
```

## Z-score method outlier removal

```python
from sklearn.datasets import load_diabetes
from scipy import stats
import numpy as np

# Load the dataset
diabetes = load_diabetes()

# Detect the outliers using the z-score method
z_scores = stats.zscore(diabetes.data)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
num_outliers = diabetes.target.shape[0] - sum(filtered_entries)

# Keep only the rows that pass the filter
diabetes.data = diabetes.data[filtered_entries]
diabetes.target = diabetes.target[filtered_entries]

print("Number of outliers filtered:", num_outliers)
print("Size of cleaned dataset:", diabetes.target.shape)
```

```python
import pandas as pd
from sklearn.datasets import load_diabetes
from scipy import stats
import numpy as np

diabetes = load_diabetes()
db_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
db_df.head()
```

```python
# Box plot of the 's1' feature to inspect potential outliers
db_df.s1.plot(kind='box');
```

```python
# Z-score outlier removal
diabetes = load_diabetes()

# Detect the outliers using the z-score method
z_scores = stats.zscore(diabetes.data)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
num_outliers = diabetes.target.shape[0] - sum(filtered_entries)
diabetes.data = diabetes.data[filtered_entries]
diabetes.target = diabetes.target[filtered_entries]

print("Number of outliers filtered:", num_outliers)
print("Size of cleaned dataset:", diabetes.target.shape)
```

The Z-score method computes the standard score (Z-score) of each feature in the dataset. The Z-score measures how many standard deviations a data point lies from the mean; any point whose absolute Z-score exceeds a chosen threshold (typically 3 or 4) is treated as an outlier and can be removed from the dataset.
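For reference, the Z-score of a value $x$ drawn from a feature with mean $\mu$ and standard deviation $\sigma$ is

$$
z = \frac{x - \mu}{\sigma},
$$

so the filter `(abs_z_scores < 3).all(axis=1)` used above keeps only the rows whose features all lie within three standard deviations of their column means.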
On the other hand, IsolationForest is a machine learning algorithm that identifies anomalies in the data, which are treated as outliers. It works by randomly selecting features and then randomly selecting split points between the maximum and minimum values of each selected feature. This process is repeated recursively until each data point is isolated in its own leaf node. Data points that require few splits to be isolated are considered to be outliers:

```python
# Isolation Forest outlier removal
from sklearn.ensemble import IsolationForest

# heart_data is the heart disease DataFrame loaded from 'data/heart_disease.csv'
heart_array = heart_data.values  # extract the array from the DataFrame

isolation_forest = IsolationForest(n_estimators=100, contamination=0.01)
isolation_forest.fit(heart_array)
outlier_indexes = isolation_forest.predict(heart_array) == -1
heart_data_no_outliers = heart_data[~outlier_indexes]

# Print the number of outliers detected and the size of the cleaned dataset
print("Number of outliers detected:", sum(outlier_indexes))
print("Size of cleaned dataset:", heart_data_no_outliers.shape)
```

```python
# Normalization and standardization
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the diabetes dataset from sklearn
diabetes_data = load_diabetes()
diabetes_df = pd.DataFrame(diabetes_data['data'], columns=diabetes_data['feature_names'])

# Perform z-score normalization on the 'age' column
z_scaler = StandardScaler()
diabetes_df['age_z'] = z_scaler.fit_transform(diabetes_df[['age']])

# Perform min-max scaling on the 'bmi' column
mm_scaler = MinMaxScaler()
diabetes_df['bmi_mm'] = mm_scaler.fit_transform(diabetes_df[['bmi']])

# Print the normalized and scaled data
print(diabetes_df[['age', 'age_z', 'bmi', 'bmi_mm']].head())
```

```python
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the diabetes dataset from sklearn
diabetes_data = load_diabetes()
diabetes_df = pd.DataFrame(diabetes_data['data'], columns=diabetes_data['feature_names'])

# Perform z-score normalization on the 'age' column
z_scaler = StandardScaler()
diabetes_df['age_z'] = z_scaler.fit_transform(diabetes_df[['age']])

# Perform min-max scaling on the 'bmi' column with a range of (-1, 1)
mm_scaler = MinMaxScaler(feature_range=(-1, 1))
diabetes_df['bmi_mm'] = mm_scaler.fit_transform(diabetes_df[['bmi']])

# Print the normalized and scaled data
print(diabetes_df[['age', 'age_z', 'bmi', 'bmi_mm']].head())
```

```python
# Label encoder
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
from scipy.sparse import SparseEfficiencyWarning

warnings.simplefilter('ignore', SparseEfficiencyWarning)

penguins_df = sns.load_dataset('penguins')
penguins_df.head()
```

```python
le = LabelEncoder()
# Fill the missing 'sex' values first, since LabelEncoder cannot handle NaN
penguins_df['sex'] = penguins_df['sex'].fillna(penguins_df['sex'].mode()[0])
penguins_df['sex_encod'] = le.fit_transform(penguins_df.sex)
penguins_df.head()
```

```python
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
from scipy.sparse import SparseEfficiencyWarning

warnings.simplefilter('ignore', SparseEfficiencyWarning)

# Load the penguins dataset from Seaborn
penguins_df = sns.load_dataset('penguins')

# Perform label encoding on the 'sex' column
# (missing values are filled first, since LabelEncoder cannot handle NaN)
penguins_df['sex'] = penguins_df['sex'].fillna(penguins_df['sex'].mode()[0])
label_encoder = LabelEncoder()
penguins_df['sex_encoded'] = label_encoder.fit_transform(penguins_df['sex'])

# Perform one-hot encoding on the 'island' column
one_hot_encoder = OneHotEncoder()
island_encoded = one_hot_encoder.fit_transform(penguins_df[['island']])
island_encoded_df = pd.DataFrame(island_encoded.toarray(),
                                 columns=[f'island {i}' for i in range(island_encoded.shape[1])])
penguins_df = pd.concat([penguins_df, island_encoded_df], axis=1)

# Print the encoded data
print(penguins_df[['sex', 'sex_encoded', 'island'] + [f'island {i}' for i in range(island_encoded.shape[1])]].head())
```

```python
from sklearn.model_selection import train_test_split

df = pd.read_csv('data/heart_disease.csv')
df.head()

X = df.drop('target', axis=1)
y = df.target

# Train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```
```python
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
scores = cross_val_score(dt, X, y, cv=10)
print(scores)
```

```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(max_iter=9000)
lr.fit(X_train, y_train)  # training phase
y_pred = lr.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('accuracy', accuracy)
```

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import classification_report, recall_score

# Load the dataset
breast = load_breast_cancer()

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(breast.data, breast.target,
                                                    train_size=0.8, random_state=42)

# Create a Logistic Regression model
model = LogisticRegression(max_iter=9000)

# Train the model on the training dataset
model.fit(X_train, y_train)

# Make predictions on the test dataset
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
```

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

breast = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(breast.data, breast.target,
                                                    test_size=0.2, random_state=42)

# Train LR model
model = LogisticRegression(max_iter=9000)
model.fit(X_train, y_train)

# Test LR model
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy", accuracy)

# Precision
precision = precision_score(y_test, y_pred)
print("Precision", precision)

# Recall
recall = recall_score(y_test, y_pred)
print("Recall", recall)

# F1-score
f1 = f1_score(y_test, y_pred)
print('f1', f1)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("confusion matrix:")
print(cm)

# Calculate and print the classification report of the model
report = classification_report(y_test, y_pred)
print(report)
```

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_curve, auc
import matplotlib.pyplot as plt

# Create a list of classification models to evaluate
models = [LogisticRegression(max_iter=9000),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          SVC()]

# Train and evaluate each model
for model in models:
    # Train the model on the training set
    model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = model.predict(X_test)

    # Calculate and print the classification report of the model
    report = classification_report(y_test, y_pred)
    print(model.__class__.__name__)
    print(report)
    print('-----------------------------------')
```
```python
from sklearn.metrics import classification_report, roc_curve, auc
import matplotlib.pyplot as plt

# Redefine the models so that SVC exposes predict_proba (probability=True),
# which is needed to compute the ROC curve
models = [
    LogisticRegression(max_iter=9000),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(probability=True)
]

# Train and evaluate each model, plotting its ROC curve
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(model.__class__.__name__)
    print(report)

    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='{} (AUC = {:.2f})'.format(model.__class__.__name__, roc_auc))
    print('-------------------------------------')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC AUC Curve')
plt.legend(loc='lower right')
plt.show()
```

```python
# Learning curve
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve

# Load the dataset
data = load_breast_cancer()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    test_size=0.2, random_state=42)

# Create a list of classification models to evaluate
models = [LogisticRegression(max_iter=8000),
          DecisionTreeClassifier(),
          RandomForestClassifier(),
          SVC(probability=True)]

# Plot the learning curves for all models in separate subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
axes = axes.flatten()

for model, ax in zip(models, axes):
    # Calculate learning curve
    train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train,
                                                            cv=5, scoring='accuracy', n_jobs=-1)

    # Calculate mean and standard deviation of training and testing scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot learning curve
    ax.plot(train_sizes, train_mean, label='Training Score')
    ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2)
    ax.plot(train_sizes, test_mean, label='Validation Score')
    ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2)
    ax.set_title('{} Learning Curve'.format(model.__class__.__name__))
    ax.set_xlabel('Training examples')
    ax.set_ylabel('Score')
    ax.legend(loc='best')

plt.tight_layout()
plt.show()
```
```python
# Imbalanced data
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import geometric_mean_score

# Generate imbalanced classification data
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=42)

# Define the models (only the SVC pipeline uses resampling here)
models = [make_imb_pipeline(RandomOverSampler(), RandomUnderSampler(), StandardScaler(), SVC()),
          make_pipeline(StandardScaler(), RandomForestClassifier()),
          make_pipeline(StandardScaler(), GradientBoostingClassifier())]

# Define the evaluation metric
scoring_metric = 'f1_macro'

# Evaluate each model using cross-validation
for model in models:
    scores = cross_val_score(model, X, y, scoring=scoring_metric, cv=5)
    print(f"{type(model[-1]).__name__} model performance:")
    print(f"Mean {scoring_metric} score: {scores.mean()}")
    print(f"Standard deviation of {scoring_metric} scores: {scores.std()}\n")
```

```python
# Option 2 for imbalanced data
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import geometric_mean_score

# Generate imbalanced classification data
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=42)

# Define the models (resampling applied to every pipeline)
models = [
    make_imb_pipeline(RandomOverSampler(), RandomUnderSampler(), StandardScaler(), SVC()),
    make_imb_pipeline(RandomOverSampler(), RandomUnderSampler(), StandardScaler(), RandomForestClassifier()),
    make_imb_pipeline(RandomOverSampler(), RandomUnderSampler(), StandardScaler(), GradientBoostingClassifier())
]

# Define the evaluation metric
scoring_metric = 'f1_macro'

# Evaluate each model using cross-validation
for model in models:
    scores = cross_val_score(model, X, y, scoring=scoring_metric, cv=5)
    print(f"{type(model[-1]).__name__} model performance:")  # name of the last estimator in the pipeline
    print(f"Mean {scoring_metric} score: {scores.mean()}")
    print(f"Standard deviation of {scoring_metric} scores: {scores.std()}\n")
```

## Regression models

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Impute missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define regression models
models = [
    LinearRegression(),
    Lasso(alpha=0.1),
    Ridge(alpha=0.1),
    RandomForestRegressor(n_estimators=100),
    SVR()
]

# Train and evaluate each model
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'{model.__class__.__name__} MSE: {mse:.4f}')
```
## After target normalization

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Load the diabetes dataset
diabetes = load_diabetes()

# Prepare the data
X = diabetes.data
y = diabetes.target

# Impute missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

# Normalize the features
scaler_X = StandardScaler()
X = scaler_X.fit_transform(X)

# Normalize the target variable
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define regression models
models = [
    LinearRegression(),
    Lasso(alpha=0.1),
    Ridge(alpha=0.1),
    RandomForestRegressor(n_estimators=100),
    SVR()
]

# Train and evaluate each model
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # Inverse transform predictions to get them back to the original scale
    y_pred_original_scale = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).flatten()
    mse_original_scale = mean_squared_error(scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten(),
                                            y_pred_original_scale)

    print(f'{model.__class__.__name__} MSE (Normalized): {mse:.4f}')
    print(f'{model.__class__.__name__} MSE (Original Scale): {mse_original_scale:.4f}\n')
```

1. Linear Regression: a basic linear regression model that fits a linear relationship between the input features and the target variable.
2. Lasso: a linear regression model that adds an L1 regularization term to the cost function to reduce overfitting. The penalty pushes coefficients toward zero (many exactly to zero), which can also help with feature selection.
3. Ridge: another linear regression model that adds an L2 regularization term to the cost function to reduce overfitting. The penalty discourages large coefficients; unlike Lasso, it shrinks coefficients without setting any exactly to zero, so it does not perform feature selection. The corresponding objective functions are sketched below.
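For reference, with feature matrix $X$, target vector $y$, coefficients $\beta$, and $n$ samples, the three objectives can be written (glossing over scikit-learn's exact scaling conventions) as:

$$
\text{OLS:}\quad \min_{\beta}\ \lVert y - X\beta \rVert_2^2
$$

$$
\text{Lasso:}\quad \min_{\beta}\ \frac{1}{2n}\lVert y - X\beta \rVert_2^2 + \alpha \lVert \beta \rVert_1
$$

$$
\text{Ridge:}\quad \min_{\beta}\ \lVert y - X\beta \rVert_2^2 + \alpha \lVert \beta \rVert_2^2
$$

Here $\alpha$ is the regularization strength passed as `alpha` in the code above; larger values shrink the coefficients more aggressively, and the $\ell_1$ penalty in Lasso is what allows some coefficients to become exactly zero.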
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor

# Load the dataset
diabetes = load_diabetes()

# Split into train and test sets
X = diabetes.data
y = diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = [make_pipeline(StandardScaler(), GradientBoostingRegressor()),
          make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=100)),
          make_pipeline(StandardScaler(), XGBRegressor(n_estimators=100))]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(type(model[-1]).__name__)  # name of the regressor at the end of the pipeline
    print("R2 score:", r2_score(y_test, y_pred))
    print("MSE:", mean_squared_error(y_test, y_pred))
```

```python
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X = data.data
y = data.target

model = RandomForestClassifier(n_estimators=50)
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=1)
print('Mean Accuracy : %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))
```

## Deep learning

```python
# ANN
import tensorflow as tf

x_data = [1.0, 2.0, 3.0]
y_data = [2.0, 4.0, 6.0]

w = tf.Variable(1.0, trainable=True, dtype=tf.float32)  # any random value

# Our model's forward pass
def forward(x):
    return x * w

# Loss function
def loss(x, y):
    y_pred = forward(x)
    return tf.square(y_pred - y)

# Before training
print("predict (before training)", 4, forward(4).numpy())

# Training
learning_rate = 0.01
optimizer = tf.optimizers.SGD(learning_rate)

for epoch in range(100):
    for x_val, y_val in zip(x_data, y_data):
        with tf.GradientTape() as tape:
            l = loss(x_val, y_val)
        gradients = tape.gradient(l, [w])
        optimizer.apply_gradients(zip(gradients, [w]))

# After training
print("predict (after training)", 4, forward(4).numpy())
```

```python
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow import keras

df = load_breast_cancer()
X = df.data
y = df.target

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define ANN
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
```
```python
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow import keras

data = load_breast_cancer()
X = data.data
y = data.target

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

# Define an artificial neural network model
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))
```

```python
# Predict class probabilities on the test set and convert them to class labels
y_pred = model.predict(X_test)
y_pred_cat = np.argmax(y_pred, axis=1)
y_pred_cat

y_test_cat = np.argmax(y_test, axis=1)
y_test_cat

print(classification_report(y_test_cat, y_pred_cat, target_names=data.target_names))
```

```python
# Train again, this time keeping the returned History object for plotting
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))
```

```python
import matplotlib.pyplot as plt

train_loss = history.history['loss']
val_loss = history.history['val_loss']
epoch = range(1, len(train_loss) + 1)

plt.plot(epoch, train_loss, label='Training Loss')
plt.plot(epoch, val_loss, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
```

The next snippet visualizes K-Means clustering results; it assumes that a preceding step (not shown here) has already reduced the diabetes data to two principal components (`X_pca`) and fitted K-Means to obtain the cluster `labels` and `centroids`.

```python
# Visualize the clustering results
# (X_pca, labels and centroids come from an earlier PCA + K-Means step)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', edgecolors='k', s=50)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.title('K-Means Clustering of Diabetes Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()
```

```python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

iris = load_iris()
X, y = iris.data, iris.target

pca = PCA(n_components=3)
X_iris = pca.fit_transform(X)

tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X)

fig, axis = plt.subplots(1, 2, figsize=(8, 5))
sns.scatterplot(x=X_iris[:, 0], y=X_iris[:, 1], hue=y, ax=axis[0])
sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=y, ax=axis[1])
axis[0].set_title("PCA")
axis[1].set_title("t-SNE")
plt.show()
```
st.selectbox("Thalassemia", [0,1,2,3]) input_data = { 'age': age, 'sex': 1 if sex== 'Male' else 0, 'cp' : cp, 'trestbps': trestbps, 'chol': chol, 'fbs': fbs, 'restecg': restecg, 'thalach':thalach, 'exang': exang, 'oldpeak': oldpeak, 'slope': slope, 'ca': ca, 'thal': thal } new_data = np.array(list(input_data.values())).reshape(1,-1) st.subheader("Feature Importance") feat_importance = pd.Series(rfc.feature_importances_, index=X.columns) feat_importance = feat_importance.sort_values(ascending=False) st.bar_chart(feat_importance) st.subheader("Exploratory Data Analysis") if st.checkbox('Show Data Summary'): st.write(heart_data.describe()) if st.checkbox('Show Data'): st.write(heart_data) if st.checkbox("Show Correlation Heatmap"): corr_matrix = heart_data.corr() fig,ax = plt.subplots(figsize=(12,12)) sns.heatmap(corr_matrix,annot=True, cmap='coolwarm') st.pyplot(fig) if st.button("Predict"): ypred = rfc.predict(new_data) if ypred[0] == 1: st.write("Prediction Result : Has Heart Disease") else: st.write("Prediction Result: No Heart Disease") # Define sidebar content st.sidebar.title("EPHI") #st.sidebar.image('logo.jpeg', width=150) st.sidebar.write("Made by Mesfin Diro") st.sidebar.write("Addis Ababa University") st.sidebar.write("Computational Data Science") %%writefile apps.py # step 1: loading libraries import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score, classification_report # 2 define how to load the dataset def load_data(): data = pd.read_csv('data/heart_disease.csv') return data # step 3: building a machine learning model or DL model heart_data = load_data() X = heart_data.drop('target', axis=1) y = heart_data['target'] X_train,X_test, y_train,y_test = train_test_split(X,y, test_size=0.3, random_state=42) rfc = RandomForestClassifier(n_estimators=100) rfc.fit(X_train,y_train) st.title("Heart Disease Prediction") age = st.slider('Age', 0, 100, 25) sex = st.selectbox('Sex', ['Male', 'Female']) cp = st.selectbox("Chest Pain Type", [0, 1, 2, 3]) trestbps = st.slider("Resting Blood Pressure(mmHg)", 0, 200, 120) chol = st.slider("Serum Cholesterol (mg/dl)", 0, 600, 200) fbs = st.selectbox("Fasting Blood Suger > 120 mg/dl", [0,1]) restecg = st.selectbox("Resting Electrocardiographic Results", [0,1,2]) thalach = st.slider("Maximum Heart Rate Achieved", 0,300,150) exang = st.selectbox("Exercise Induction Angina", [0,1]) oldpeak = st.slider("ST depression induced", 0.0, 6.2,3.1,0.1) slope = st.selectbox("Slope of the peak Ex. ST Segment", [0,1,2]) ca = st.selectbox("major vessels color by Flouroscopy",[0,1,2,3]) thal = st.selectbox("Thalassemia", [0,1,2,3]) input_data = { 'age': age, 'sex': 1 if sex=='Male' else 0, 'cp': cp, 'trestbps': trestbps, 'chol': chol, 'fbs': fbs, 'restecg': restecg, 'thalach': thalach, 'exang': exang, 'oldpeak': oldpeak, 'slope': slope, 'ca': ca, 'thal': thal } new_data = np.array(list(input_data.values())).reshape(1,-1) prediction = rfc.predict(new_data) st.write('Prediction', prediction[0]) # define slidebar content st.sidebar.title("EPHI") st.sidebar.write("Developed by collaburation effort team") st.sidebar.write("NDMC Department") about_part= st.sidebar.expander("EPHI", expanded=False) with about_part: st.write(''' ### About NDMC is blalala ### Thanks to ... 
with about_part:
    st.write('''
    ### About
    NDMC is ...
    ### Thanks to ...
    - [Anwar Taju](https://mesfind.github.io)
    ''')

st.subheader("Exploratory Analysis")
if st.checkbox("Show Data Summary"):
    st.write(heart_data.describe())
if st.checkbox("Show data"):
    st.write(heart_data)
if st.checkbox('Show Correlation Heatmap'):
    corr_matrix = heart_data.corr()
    fig, ax = plt.subplots(figsize=(12, 12))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    st.pyplot(fig)
if st.checkbox("Feature Importance"):
    feat_importance = pd.Series(rfc.feature_importances_, index=X.columns)
    feat_importance = feat_importance.sort_values(ascending=False)
    st.bar_chart(feat_importance)
```
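Once either version of `apps.py` has been written out by the `%%writefile` magic, the dashboard can be launched from a terminal with Streamlit's command-line runner, e.g. `streamlit run apps.py` (assuming Streamlit and the other imported packages are installed); Streamlit then prints a local URL where the app can be opened in the browser.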