Welcome to The Carpentries Etherpad!

This pad is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents.

Use of this service is restricted to members of The Carpentries community; this is not for general purpose use (for that, try https://etherpad.wikimedia.org).

Users are expected to follow our code of conduct: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html

All content is publicly available under the Creative Commons Attribution License: https://creativecommons.org/licenses/by/4.0/

----------------------------------------------------------------------------

*Welcome to the Workshop!

*Links
* Curriculum: https://uw-madison-datascience.github.io/machine-learning-novice-sklearn/
* Setup: https://uw-madison-datascience.github.io/machine-learning-novice-sklearn/setup.html
* Coding Meetup: https://hub.datascience.wisc.edu/consultation/#dropin
* Daily feedback: https://forms.gle/VwqnpE6TtKpZ5iZv9
* Slides
  * Day 1: https://docs.google.com/presentation/d/1AESLK4L_fEC1LQ6Fm3BeNjRMLWU539fP89KIg8t6l-c/edit?usp=sharing
  * Follow-up resources: https://docs.google.com/presentation/d/1LWUPJ56NkiKnTWZ2g7DylVJEC5AUrquNOpN5nuCslRk/edit?usp=sharing
  * Day 2 slides: https://docs.google.com/presentation/d/1Wdqdojf346CGugh0HaRyKQh2fCsuIXMVbIeiS-7EuTc/edit?usp=sharing
  * Wrap-up slides: https://docs.google.com/presentation/d/1LWUPJ56NkiKnTWZ2g7DylVJEC5AUrquNOpN5nuCslRk/edit?usp=sharing

*Sign-in
Name, Program/Department/Organization, one thing you are interested in learning today or what motivated you to register for this workshop.

Note: If you haven't completed the workshop setup (you should have a folder with a virtual environment folder inside: Desktop/workshop-ml/intro_ml), please do so now.
Setup: https://uw-madison-datascience.github.io/machine-learning-novice-sklearn/setup.html

* Chris Endemann (he/him), Data Science Hub + ML+X community leader, instructor
* Ryan Bemowski (he/him), Data Science Hub, helper - refresh sklearn knowledge
* Caitlin Krause (she/her), AbbVie, interested in gaining ML skills
* Zekai Otles (he/him), DoIT Research Cyberinfrastructure
* Moshi Fu (he/him), undergraduate at UWM, interested in gaining ML skills
* Trisha Adamus (she/her), Ebling Library
* Mikki Wilburn (she/her), Agroecology MS student, want to apply ML to my own research
* Todd Hayes (he/him), SMPH-DOM, tools to implement ML at my work
* Brian Shore, Geography, general interest
* Lourdes Pratt (she/her), Data Science, AbbVie, interested in learning more ML skills and how they can be applied to large data sets
* Dariane Drake (she/her), DoIT/AT/LACE, want to apply ML to T&L data in my work
* Stella Guan, undergraduate at UW-Madison, interested in ML
* Sarah Graves (she/her), Forest and Wildlife Ecology, best practices and starting code for ML
* Thomas Nipper, MMI, applying ML to unannotated transcript classification
* Chris Lalande (she/her), DoIT Academic Technology, LACE. Getting hands-on with some ML, it's been years.
* Maddie Topf (she/her), PhD student in Microbiology, future job training
* Adam Ross Nelson (he/him), PhD UW alum, Teaching Professor, Data Science / Human Behavior (Psych)
* Kaleb Chudacoff (they/he), non-traditional Data Science student, job training

*Notes - Day 1

Virtual environment folder:
* Desktop/workshop-ml/intro_ml

jupyterlab installs as one word, but the command to launch it is two words: "jupyter lab"

!pip install seaborn

import seaborn as sns
dataset = sns.load_dataset("penguins")
print(dataset.shape)  # gives (rows, columns)
dataset.head()        # gives the first five rows

dataset.dropna(inplace=True)
dataset.head()
dataset.shape

Output: (333, 7)

import matplotlib.pyplot as plt

train_data = dataset[:146]  # first 146 rows
x_train = train_data['body_mass_g']
y_train = train_data['bill_depth_mm']

plt.scatter(x_train, y_train)
plt.xlabel('mass g')
plt.ylabel('depth mm')

x_train.shape

Output: (146,)

import numpy as np
x_train = np.array(x_train).reshape(-1, 1)
x_train.shape  # now a 2-D array; scikit-learn expects inputs/outputs to be 2-D
y_train = np.array(y_train).reshape(-1, 1)
y_train.shape

Output: (146, 1)

from sklearn.linear_model import LinearRegression

# define our "estimator" / "model"
model = LinearRegression(fit_intercept=True)

# train our model using the training data
lin_regress = model.fit(x_train, y_train)

# inspect the trained parameters
m = lin_regress.coef_
c = lin_regress.intercept_
print('linear coefs = ', m, c)

Output: linear coefs = [[0.00154247]] [12.63062456]

import math
from sklearn.metrics import mean_squared_error

# generate model predictions
y_train_pred = lin_regress.predict(x_train)

# calculate the error (root mean squared error)
error = math.sqrt(mean_squared_error(y_train, y_train_pred))
print('train RMSE = ', error)

Output: train RMSE = 0.989750418206048

y_train_pred

plt.scatter(x_train, y_train, label='input')
plt.plot(x_train, y_train_pred, '-', label='fit')
plt.xlabel('body mass g')
plt.ylabel('bill depth mm')
plt.legend()

# remaining observations for testing
test_data = dataset[146:]
x_test = test_data['body_mass_g']  # why lowercase x rather than capital X? Because x is one-dimensional (we have just one predictor as input to our model)
y_test = test_data['bill_depth_mm']
x_test = np.array(x_test).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

y_test_pred = lin_regress.predict(x_test)
test_error = math.sqrt(mean_squared_error(y_test, y_test_pred))
print('test RMSE =', test_error)

Output: test RMSE = 4.4456866

plt.scatter(x_train, y_train, label='input')
plt.scatter(x_test, y_test, label='test')
plt.plot(x_train, y_train_pred, '-', label='fit')
plt.xlabel('body mass g')
plt.ylabel('bill depth mm')
plt.legend()

Key learning point: This first attempt does not use randomization. The dataset seems to be ordered by species, so our test set is biased.

from sklearn.model_selection import train_test_split

x = dataset['body_mass_g']
y = dataset['bill_depth_mm']
x = np.array(x).reshape(-1, 1)
y = np.array(y).reshape(-1, 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
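A quick way to confirm the ordering problem described in the key learning point above (a hedged check, not part of the lesson notes): look at which species land in each slice of the unshuffled 146/187 split.

print(dataset['species'].iloc[:146].value_counts())  # training slice: dominated by a single species
print(dataset['species'].iloc[146:].value_counts())  # test slice: the remaining species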
**Exercise**: Try to re-implement our univariate regression model using these new train/test sets. Follow these steps:
1. Define your estimator model
2. Train the model using .fit()
3. Get predictions from the model using .predict()
4. Calculate RMSE for train/test
5. Plot a scatter plot of the train/test data, with the line of best fit

model = LinearRegression(fit_intercept=True)
lin_regress = model.fit(x_train, y_train)

y_train_pred = lin_regress.predict(x_train)
train_error = math.sqrt(mean_squared_error(y_train, y_train_pred))
print("train RMSE =", train_error)

# get predictions and calculate RMSE for the test data
y_test_pred = lin_regress.predict(x_test)
test_error = math.sqrt(mean_squared_error(y_test, y_test_pred))
print("test RMSE =", test_error)

# scatter plot
plt.scatter(x_train, y_train, label='input')
plt.scatter(x_test, y_test, label='test')
plt.plot(x_train, y_train_pred, '-', label='fit')
# plt.plot(x_train, y_train_pred, "rx", label="predictions")
plt.xlabel('body mass g')
plt.ylabel('bill depth mm')
plt.legend()
plt.show()

sns.pairplot(dataset, vars=['body_mass_g', 'bill_depth_mm'], hue='species', diag_kind='kde', markers=['o', 's', 'D'])

Density/distribution plots appear on the diagonal.

Key learning point: Even with randomized train/test sets, the features chosen (body mass and bill depth) do not uniquely separate all species. This is partially because two of the species share similar body mass and bill depth, while the third group is quite different.

New notebook: DS Hub SkLearn Live Space.ipynb

import seaborn as sns  # data viz library; comes with sandbox datasets

dataset = sns.load_dataset('penguins')
dataset.head()
dataset.sample(3)

feature_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
dataset.dropna(subset=feature_names, inplace=True)  # the inplace=True parameter in dropna modifies the DataFrame in place without creating a new DataFrame
class_names = dataset['species'].unique()
X = dataset[feature_names]
y = dataset['species']

# look at X and y to check our work
X.sample(3)
y.sample(3)

# What else might we want to do here before moving forward? Plot some variables!
sns.pairplot(dataset, hue='species')

# Why scale features? A quick aside.
Scaling in general is used for two main purposes:
(1) Put all features on the same scale so you can interpret coefficients (e.g., regression coefs) on the same scale.
(2) In neural nets, a specific form of standardization known as "normalization" is applied, which scales features to the range 0-1. This is helpful because neural nets converge faster when features are scaled between 0 and 1.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# we should probably stratify the train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

Stratification in train/test splits ensures that the distribution of certain data characteristics (e.g., class labels in classification tasks) is preserved between the training and testing datasets. This is particularly useful when dealing with imbalanced datasets, where one class has significantly more examples than another.
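To see what stratify=y actually changes (a hedged check, not from the lesson): compare class proportions in the full dataset and in the training split.

print(y.value_counts(normalize=True))        # species proportions in the full dataset
print(y_train.value_counts(normalize=True))  # with stratify=y, these should closely match the proportions above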
import matplotlib.pyplot as plt

fig01 = sns.scatterplot(X_train, x=feature_names[0], y=feature_names[1], hue=dataset['species'])
plt.show()

dataset.sample(3)
sns.scatterplot(data=dataset, x='bill_length_mm', y='bill_depth_mm', hue='species')

# Looking at the tree diagram: back to the first exercise in our intro slides, the order of the questions and the branching points form the "trainable params" of the model

from sklearn.tree import DecisionTreeClassifier, plot_tree

clf = DecisionTreeClassifier(max_depth=2)  # shallower trees are easier to interpret; deeper trees improve accuracy
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred

# How would you measure how "good" these predictions are? Compare them to the observations from the data.
# We could calculate the percentage of correct predictions.

# small example, working up
y_pred[:5]
y_test[:5]
(y_pred[:5] == y_test[:5]).mean()  # gives the proportion of correct predictions
(y_pred == y_test).mean()
clf.score(X_test, y_test)

fig = plt.figure(figsize=(12, 10))
plot_tree(clf, class_names=class_names, feature_names=feature_names, filled=True, ax=fig.gca())

# Is there a rule of thumb for how many layers are acceptable without overfitting?
The more data you have, the more "trainable parameters" you can estimate without running into overfitting effects. As we reduce our data size, it becomes more and more likely that the model will simply memorize the data. If you'd like a general rule of thumb, aim for 10x as many observations as you have trainable parameters (# of split points in the tree). This rule of thumb is only intended as a guidepost; the actual threshold varies with data quality, variability in the data, etc.

from sklearn.inspection import DecisionBoundaryDisplay

f1 = feature_names[0]  # happens to be bill_length_mm
f2 = feature_names[3]  # happens to be body_mass_g

clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train[[f1, f2]], y_train)

d = DecisionBoundaryDisplay.from_estimator(clf, X_train[[f1, f2]])
sns.scatterplot(X_train, x=f1, y=f2, hue=y_train, palette="husl")
plt.show()

Trainable params = thresholds decided by the tree nodes, informed by the training data.
Hyperparameters = things that control the structure of the tree before we let it "learn" from the data.

# search for the optimal hyperparameters
import pandas as pd

max_depths = [1, 2, 3, 4, 5]
accuracies = []
for d in max_depths:
    clf = DecisionTreeClassifier(max_depth=d)
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    accuracies.append((d, acc))

acc_df = pd.DataFrame(accuracies, columns=['depth', 'accuracy'])
sns.lineplot(acc_df, x='depth', y='accuracy')
plt.xlabel('Tree depth')
plt.ylabel('Accuracy')
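The loop above searches max_depth by hand. scikit-learn's GridSearchCV can run the same search with cross-validation on the training set; a minimal sketch (not part of the lesson), reusing the X_train/y_train defined above:

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(DecisionTreeClassifier(random_state=0),
                    param_grid={'max_depth': [1, 2, 3, 4, 5]},
                    cv=5)  # 5-fold cross-validation within the training set
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)  # best depth and its mean cross-validated accuracy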
*Notes - Day 2

*Sign-in
Name, Program/Department/Organization, any questions you have about yesterday's materials.

* Chris Endemann (he/him), Data Science Hub + ML+X community leader, instructor
* Ryan Bemowski (he/him), Data Science Hub, helper
* Zekai Otles (he/him), DoIT Research Cyberinfrastructure
* Chris Lalande (she/her), DoIT Academic Technology, LACE
* Mikki Wilburn (she/her), Agroecology MS Program
* Lourdes Pratt (she/her), Data Science, AbbVie
* Sarah Graves (she/her), Forest and Wildlife Ecology
* Thomas Nipper (he/him), Microbiology
* Maddie Topf (she/her), Microbiology
* Todd Hayes (he/him), SMPH-DOM

Open jupyter lab. Virtual environment folder (default):
* Desktop/workshop-ml/intro_ml

source intro_ml/bin/activate

Use the intro_ml kernel within jupyter lab. Open a new notebook.

# Copy from lesson plan (https://uw-madison-datascience.github.io/machine-learning-novice-sklearn/03-classification/index.html):

# 1) LOAD DATA (if not loaded already)
import seaborn as sns
dataset = sns.load_dataset('penguins')
dataset.head()

# 2) Extract the data we need and drop NaNs (if not done already)
feature_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
dataset.dropna(subset=feature_names, inplace=True)
class_names = dataset['species'].unique()
X = dataset[feature_names]
y = dataset['species']

# 3) ADD RANDOM NOISE TO X
import numpy as np
stds = X.std(axis=0).to_numpy()

# Generate noise and scale it. Set the seed for reproducibility.
np.random.seed(42)
noise = np.random.normal(0, 1, X.shape)  # sample numbers from a standard normal distribution
scaled_noise = noise * stds              # scale the noise up to ~1 std of each feature
X_noisy = X + scaled_noise

import matplotlib.pyplot as plt
fig01 = sns.scatterplot(X, x=feature_names[0], y=feature_names[1], hue=dataset['species'])
plt.show()
fig02 = sns.scatterplot(X_noisy, x=feature_names[0], y=feature_names[1], hue=dataset['species'])
plt.show()

# 4) TRAIN/TEST SPLIT
from sklearn.model_selection import train_test_split

# Comment and uncomment the two train_test_split lines below to run with and without noise.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_noisy, y, test_size=0.2, random_state=0, stratify=y)

# 5) HYPERPARAM TUNING
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import matplotlib.pyplot as plt

max_depths = list(range(1, 200))
accuracy = []
for d in max_depths:
    clf = DecisionTreeClassifier(max_depth=d)
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    accuracy.append((d, acc))

acc_df = pd.DataFrame(accuracy, columns=['depth', 'accuracy'])
sns.lineplot(acc_df, x='depth', y='accuracy')
plt.xlabel('Tree depth')
plt.ylabel('Accuracy')
plt.show()

## With clean data (without added noise), even complicated models will extract global trends rather than overfit.

Moving on to Support Vector Machines (SVM)

Standardization -> put features on the same scale.
Oftentimes standardization == z-scoring, which is technically a specific form of standardization where we scale features to have mean 0 and std 1. Normalization is another form of standardization, where we scale features from 0 to 1.
Any time your algorithm uses distance-based calculations, it is important to standardize. Otherwise, features with more variance will dominate the results. Other models where this matters include KNN and neural networks.
Standardization is also valuable for interpretability.
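The code below uses z-scoring (StandardScaler). For the 0-1 "normalization" mentioned above, scikit-learn's MinMaxScaler follows the same fit/transform pattern; a minimal sketch (not part of the lesson):

from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()                       # rescales each feature to the range [0, 1]
X_train_norm = minmax.fit_transform(X_train)  # learn each feature's min/max from the training data only
X_test_norm = minmax.transform(X_test)        # apply the training min/max to the test data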
from sklearn import preprocessing
import pandas as pd

scaler = preprocessing.StandardScaler()  # init z-scoring
scaler.fit(X_train)  # calculate the params needed for the transformation

# Create scaled train/test sets
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

from sklearn import svm

SVM = svm.SVC(kernel='poly', degree=3, C=1.5)
SVM.fit(X_train_scaled, y_train)  # fit on the scaled training data

svm_score = SVM.score(X_test_scaled, y_test)
print('SVM score: ', svm_score)

# Make a decision boundary plot
from sklearn.inspection import DecisionBoundaryDisplay

x2 = X_train_scaled[[feature_names[0], feature_names[1]]]
SVM = svm.SVC(kernel='poly', degree=3, C=1.5)
SVM.fit(x2, y_train)

DecisionBoundaryDisplay.from_estimator(SVM, x2)
sns.scatterplot(x2, x=feature_names[0], y=feature_names[1], hue=dataset['species'])

## Ensembles (https://carpentries-incubator.github.io/machine-learning-novice-sklearn/04-ensemble-methods/index.html)

Stacking, bagging, and boosting.

## New file (Bagging.ipynb)

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# load penguins data
penguins = sns.load_dataset('penguins')

# prepare and define our data and targets
feature_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
penguins.dropna(subset=feature_names, inplace=True)

species_names = penguins['species'].unique()

X = penguins[feature_names]
y = penguins.species

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
print("train size:", X_train.shape)
print("test size:", X_test.shape)

from sklearn.ensemble import RandomForestClassifier  # bagged (bagging) form of decision trees
from sklearn.tree import plot_tree

forest = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=1, random_state=0)
forest.fit(X_train, y_train)
print(forest.score(X_test, y_test))
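plot_tree is imported above but never used in these notes; presumably the intent was to inspect one of the forest's bagged trees. A sketch of how that could look (an assumption, not verbatim from the lesson):

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(12, 8))
plot_tree(forest.estimators_[0],           # the first of the 100 trees in the ensemble
          feature_names=feature_names,
          class_names=list(forest.classes_),  # class labels in the order the forest stores them
          filled=True, ax=fig.gca())
plt.show()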
## Unsupervised learning [Clustering] (https://carpentries-incubator.github.io/machine-learning-novice-sklearn/05-clustering/index.html)

Create a new notebook (Clustering.ipynb)

import sklearn.cluster as skl_cluster
import sklearn.datasets as skl_datasets
import matplotlib.pyplot as plt

# Let's define some functions here to avoid repetitive code
def plots_labels(data, labels):
    tx = data[:, 0]
    ty = data[:, 1]
    fig = plt.figure(1, figsize=(4, 4))
    plt.scatter(tx, ty, edgecolor='k', c=labels)
    plt.show()

def plot_clusters(data, clusters, Kmean):
    tx = data[:, 0]
    ty = data[:, 1]
    fig = plt.figure(1, figsize=(4, 4))
    plt.scatter(tx, ty, s=5, linewidth=0, c=clusters)
    for cluster_x, cluster_y in Kmean.cluster_centers_:
        plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x')
    plt.show()

# Create some data to use. We can modify the dataset by changing these values.
# In particular, we can change cluster_std to create overlapping clusters.
N_true_clusters = 4
data, cluster_id = skl_datasets.make_blobs(n_samples=400, cluster_std=0.75, centers=N_true_clusters, random_state=1)
print(data)

# help(plots_labels)  # gives a bit of information about the function we defined earlier
plots_labels(data, cluster_id)

N_pred_clusters = 4
Kmean = skl_cluster.KMeans(n_clusters=N_pred_clusters)
Kmean.fit(data)
clusters = Kmean.predict(data)

# peek at the predictions
print(clusters)

# help(plot_clusters)
plot_clusters(data, clusters, Kmean)

# Scoring our clustering results
from sklearn.metrics import silhouette_score, silhouette_samples

# calculate the overall silhouette score
overall_silhouette = silhouette_score(data, clusters)
print(overall_silhouette)

Silhouette score: values near -1 mean overlapping clusters, values near 1 mean tight, well-separated clusters.

With this dataset, it seems 2 is the ideal number of clusters. However, we know there are actually 4 clusters. In the real world we might not know the correct number of clusters, so interpretation is very important. The silhouette score is not always the best metric for judging a result; domain knowledge can be much more important here than the silhouette score.

Side note: random_state is often referred to as a "random seed" or just "seed". The term "seed" is related to how computers produce "random" numbers.
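The claim above that 2 clusters scores best presumably came from repeating the silhouette calculation for several values of n_clusters; a sketch of that scan (an assumption about what was run, not verbatim from the lesson):

for k in range(2, 8):
    km = skl_cluster.KMeans(n_clusters=k, random_state=1)
    labels_k = km.fit_predict(data)                  # cluster assignments for this value of k
    print(k, silhouette_score(data, labels_k))       # higher silhouette = better-separated clusters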
## Dimension reduction (https://carpentries-incubator.github.io/machine-learning-novice-sklearn/06-dimensionality-reduction/index.html)

Create a new file (DimReduction.ipynb)

import numpy as np
import matplotlib.pyplot as plt
import sklearn.cluster as skl_cluster
from sklearn import manifold, decomposition, datasets

# Let's define these here to avoid repetitive code
def plots_labels(data, labels):
    tx = data[:, 0]
    ty = data[:, 1]
    fig = plt.figure(1, figsize=(4, 4))
    plt.scatter(tx, ty, edgecolor='k', c=labels)
    plt.show()

def plot_clusters(data, clusters, Kmean):
    tx = data[:, 0]
    ty = data[:, 1]
    fig = plt.figure(1, figsize=(4, 4))
    plt.scatter(tx, ty, s=5, linewidth=0, c=clusters)
    for cluster_x, cluster_y in Kmean.cluster_centers_:
        plt.scatter(cluster_x, cluster_y, s=100, c='r', marker='x')
    plt.show()

def plot_clusters_labels(data, labels):
    tx = data[:, 0]
    ty = data[:, 1]
    # with labels
    fig = plt.figure(1, figsize=(5, 4))
    plt.scatter(tx, ty, c=labels, cmap="nipy_spectral", edgecolor='k', label=labels)
    plt.colorbar(boundaries=np.arange(11) - 0.5).set_ticks(np.arange(10))
    plt.show()

from sklearn import datasets

features, labels = datasets.load_digits(return_X_y=True, as_frame=True)
print(features.shape, labels.shape)
print(labels[0:10])
features.head()

# Reshape the flattened data back to an 8x8 pixel image and plot it.
# Verification that the flattened data is indeed an image.
print(features.iloc[0])
image_1D = features.iloc[0]
image_2D = np.array(image_1D).reshape(-1, 8)
plt.imshow(image_2D, cmap="gray_r")

## Principal Component Analysis (PCA) is our go-to dimensionality reduction technique
# The basic idea is to remove features that do not represent distinct information (remove data that is highly correlated or redundant).
# When we transform the data, we "linearly combine" the original features to produce new ones.

# check the feature shape
features.shape

# Look at the first 2 PCA features
from sklearn import decomposition

pca = decomposition.PCA(n_components=2)  # keep the first 2 of the newly transformed features
x_pca = pca.fit_transform(features)

# New shape is 2 features
x_pca.shape

plots_labels(x_pca, None)

# Perform clustering with 10 clusters (since we have 10 digits, 0-9)
import sklearn.cluster as skl_cluster

Kmean = skl_cluster.KMeans(n_clusters=10)  # 10 digits
Kmean.fit(x_pca)
clusters = Kmean.predict(x_pca)
plot_clusters(x_pca, clusters, Kmean)

# See the true clusters
plot_clusters_labels(x_pca, labels)

## t-distributed Stochastic Neighbor Embedding (t-SNE)

from sklearn import manifold

tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
x_tsne = tsne.fit_transform(features)
plots_labels(x_tsne, None)

# Plot with predicted cluster colors
Kmean = skl_cluster.KMeans(n_clusters=10)
Kmean.fit(x_tsne)
clusters = Kmean.predict(x_tsne)
plot_clusters(x_tsne, clusters, Kmean)

# Plot with the expected (true) labels
plot_clusters_labels(x_tsne, labels)

## Neural Networks (https://carpentries-incubator.github.io/machine-learning-novice-sklearn/07-neural-networks/index.html)

Create a new notebook (NeuralNetwork.ipynb)

import sklearn.datasets as skl_data

data, labels = skl_data.fetch_openml('mnist_784', version=1, return_X_y=True)
data = data / 255.0

import sklearn.neural_network as skl_nn

mlp = skl_nn.MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, verbose=1, random_state=1)

from sklearn.model_selection import train_test_split

# `data` is our feature matrix and `labels` is our target vector
X_train, X_test, y_train, y_test = train_test_split(
    data.values,      # features
    labels.values,    # labels
    test_size=0.1,    # reserve 10% of the data for testing
    random_state=42   # for reproducibility
)

X_train.shape

mlp.fit(X_train, y_train)
print("Training set score", mlp.score(X_train, y_train))
print("Testing set score", mlp.score(X_test, y_test))

# Review a confusion matrix of the predictions (a confusion matrix is very useful for almost any classification problem)
y_test_pred = mlp.predict(X_test)
y_test_pred

import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(y_test, y_test_pred)
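A common companion to the confusion matrix (a hedged addition, not in the lesson): per-class precision, recall, and F1 via classification_report, using the same test predictions.

from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred))  # one row of precision/recall/F1 per digit class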