# Day-10: Breast cancer detection with various ML algorithms.
#
# This section imports the core libraries, makes sure the dataset file is
# available locally, loads it into a DataFrame and prints a short summary.

# --- Importing libraries ---
import os

import pandas as pd  # useful for loading the dataset
import numpy as np   # array operations
from matplotlib import pyplot

# Single place to change the dataset location.
DATA_PATH = 'data.csv'

# --- Choose dataset from local directory ---
# The Colab upload widget is interactive and only exists inside Google Colab.
# Open it only when the file is not already on disk, so that a fresh
# "Restart & Run All" does not block on a manual upload and the notebook can
# also run outside Colab once the CSV has been placed next to it.
uploaded = {}
if not os.path.exists(DATA_PATH):
    from google.colab import files
    uploaded = files.upload()

# --- Load dataset ---
dataset = pd.read_csv(DATA_PATH)

# --- Summarize dataset ---
# Shape first, then the first five rows as a quick sanity check of the columns.
print(dataset.shape)
print(dataset.head(5))
# --- Mapping class string values to numbers ---
# 'B' is encoded as 0 and 'M' as 1.
DIAGNOSIS_CODES = {'B': 0, 'M': 1}

# Only encode while the column still holds the raw string labels. Mapping an
# already-encoded (numeric) column would turn every value into NaN and the
# integer cast would then fail, so re-running this cell must be a no-op.
if not pd.api.types.is_numeric_dtype(dataset['diagnosis']):
    encoded = dataset['diagnosis'].map(DIAGNOSIS_CODES)
    if encoded.isna().any():
        # Fail with the offending labels instead of an opaque cast error.
        unknown = sorted(
            dataset.loc[encoded.isna(), 'diagnosis'].astype(str).unique()
        )
        raise ValueError('Unexpected diagnosis labels: %s' % unknown)
    dataset['diagnosis'] = encoded.astype(int)

# head must be *called*; printing the bare attribute only shows the bound
# method's repr rather than the first rows.
print(dataset.head())

# --- Segregate dataset into X (input / independent variables) ---
# Columns 2..31 are the 30 numeric measurements; column 0 is the id,
# column 1 the diagnosis and the last column ('Unnamed: 32') is excluded.
X = dataset.iloc[:, 2:32].values
X
# --- Segregate dataset into Y (output / dependent variable) ---
# Column 1 holds the diagnosis label.
Y = dataset.iloc[:, 1].to_numpy()
Y

# --- Splitting dataset into train & test ---
# 25% of the rows are held out for testing; the fixed seed keeps the split
# repeatable between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.25,
)

# --- Feature scaling ---
# Scaling lets all features contribute on a comparable scale. The scaler
# learns its per-feature statistics from the training rows only and the same
# fitted scaler is then applied to both partitions, so nothing is learned
# from the test rows.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
# --- Validating some ML algorithms by accuracy (model score) ---
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Candidate classifiers as (short name, unfitted estimator) pairs.
# - LogisticRegression: `multi_class` is left at its default; passing 'ovr'
#   is deprecated and emits a FutureWarning on every fit.
# - DecisionTreeClassifier: seeded so its score is repeatable between runs.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=0)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# One splitter shared by all candidates: 10 stratified folds, no shuffling,
# so every model is scored on exactly the same partitions.
kfold = StratifiedKFold(n_splits=10)

results = []  # per-fold accuracy arrays, one entry per model
names = []    # model short names, in evaluation order
res = []      # mean cross-validated accuracy per model
# The loop variable is intentionally not called `model`, so the final
# estimator below can never be an accidental leftover of this loop.
for name, candidate in models:
    cv_results = cross_val_score(
        candidate, X_train, y_train, cv=kfold, scoring='accuracy'
    )
    results.append(cv_results)
    names.append(name)
    res.append(cv_results.mean())
    print('%s: %f' % (name, cv_results.mean()))

# Bar chart of the mean accuracies; the y-axis is zoomed to 0.900-0.999 to
# make the small differences between models visible.
pyplot.ylim(.900, .999)
pyplot.bar(names, res, color='maroon', width=0.6)
pyplot.ylabel('Mean 10-fold CV accuracy')
pyplot.title('Algorithm Comparison')
pyplot.show()

# --- Training & prediction using the algorithm with high accuracy ---
# Build the final estimator explicitly (an SVM with the same settings as the
# 'SVM' candidate above) rather than relying on whatever object the
# comparison loop happened to end on.
model = SVC(gamma='auto')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Side-by-side view: column 0 = predicted label, column 1 = true label.
print(np.column_stack((y_pred, y_test)))
# Fraction of test rows where prediction and truth agree.
print('Test accuracy: %f' % np.mean(y_pred == y_test))
[0 0]\n", + " [0 1]\n", + " [1 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 1]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 1]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]]\n" + ] + } + ] + } + ] +} \ No newline at end of file