diff --git a/TitanicSurvivalPrediction_NAIVEBAYES.ipynb b/TitanicSurvivalPrediction_NAIVEBAYES.ipynb new file mode 100644 index 0000000..195853a --- /dev/null +++ b/TitanicSurvivalPrediction_NAIVEBAYES.ipynb @@ -0,0 +1,1835 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qmi36D7ZPY-M" + }, + "source": [ + "# **Day - 6 Titanic Survival Prediction using NAIVE BAYES**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Q8lgHC2zPTE4" + }, + "source": [ + "### *Importing basic Libraries*" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nKKbpfywIqAq" + }, + "source": [ + "import pandas as pd\n", + "import numpy as np" + ], + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xfyZYdDaPnJz" + }, + "source": [ + "### *Choose Dataset file from Local Directory*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 73 + }, + "id": "ki0LIHaOP869", + "outputId": "b9ecc9aa-e308-4031-a096-fd77b3f36d4c" + }, + "source": [ + "from google.colab import files\n", + "uploaded = files.upload()" + ], + "execution_count": 2, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. 
Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving titanicsurvival.csv to titanicsurvival.csv\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oEx3VSimP_DF" + }, + "source": [ + "### *Load Dataset*" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HQVO5TRBQCGP" + }, + "source": [ + "dataset = pd.read_csv('titanicsurvival.csv')" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Da6ym5z7QHwY" + }, + "source": [ + "### *Summarize Dataset*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Esd6w-GBQLZ5", + "outputId": "44a2090a-3b0f-4ee2-c37b-328f185d2bba" + }, + "source": [ + "print(dataset.shape)\n", + "print(dataset.head(5))" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(891, 5)\n", + " Pclass Sex Age Fare Survived\n", + "0 3 male 22.0 7.2500 0\n", + "1 1 female 38.0 71.2833 1\n", + "2 3 female 26.0 7.9250 1\n", + "3 1 female 35.0 53.1000 1\n", + "4 3 male 35.0 8.0500 0\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1AALh-8cS6Jd" + }, + "source": [ + "### *Mapping Text Data to Binary Value*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rcr5RdqtS9iD", + "outputId": "24ecd79c-ef7c-4301-be39-ecad5bb7361e" + }, + "source": [ + "sex_codes = {'female': 0, 'male': 1} # text label -> binary code used by the model\n", + "dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)\n", + "print(dataset.head()) # call head(); a bare dataset.head prints the bound method, not the rows" + ], + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_j0iPDCWRYAg" + }, + "source": [ + "### *Segregate Dataset into 
X(Input/IndependentVariable) & Y(Output/DependentVariable)*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "Cqyxx7qQRYp7", + "outputId": "f7fa1388-e792-4dab-ef8b-6110b7b7d6f5" + }, + "source": [ + "X = dataset.drop('Survived',axis='columns')\n", + "X" + ], + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Pclass Sex Age Fare\n", + "0 3 1 22.0 7.2500\n", + "1 1 0 38.0 71.2833\n", + "2 3 0 26.0 7.9250\n", + "3 1 0 35.0 53.1000\n", + "4 3 1 35.0 8.0500\n", + ".. ... ... ... ...\n", + "886 2 1 27.0 13.0000\n", + "887 1 0 19.0 30.0000\n", + "888 3 0 NaN 23.4500\n", + "889 1 1 26.0 30.0000\n", + "890 3 1 32.0 7.7500\n", + "\n", + "[891 rows x 4 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PclassSexAgeFare
03122.07.2500
11038.071.2833
23026.07.9250
31035.053.1000
43135.08.0500
...............
8862127.013.0000
8871019.030.0000
88830NaN23.4500
8891126.030.0000
8903132.07.7500
\n", + "

891 rows × 4 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "X", + "summary": "{\n \"name\": \"X\",\n \"rows\": 891,\n \"fields\": [\n {\n \"column\": \"Pclass\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 3,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sex\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.526497332334044,\n \"min\": 0.42,\n \"max\": 80.0,\n \"num_unique_values\": 88,\n \"samples\": [\n 0.75,\n 22.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 49.693428597180905,\n \"min\": 0.0,\n \"max\": 512.3292,\n \"num_unique_values\": 248,\n \"samples\": [\n 11.2417,\n 51.8625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 458 + }, + "id": "1F1tC2tRRddY", + "outputId": "dfa4f8ca-945a-465b-ab63-637411837fb8" + }, + "source": [ + "Y = dataset.Survived\n", + "Y" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 0\n", + "1 1\n", + "2 1\n", + "3 1\n", + "4 0\n", + " ..\n", + "886 0\n", + "887 1\n", + "888 0\n", + "889 1\n", + "890 0\n", + "Name: Survived, Length: 891, dtype: int64" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Survived
00
11
21
31
40
......
8860
8871
8880
8891
8900
\n", + "

891 rows × 1 columns

\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SibVwENGTpsN" + }, + "source": [ + "### *Finding & Filling NA values in our Features X*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "soVDtqhRTwHZ", + "outputId": "e18b9852-5d91-40cb-c46d-94f3402f13a2" + }, + "source": [ + "X.columns[X.isna().any()]" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['Age'], dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0_jCaFTRXQj1" + }, + "source": [ + "X.Age = X.Age.fillna(X.Age.mean())" + ], + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nYNPgh4cX0bt" + }, + "source": [ + "### *Check again for any remaining NA values*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QSBSGrNfX3NA", + "outputId": "5a65aec5-9704-4c66-8c2c-9012d961c99b" + }, + "source": [ + "X.columns[X.isna().any()]" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index([], dtype='object')" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R4ngba4SYEue" + }, + "source": [ + "### *Splitting Dataset into Train & Test*" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vy9RTlZ4YFyO" + }, + "source": [ + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25,random_state =0)" + ], + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ocZLLSzgYl9V" + }, + "source": [ + "### *Training*" + ] + }, + { + "cell_type": "code", + "metadata": { 
+ "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "tPSuaammYz_4", + "outputId": "a3a5cc02-a37f-41be-d0d6-514872a484e1" + }, + "source": [ + "from sklearn.naive_bayes import GaussianNB\n", + "model = GaussianNB()\n", + "model.fit(X_train, y_train)" + ], + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "GaussianNB()" + ], + "text/html": [ + "
GaussianNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v63bNnciZZYS" + }, + "source": [ + "### *Predicting whether a Person Survived or Not*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s17AtjCOZeEZ", + "outputId": "fb8f0722-4755-463a-b2c7-3761e564e2b3" + }, + "source": [ + "pclassNo = int(input(\"Enter Person's Pclass number: \"))\n", + "gender = int(input(\"Enter Person's Gender 0-female 1-male(0 or 1): \"))\n", + "age = float(input(\"Enter Person's Age: \")) # ages in the dataset are fractional (e.g. 0.42), so int() is too strict\n", + "fare = float(input(\"Enter Person's Fare: \"))\n", + "person = pd.DataFrame([[pclassNo, gender, age, fare]], columns=X_train.columns) # same feature names the model was fitted with\n", + "result = model.predict(person)\n", + "print(result)\n", + "\n", + "if result[0] == 1: # predict() returns an array; test its single element explicitly\n", + " print(\"Person might be Survived\")\n", + "else:\n", + " print(\"Person might not be Survived\")" + ], + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Enter Person's Pclass number: 20\n", + "Enter Person's Gender 0-female 1-male(0 or 1): 0\n", + "Enter Person's Age: 43\n", + "Enter Person's Fare: 4567\n", + "[1]\n", + "Person might be Survived\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but GaussianNB was fitted with feature names\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1PdvxG-La4H3" + }, + "source": [ + "### *Prediction for all Test Data*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fShPpJ75a6u0", + "outputId": "8d96d8c6-14fc-4c3d-fe83-35f22246d155" + }, + "source": [ + "y_pred = model.predict(X_test)\n", + "print(np.column_stack((y_pred,y_test)))" + ], + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": 
"stdout", + "text": [ + "[[0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]\n", + " [0 1]\n", + " [1 1]\n", + " [1 1]\n", + " [1 1]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 1]\n", + " [0 0]\n", + " [0 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 0]\n", + " [0 1]\n", + " [0 1]\n", + " [1 1]\n", + " [0 0]\n", + " [0 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 0]\n", + " [0 0]\n", + " [0 1]\n", + " [0 0]\n", + " [1 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]\n", + " [1 1]\n", + " [0 1]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 1]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]\n", + " [1 1]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 1]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 0]\n", + " [1 1]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 1]\n", + " [1 0]\n", + " [1 1]\n", + " [1 1]\n", + " [1 1]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " 
[1 1]\n", + " [0 0]\n", + " [1 0]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]\n", + " [0 0]\n", + " [1 0]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 0]\n", + " [0 1]\n", + " [1 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 1]\n", + " [1 1]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [1 0]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 1]\n", + " [0 0]\n", + " [0 1]\n", + " [1 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 1]\n", + " [0 0]\n", + " [1 0]\n", + " [0 0]\n", + " [1 1]\n", + " [0 0]\n", + " [0 0]\n", + " [0 1]\n", + " [0 0]\n", + " [1 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 0]\n", + " [0 1]\n", + " [1 0]\n", + " [1 1]\n", + " [0 0]\n", + " [1 1]\n", + " [1 1]]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lFeW_-qYdszc" + }, + "source": [ + "### *Accuracy of our Model*" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HDOFRQ0PdzQS", + "outputId": "9e0c4bba-ac94-4065-dac9-2e69d47ddebd" + }, + "source": [ + "from sklearn.metrics import accuracy_score\n", + "print(\"Accuracy of the Model: {0}%\".format(accuracy_score(y_test, y_pred)*100))" + ], + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy of the Model: 77.57847533632287%\n" + ] + } + ] + } + ] +} \ No newline at end of file