{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "Qmi36D7ZPY-M" }, "source": [ "# **Day - 6 Titanic Survival Prediction using NAIVE BAYES**" ] }, { "cell_type": "markdown", "metadata": { "id": "Q8lgHC2zPTE4" }, "source": [ "### *Importing basic Libraries*" ] }, { "cell_type": "code", "metadata": { "id": "nKKbpfywIqAq" }, "source": [ "import pandas as pd\n", "import numpy as np" ], "execution_count": 1, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "xfyZYdDaPnJz" }, "source": [ "### *Choose Dataset file from Local Directory*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 73 }, "id": "ki0LIHaOP869", "outputId": "b9ecc9aa-e308-4031-a096-fd77b3f36d4c" }, "source": [ "from google.colab import files\n", "uploaded = files.upload()" ], "execution_count": 2, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "\n", " \n", " \n", " Upload widget is only available when the cell has been executed in the\n", " current browser session. 
Please rerun this cell to enable.\n", " \n", " " ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saving titanicsurvival.csv to titanicsurvival.csv\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "oEx3VSimP_DF" }, "source": [ "### *Load Dataset*" ] }, { "cell_type": "code", "metadata": { "id": "HQVO5TRBQCGP" }, "source": [ "dataset = pd.read_csv('titanicsurvival.csv')" ], "execution_count": 3, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Da6ym5z7QHwY" }, "source": [ "### *Summarize Dataset*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Esd6w-GBQLZ5", "outputId": "44a2090a-3b0f-4ee2-c37b-328f185d2bba" }, "source": [ "print(dataset.shape)\n", "print(dataset.head(5))" ], "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(891, 5)\n", " Pclass Sex Age Fare Survived\n", "0 3 male 22.0 7.2500 0\n", "1 1 female 38.0 71.2833 1\n", "2 3 female 26.0 7.9250 1\n", "3 1 female 35.0 53.1000 1\n", "4 3 male 35.0 8.0500 0\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "1AALh-8cS6Jd" }, "source": [ "### *Mapping Text Data to Binary Value*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rcr5RdqtS9iD", "outputId": "24ecd79c-ef7c-4301-be39-ecad5bb7361e" }, "source": [ "# Encode the text column 'Sex' as integers: female -> 0, male -> 1.\n", "dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)\n", "# head must be called; printing the bare attribute shows the bound method instead of rows.\n", "print(dataset.head())" ], "execution_count": 5, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "_j0iPDCWRYAg" }, "source": [ "### *Segregate Dataset into X(Input/IndependentVariable) & Y(Output/DependentVariable)*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "Cqyxx7qQRYp7", "outputId": "f7fa1388-e792-4dab-ef8b-6110b7b7d6f5" }, "source": [ 
"X = dataset.drop('Survived',axis='columns')\n", "X" ], "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Pclass Sex Age Fare\n", "0 3 1 22.0 7.2500\n", "1 1 0 38.0 71.2833\n", "2 3 0 26.0 7.9250\n", "3 1 0 35.0 53.1000\n", "4 3 1 35.0 8.0500\n", ".. ... ... ... ...\n", "886 2 1 27.0 13.0000\n", "887 1 0 19.0 30.0000\n", "888 3 0 NaN 23.4500\n", "889 1 1 26.0 30.0000\n", "890 3 1 32.0 7.7500\n", "\n", "[891 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeFare
03122.07.2500
11038.071.2833
23026.07.9250
31035.053.1000
43135.08.0500
...............
8862127.013.0000
8871019.030.0000
88830NaN23.4500
8891126.030.0000
8903132.07.7500
\n", "

891 rows × 4 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "X", "summary": "{\n \"name\": \"X\",\n \"rows\": 891,\n \"fields\": [\n {\n \"column\": \"Pclass\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 3,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 1,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sex\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.526497332334044,\n \"min\": 0.42,\n \"max\": 80.0,\n \"num_unique_values\": 88,\n \"samples\": [\n 0.75,\n 22.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 49.693428597180905,\n \"min\": 0.0,\n \"max\": 512.3292,\n \"num_unique_values\": 248,\n \"samples\": [\n 11.2417,\n 51.8625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "1F1tC2tRRddY", "outputId": "dfa4f8ca-945a-465b-ab63-637411837fb8" }, "source": [ "Y = dataset.Survived\n", "Y" ], "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 0\n", "1 1\n", "2 1\n", "3 1\n", "4 0\n", " ..\n", "886 0\n", "887 1\n", "888 0\n", "889 1\n", "890 0\n", "Name: Survived, Length: 891, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Survived
00
11
21
31
40
......
8860
8871
8880
8891
8900
\n", "

891 rows × 1 columns

\n", "

" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "markdown", "metadata": { "id": "SibVwENGTpsN" }, "source": [ "### *Finding & Filling NA values in our Features X*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "soVDtqhRTwHZ", "outputId": "e18b9852-5d91-40cb-c46d-94f3402f13a2" }, "source": [ "X.columns[X.isna().any()]" ], "execution_count": 8, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index(['Age'], dtype='object')" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "code", "metadata": { "id": "0_jCaFTRXQj1" }, "source": [ "# Replace missing ages with the mean of the Age column.\n", "# NOTE(review): the mean is taken over all rows, before the train/test split below.\n", "X['Age'] = X['Age'].fillna(X['Age'].mean())" ], "execution_count": 9, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "nYNPgh4cX0bt" }, "source": [ "### *Check again for any remaining NA values*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QSBSGrNfX3NA", "outputId": "5a65aec5-9704-4c66-8c2c-9012d961c99b" }, "source": [ "X.columns[X.isna().any()]" ], "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Index([], dtype='object')" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "markdown", "metadata": { "id": "R4ngba4SYEue" }, "source": [ "### *Splitting Dataset into Train & Test*" ] }, { "cell_type": "code", "metadata": { "id": "vy9RTlZ4YFyO" }, "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25,random_state =0)" ], "execution_count": 11, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ocZLLSzgYl9V" }, "source": [ "### *Training*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "tPSuaammYz_4", "outputId": "a3a5cc02-a37f-41be-d0d6-514872a484e1" }, "source": [ "from sklearn.naive_bayes import GaussianNB\n", "model = GaussianNB()\n", 
"model.fit(X_train, y_train)" ], "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "GaussianNB()" ], "text/html": [ "
GaussianNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "markdown", "metadata": { "id": "v63bNnciZZYS" }, "source": [ "### *Predicting whether a Person Survived or Not*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "s17AtjCOZeEZ", "outputId": "fb8f0722-4755-463a-b2c7-3761e564e2b3" }, "source": [ "pclassNo = int(input(\"Enter Person's Pclass number: \"))\n", "gender = int(input(\"Enter Person's Gender 0-female 1-male(0 or 1): \"))\n", "# Ages in the dataset are fractional (e.g. 0.42), so accept a float here.\n", "age = float(input(\"Enter Person's Age: \"))\n", "fare = float(input(\"Enter Person's Fare: \"))\n", "# Wrap the inputs in a one-row DataFrame carrying the training column names;\n", "# the model was fitted on a DataFrame, and a bare list triggers a feature-name warning.\n", "person = pd.DataFrame([[pclassNo, gender, age, fare]], columns=X_train.columns)\n", "result = model.predict(person)\n", "print(result)\n", "\n", "# predict() returns one label per input row; inspect the single element.\n", "if result[0] == 1:\n", "    print(\"Person might be Survived\")\n", "else:\n", "    print(\"Person might not be Survived\")" ], "execution_count": 13, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Enter Person's Pclass number: 20\n", "Enter Person's Gender 0-female 1-male(0 or 1): 0\n", "Enter Person's Age: 43\n", "Enter Person's Fare: 4567\n", "[1]\n", "Person might be Survived\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "1PdvxG-La4H3" }, "source": [ "### *Prediction for all Test Data*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fShPpJ75a6u0", "outputId": "8d96d8c6-14fc-4c3d-fe83-35f22246d155" }, "source": [ "y_pred = model.predict(X_test)\n", "print(np.column_stack((y_pred,y_test)))" ], "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[[0 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [1 1]\n", " [0 1]\n", " [1 1]\n", " [1 1]\n", " [1 1]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [1 
1]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 1]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [1 0]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 1]\n", " [0 0]\n", " [0 1]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [1 0]\n", " [0 1]\n", " [0 1]\n", " [1 1]\n", " [0 0]\n", " [0 1]\n", " [0 0]\n", " [0 0]\n", " [1 0]\n", " [0 0]\n", " [0 1]\n", " [0 0]\n", " [1 0]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [1 1]\n", " [1 1]\n", " [0 1]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 1]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [1 1]\n", " [1 1]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [0 1]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [1 0]\n", " [1 1]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [0 1]\n", " [1 0]\n", " [1 1]\n", " [1 1]\n", " [1 1]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [0 1]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 1]\n", " [0 0]\n", " [0 0]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [1 0]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [1 1]\n", " [1 1]\n", " [0 0]\n", " [1 0]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [1 0]\n", " [0 1]\n", " [1 0]\n", " [1 1]\n", " [0 0]\n", " [0 1]\n", " [1 1]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [1 0]\n", 
" [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [1 0]\n", " [0 0]\n", " [1 1]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [1 1]\n", " [0 1]\n", " [0 0]\n", " [0 1]\n", " [1 0]\n", " [0 0]\n", " [1 1]\n", " [0 1]\n", " [0 0]\n", " [1 0]\n", " [0 0]\n", " [1 1]\n", " [0 0]\n", " [0 0]\n", " [0 1]\n", " [0 0]\n", " [1 0]\n", " [0 0]\n", " [0 0]\n", " [0 0]\n", " [0 1]\n", " [1 0]\n", " [1 1]\n", " [0 0]\n", " [1 1]\n", " [1 1]]\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "lFeW_-qYdszc" }, "source": [ "### *Accuracy of our Model*" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HDOFRQ0PdzQS", "outputId": "9e0c4bba-ac94-4065-dac9-2e69d47ddebd" }, "source": [ "from sklearn.metrics import accuracy_score\n", "# Share of test rows whose predicted label matches the true label, as a percentage.\n", "accuracy_pct = accuracy_score(y_test, y_pred) * 100\n", "print(f\"Accuracy of the Model: {accuracy_pct}%\")" ], "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Accuracy of the Model: 77.57847533632287%\n" ] } ] } ] }