matrix-spam-ml/dataset_analysis.ipynb

202 lines
39 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook is mainly to debug the dataset and to see how the data is distributed. It is also used to generate the dataset statistics."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
2022-10-10 17:58:50 +00:00
"import numpy as np # numerical computing\n",
"import pandas as pd # data analysis, working with DataFrames\n",
"import seaborn as sns\n",
"from nltk.corpus import stopwords\n",
"\n",
"def remove_stopwords(input_text):\n",
" \"\"\"\n",
" Function to remove English stopwords from a Pandas Series.\n",
"\n",
" Parameters:\n",
" input_text : text to clean\n",
" Output:\n",
" cleaned Pandas Series\n",
" \"\"\"\n",
" stopwords_list = stopwords.words(\"english\")\n",
" # Some words which might indicate a certain sentiment are kept via a whitelist\n",
" whitelist = [\"n't\", \"not\", \"no\"]\n",
" words = input_text.split()\n",
" clean_words = [\n",
" word\n",
" for word in words\n",
" if (word not in stopwords_list or word in whitelist) and len(word) > 1\n",
" ]\n",
" return \" \".join(clean_words)\n",
"\n",
"# Code for text lowercasing\n",
"def lower_casing_text(text):\n",
"\n",
" \"\"\"\n",
" The function will convert text into lower case.\n",
"\n",
" arguments:\n",
" input_text: \"text\" of type \"String\".\n",
"\n",
" return:\n",
" value: text in lowercase\n",
"\n",
" Example:\n",
" Input : The World is Full of Surprises!\n",
" Output : the world is full of surprises!\n",
"\n",
" \"\"\"\n",
" # Convert text to lower case\n",
" # lower() - It converts all upperase letter of given string to lowercase.\n",
" text = text.lower()\n",
" return text\n",
"\n",
2022-10-10 17:58:50 +00:00
"df = pd.read_csv(\"./input/MatrixData.tsv\", sep='\\t', quoting=csv.QUOTE_NONE, encoding='utf-8')\n",
"df = df.query('(message.str.split().str.len() >= 14 & label == \"ham\") | label == \"spam\"').assign(\n",
" message=df[\"message\"].astype(str),\n",
" label=df[\"label\"].astype(str),\n",
")\n",
"df.drop_duplicates(inplace=True)\n",
"df.reset_index(drop=True, inplace=True)\n",
"df[\"message\"] = df[\"message\"].apply(remove_stopwords)\n",
"df[\"message\"] = df[\"message\"].apply(lower_casing_text)\n",
"df.drop_duplicates(inplace=True)\n",
"df.reset_index(drop=True, inplace=True)\n",
"data = df.sample(frac=1).reset_index(drop=True)\n",
"df.to_csv(\"./input/MatrixData_cleaned.csv\", encoding='utf-8', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: xlabel='label', ylabel='count'>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAGwCAYAAABIC3rIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAAArHklEQVR4nO3df1TVdYL/8ddFvRdQQEHhwnTFHxWlgRo5xinN0gHRsamcmtRSR1bNwdykVZZdQ7TZMC0zXdfGNrNmKfsxZZO1HpFSy9CSlkgtUgeH5uRFV8UbOPFD7veP/frZuSv2g4B74f18nHPP4fP+vO/n8/7MOeRzPvcD2Lxer1cAAAAGC/L3AgAAAPyNIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8br6ewEdQVNTk7766iuFhYXJZrP5ezkAAOB78Hq9+vrrrxUXF6egoG+/B0QQfQ9fffWVXC6Xv5cBAABa4Msvv9Rll132rXMIou8hLCxM0v/8DxoeHu7n1QAAgO/D4/HI5XJZ/45/G4Loe7jwMVl4eDhBBABAB/N9HnfhoWoAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMbr6u8F4H8lL3ze30sAAlLJymn+XgKATs6vd4h2796tiRMnKi4uTjabTVu2bPHZb7PZmn2tXLnSmtOvX7+L9i9fvtznOGVlZRo5cqSCg4Plcrm0YsWK9rg8AADQQfg1iGprazVkyBCtW7eu2f3Hjx/3eW3cuFE2m02TJk3ymbds2TKfeffff7+1z+PxKDU1VfHx8SopKdHKlSuVl5enDRs2tOm1AQCAjsOvH5mlp6crPT39kvudTqfP9htvvKGbb75ZAwYM8BkPCwu7aO4FBQUFqq+v18aNG2W32zV48GCVlpZq1apVmj17drPvqaurU11dnbXt8Xi+7yUBAIAOqMM8VF1VVaW33npLGRkZF+1bvny5oqKiNGzYMK1cuVKNjY3WvuLiYo0aNUp2u90aS0tLU3l5uc6cOdPsufLz8xUREWG9XC5X618QAAAIGB0miJ577jmFhYXpjjvu8BmfP3++Nm/erHfffVdz5szRI488okWLFln73W63YmJifN5zYdvtdjd7rpycHJ09e9Z6ffnll618NQAAIJB0mJ8y27hxo6ZOnarg4GCf8aysLOvrpKQk2e12zZkzR/n5+XI4HC06l8PhaPF7AQBAx9Mh7hC99957Ki8v19/93d9959wRI0aosbFRx44dk/Q/zyFVVVX5zLmwfannjgAAgFk6RBA988wzSk5O1pAhQ75zbmlpqYKCghQdHS1JSklJ0e7du9XQ0GDNKSwsVEJCgnr16tVmawYAAB2HX4OopqZGpaWlKi0tlSRVVFSotLRUlZWV1hyPx6NXXnml2btDxcXFWr16tT755BP96U9/UkFBgRYsWKB77rnHip0pU6bIbrcrIyNDBw8e1EsvvaQnn3zS56M2AABgNr8+Q7R//37dfPPN1vaFSJk+fbo2bdokSdq8ebO8Xq8mT5580fsdDoc2b96svLw81dXVqX///lqwYIFP7ERERGj79u3KzMxUcnKyevfurdzc3Ev+yD0AADCPzev1ev29iEDn8XgUERGhs2fPKjw8vM3Ow5/uAJrHn+4A0BI/5N/vDvEMEQAAQFsiiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPH8GkS7d+/WxIkTFRcXJ5vNpi1btvjsnzFjhmw2m89r3LhxPnNOnz6tqVOnKjw8XD179lRGRoZqamp85pSVlWnkyJEKDg6Wy+XSihUr2vrSAABAB+LXIKqtrdWQIUO0bt26S84ZN26cjh8/br1efPFFn/1Tp07VwYMHVVhYqK1bt2r37t2aPXu2td/j8Sg1NVXx8fEqKSnRypUrlZeXpw0bNrTZdQEAgI6lqz9Pnp6ervT09G+d43A45HQ6m9332Wefadu2bfroo4903XXXSZLWrl2r8ePH67HHHlNcXJwKCgpUX1+vjRs3ym63a/DgwSotLdWqVat8wgkAAJgr4J8h2rlzp6Kjo5WQkKC5c+fq1KlT1r7i4mL17NnTiiFJGjt2rIKCgrRv3z5rzqhRo2S32605aWlpKi8v15kzZ5o9Z11dnTwej88LAAB0XgEdROPGjdPzzz+voqIiPfroo9q1a5fS09N1/vx5SZLb7VZ0dLTPe7p27arIyEi53W5rTkxMjM+cC9sX5vxf+fn5ioiIsF4ul6u1Lw0AAAQQv35k9l3uvvtu6+vExEQlJSVp4MCB2rlzp8aMGdNm583JyVFWVpa17fF4iCIAADqxgL5D9H8NGDBAvXv31pEjRyRJTqdTJ06c8JnT2Nio06dPW88dOZ1OVVVV+cy5sH2pZ5McDofCw8N9XgAAoPPqUEH0l7/8RadOnVJsbKwkKSUlRdXV1SopKbHmvPPOO2pqatKIESOsObt371ZDQ4M1p7CwUAkJCerVq1f7XgAAAAhIfg2impoalZaWqrS0VJJUUVGh0tJSVVZWqqamRgsXLtTevXt17NgxFRUV6Re/+IUuv/xypaWlSZKuvvpqjRs3TrNmzdKHH36oPXv2aN68ebr77rsVFxcnSZoyZYrsdrsyMjJ08OBBvfTSS3ryySd9PhIDAABm82sQ7d+/X8OGDdOwYcMkSVlZWRo2bJhyc3PVpUsXlZWV6dZbb9WVV16pjIwMJScn67333pPD4bCOUVBQoKuuukpjxozR+PHjdeONN/r8jqGIiAht375dFRUVSk5O1oMPPqjc3Fx+5B4AAFhsXq/X6+9FBDqPx6OIiAidPXu2TZ8nSl74fJsdG+jISlZO8/cSAHRAP+Tf7w71DBEAAEBbIIgAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8gggAABiPIAIAAMYjiAAAgPEIIgAAYDyCCAAAGI8gAgAAxiOIAACA8QgiAABgPIIIAAAYjyACAADGI4gAAIDxCCIAAGA8vwbR7t27NXHiRMXFxclms2nLli3WvoaGBmVnZysxMVHdu3dXXFycpk2bpq+++srnGP369ZPNZvN5LV++3GdOWVmZRo4cqeDgYLlcLq1YsaI9Lg8AAHQQfg2i2tpaDRkyROvWrbto37lz5/Txxx/roYce0scff6zXXntN5eXluvXWWy+au2zZMh0/ftx63X///dY+j8ej1NRUxcfHq6SkRCtXrlReXp42bNjQptcGAAA6jq7+PHl6errS09Ob3RcREaHCwkKfsX/913/VT3/6U1VWVqpv377WeFhYmJxOZ7PHKSgoUH19vTZu3Ci73a7BgwertLRUq1at0uzZs1vvYgAAQIfVoZ4hOnv2rGw2m3r27Okzvnz5ckVFRWnYsGFauXKlGhsbrX3FxcUaNWqU7Ha7NZaWlqby8nKdOXOm2fPU1dXJ4/H4vAAAQOfl1ztEP8Q333yj7OxsTZ48WeHh4db4/Pnzde211yoyMlIffPCBcnJydPz4ca1atUqS5Ha71b9/f59jxcTEWPt69ep10bny8/O1dOnSNrwaAAAQSDpEEDU0NOiuu+6S1+vV+vXrffZlZWVZXyclJclut2vOnDnKz8+
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.countplot(x=df['label']) # countplot for label"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: ylabel='count'>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkMAAAGdCAYAAAAR5XdZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8o6BhiAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA4i0lEQVR4nO3deXxU5aH/8e8syWQPZk9ICKugIqiIGBcuIIWERZQornXjakXESqzYtKLF1otdtQsGN8QNNwJSrYKVSqwKWqlc9NrLVbQVlQD1J0wMJKB5fn8k5zAzmQlJSDIh5/N+vZ7XzJzzzDPPnDkz853nLOMyxhgBAAA4lDvaHQAAAIgmwhAAAHA0whAAAHA0whAAAHA0whAAAHA0whAAAHA0whAAAHA0whAAAHA0b7Q70NkaGhr0xRdfKDk5WS6XK9rdAQAArWCMUU1NjfLy8uR2d+7YTY8PQ1988YUKCgqi3Q0AANAO27ZtU35+fqc+Ro8PQ8nJyZIaF2ZKSkqUewMAAFrD7/eroKDA/h7vTD0+DFmbxlJSUghDAAAcYbpiF5eo7kBdUVGhYcOG2UGlqKhIL730kj1/zJgxcrlcQeXaa6+NYo8BAEBPE9WRofz8fN11110aNGiQjDF65JFHNG3aNL377rs67rjjJElXX3217rjjDvs+CQkJ0eouAADogaIahqZOnRp0+84771RFRYU2bNhgh6GEhATl5OREo3sAAMABus15hr799ls99dRTqq2tVVFRkT39iSeeUEZGhoYOHary8nLt3bu3xXbq6+vl9/uDCgAAQCRR34H6vffeU1FRkerq6pSUlKSVK1fq2GOPlSRdfPHFKiwsVF5enjZv3qxbbrlFW7Zs0YoVKyK2t3DhQi1YsKCrug8AAI5wLmOMiWYH9u/fr08//VR79uzR8uXL9eCDD6qqqsoORIH+8pe/6KyzztJHH32kAQMGhG2vvr5e9fX19m3r0Lw9e/ZwNBkAAEcIv9+v1NTULvn+jnoYCjV+/HgNGDBA9913X7N5tbW1SkpK0urVqzVx4sRWtdeVCxMAAHSMrvz+7jb7DFkaGhqCRnYCbdq0SZKUm5vbhT0CAAA9WVT3GSovL1dJSYn69OmjmpoaLVu2TOvWrdOaNWu0detWLVu2TJMmTVJ6ero2b96suXPnavTo0Ro2bFg0uw0AAHqQqIahnTt36rLLLtP27duVmpqqYcOGac2aNfrOd76jbdu26ZVXXtE999yj2tpaFRQUqLS0VLfeems0uwwAAHqYbrfPUEdjnyEAAI48jt5nCAAAoCsRhgAAgKMRhgAAgKMRhoBOUlZZrLLK4mh3AwBwCIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhdIgXlpTohSUl0e4GAABtRhgCAACORhgCAACORhgCAACORhgCAACORhgCAACORhgCAACORhgCAACORhgCAACORhgCAACORhhCl6l8uFiVDxdHuxsAAAQhDAEAAEcjDAEAAEcjDAEAAEcjDAEAAEcjDAEAAEcjDAEAAEeLahiqqKjQsGHDlJKSopSUFBUVFemll16y59fV1Wn27NlKT09XUlKSSktLtWPHjij2GAAA9DRRDUP5+fm66667tHHjRr3zzjsaN26cpk2bpv/5n/+RJM2dO1fPP/+8nn32WVVVVemLL77Q9OnTo9llAADQw3ij+eBTp04Nun3nnXeqoqJCGzZsUH5+vh566CEtW7ZM48aNkyQ9/PDDOuaYY7Rhwwadeuqp0egyAADoYbrNPkPffvutnnrqKdXW1qqoqEgbN27UgQMHNH78eLvOkCFD1KdPH61fvz5iO/X19fL7/UEFAAAgkqiODEnSe++9p6KiItXV1SkpKUkrV67Uscceq02bNik2Nla9evUKqp+dna3q6uqI7S1cuFALFizo5F7D8uJDkxqvuCLXWbmk5JB12uOhRydIkmZe9nLHNgwAcJSojwwNHjxYmzZt0ltvvaVZs2bp8ssv1wcffNDu9srLy7Vnzx67bNu2rQN7CwAAepqojwzFxsZq4MCBkqQRI0bob3/7m37729/qggsu0P79+7V79+6g0aEdO3YoJycnYns+n08+n6+zuw0AAHqIqI8MhWpoaFB9fb1GjBihmJgYrV271p63ZcsWffrppyoqKopiDwEAQE8S1ZGh8vJylZSUqE+fPqqpqdGyZcu0bt06rVmzRqmpqZo5c6bKysqUlpamlJQUzZkzR0VFRRxJBgAAOkxUw9DOnTt12WWXafv27UpNTdWwYcO0Zs0afec735Ek3X333XK73SotLVV9fb0mTpyoe++9N5pdBgAAPUxUw9BDDz3U4vy4uDgtWrRIixYt6qIeAQAAp+l2+wwBAAB0JcIQAABwNMIQAABwNMIQ0INMeu5mTXru5mh3AwCOKIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhAADgaIQhNPPqg5P16oOTW6yz+qFJYac/v6REzy8p6YxuHZZFj0/UoscnRuWxb6gs1g2VxVF5bADAoRGGAACAoxGGAACAoxGGAACAoxGGAACAoxGGAACAoxGGAACAoxGGAACAoxGGAACAoxGGAACAoxGGAACAoxGGYKt6IPxfcLzy4CS98mD4v9/AkWfSygWatHJBu+47ecXdmrzi7g7uEQBEF2EIAAA4GmEIAAA4GmEIAAA4GmEIAAA4GmEIAAA4GmEIAAA4GmEIAAA4GmEIAAA4GmEIAAA4GmHI4f76wBT99YEp0e4GjmCTV/xOk1f8LuD2Ik1esahLHnvK8ic0ZfkTXfJYAHouwhAAAHA0whAAAHA0whAAAHA0whAAAHA0whAAAHC0qIahhQsXauTIkUpOTlZWVpbOOeccbdmyJajOmDFj5HK5gsq1114bpR4DAICeJqphqKqqSrNnz9aGDRv05z//WQcOHNCECRNUW1sbVO/qq6/W9u3b7fKLX/wiSj0GAAA9jTeaD7569eqg20uXLlVWVpY2btyo0aNH29MTEhKUk5PT1d0DAAAO0K32GdqzZ48kKS0tLWj6E088oYyMDA0dOlTl5eXau3dvxDbq6+vl9/uDCgAAQCTdJgw1NDToxhtv1Omnn66hQ4fa0y+++GI9/vjjevXVV1VeXq7HHntMl156acR2Fi5cqNTUVLsUFBR0RfcRxqolJR3a3iNLJ3Roe79dNrHd9/3JMxP1k2faf/+eYPKK30a7CwDQIaK6mSzQ7Nmz9f777+v1118Pmn7NNdfY148//njl5ubqrLPO0tatWzVgwIBm7ZSXl6usrMy+7ff7CUQAACCibhGGrr/+er3wwgt67bXXlJ+f32LdUaNGSZI++uijsGHI5/PJ5/N1Sj8BAEDPE9UwZIzRnDlztHLlSq1bt079+vU75H02bdokScrNze3k3gEAACeIahiaPXu2li1bplWrVik5OVnV1dWSpNTUVMXHx2vr1q1atmyZJk2apPT0dG3evFlz587V6NGjNWzYsGh2HQAA9BBRDUMVFRWSGk+sGOjhhx/WFVdcodjYWL3yyiu65557VFtbq4KCApWWlurWW2+NQm8BAEBPFPXNZC0pKChQVVVVF/UGAAA4Ubc5tB4AACAaCEMAAMDRCEMAAMDRCEMAAMDRCEM92N/um6q/3Tc17Lw37p+iN+6f0sU9AoJNrnxQkysflCRNqVwS5d4AcCrCEAAAcDTCEAAAcDTCEAAAcDTCEAAAcDTCEAAAcDTCEAAAcDTCEAAAcDTCEAAAcDTCEAAAcDRvtDuA9nv/3rM19Lo/Rrsb3dLixybq2u+uaTb9909MlCTNuaT5vFB3PdVY94cXHrpue136XLEk6fFzVndou5Oe+2HTNY8k6cVz7jys9iav+HXTtcP7/TS58n5J0p9KrzmsdlpjyvKn9MJ5F3b64wA48jEyBAA
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.countplot(x=[len(df.loc[i]['message']) for i in range(len(df))])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we have 38695 words in our Dataframe\n",
"the average word count in every sentence is 15\n"
]
},
{
"data": {
"text/plain": [
"([14, 22, 19, 11, 17], 38695, 15)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text_words_lengths = [len(df.loc[i]['message'].split()) for i in range(0, len(df))]\n",
"total_length = np.sum(text_words_lengths)\n",
"text_words_mean = int(np.mean(text_words_lengths))\n",
"print('we have ' + str(total_length) + ' words in our Dataframe')\n",
2022-12-10 21:21:53 +00:00
"print('the average word count in every sentence is ' + str(text_words_mean))\n",
"text_words_lengths[:5], total_length, text_words_mean"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "tf",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2022-12-07 23:44:12 +00:00
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
2022-12-07 23:44:12 +00:00
"hash": "86eece18b6898e5d361741678d0e9a4298e9b9ab2411f93d35b863e6e254e93a"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}