matrix-spam-ml/dataset_analysis.ipynb

202 lines
39 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook is mainly used to debug the dataset and to inspect how the data is distributed. It also generates the dataset statistics and writes the cleaned dataset to `./input/MatrixData_cleaned.csv`."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
"import numpy as np # numerical computing\n",
"import pandas as pd # data analysis, working with DataFrames\n",
"import seaborn as sns\n",
"from nltk.corpus import stopwords\n",
"\n",
"def remove_stopwords(input_text):\n",
"    \"\"\"\n",
"    Remove English stopwords and one-character tokens from a string.\n",
"\n",
"    The text is split on whitespace. A token is dropped when its lowercase\n",
"    form is an English stopword that is not whitelisted, or when it is a\n",
"    single character. Kept tokens retain their original casing and are\n",
"    re-joined with single spaces.\n",
"\n",
"    Parameters:\n",
"        input_text : string to clean\n",
"    Output:\n",
"        cleaned string\n",
"    \"\"\"\n",
"    # Some words which might indicate a certain sentiment are kept via a whitelist\n",
"    whitelist = {\"n't\", \"not\", \"no\"}\n",
"    # A set gives O(1) membership tests; removing the whitelist up front means\n",
"    # whitelisted words are never treated as stopwords.\n",
"    stopword_set = set(stopwords.words(\"english\")) - whitelist\n",
"    clean_words = [\n",
"        word\n",
"        for word in input_text.split()\n",
"        # Stopword entries are lowercase, so compare case-insensitively;\n",
"        # otherwise capitalised stopwords such as 'The' would be kept.\n",
"        if len(word) > 1 and word.lower() not in stopword_set\n",
"    ]\n",
"    return \" \".join(clean_words)\n",
"\n",
"# Lowercasing helper\n",
"def lower_casing_text(text):\n",
"    \"\"\"\n",
"    Return a lowercase copy of the given text.\n",
"\n",
"    arguments:\n",
"        text: the string to convert.\n",
"\n",
"    return:\n",
"        the same text with every uppercase letter converted to lowercase.\n",
"\n",
"    Example:\n",
"        Input : The World is Full of Surprises!\n",
"        Output : the world is full of surprises!\n",
"    \"\"\"\n",
"    return text.lower()\n",
"\n",
"df_raw = pd.read_csv(\"./input/MatrixData.tsv\", sep='\\t', quoting=csv.QUOTE_NONE, encoding='utf-8')\n",
"# Keep every spam message, but only ham messages with at least 14\n",
"# whitespace-separated tokens, then force both columns to str.\n",
"df = (\n",
"    df_raw.query('(message.str.split().str.len() >= 14 & label == \"ham\") | label == \"spam\"')\n",
"    .assign(\n",
"        message=lambda frame: frame[\"message\"].astype(str),\n",
"        label=lambda frame: frame[\"label\"].astype(str),\n",
"    )\n",
"    .drop_duplicates()\n",
"    .reset_index(drop=True)\n",
")\n",
"# Lowercase BEFORE removing stopwords: NLTK's English stopword list is all\n",
"# lowercase, so capitalised stopwords ('The', 'And', ...) would otherwise\n",
"# survive the cleaning step.\n",
"df[\"message\"] = df[\"message\"].apply(lower_casing_text).apply(remove_stopwords)\n",
"# Cleaning can turn previously distinct messages into identical ones, so dedupe again.\n",
"df = df.drop_duplicates().reset_index(drop=True)\n",
"# NOTE(review): this shuffled copy is not used afterwards; the CSV below is\n",
"# written from the unshuffled df, and the shuffle has no fixed random_state.\n",
"data = df.sample(frac=1).reset_index(drop=True)\n",
"df.to_csv(\"./input/MatrixData_cleaned.csv\", encoding='utf-8', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: xlabel='label', ylabel='count'>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Class balance: number of messages per label\n",
"sns.countplot(data=df, x=\"label\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot: ylabel='count'>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Distribution of message lengths in characters. Series.str.len() is vectorised\n",
"# and, unlike a per-row df.loc[i] lookup, does not require a 0..n-1 index.\n",
"message_char_lengths = df['message'].str.len().tolist()\n",
"sns.countplot(x=message_char_lengths)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we have 38695 words in our Dataframe\n",
"the average word count in every sentence is 15\n"
]
},
{
"data": {
"text/plain": [
"([14, 22, 19, 11, 17], 38695, 15)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Word count per message (whitespace-separated tokens), computed with the\n",
"# vectorised .str accessor instead of a per-row df.loc[i] lookup.\n",
"text_words_lengths = df['message'].str.split().str.len().tolist()\n",
"total_length = np.sum(text_words_lengths)\n",
"# int() truncates the mean towards zero; it is not rounded.\n",
"text_words_mean = int(np.mean(text_words_lengths))\n",
"print('we have ' + str(total_length) + ' words in our Dataframe')\n",
"print('the average word count in every sentence is ' + str(text_words_mean))\n",
"text_words_lengths[:5], total_length, text_words_mean"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "tf",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "86eece18b6898e5d361741678d0e9a4298e9b9ab2411f93d35b863e6e254e93a"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}