matrix-spam-ml/bert.ipynb

662 lines
111 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"env: TF_GPU_ALLOCATOR=cuda_malloc_async\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-12-10 21:21:09.925809: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"2022-12-10 21:21:10.036521: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2022-12-10 21:21:11.247528: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-12-10 21:21:11.251948: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"2022-12-10 21:21:11.252191: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n",
"/home/marcel/.conda/envs/tf/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-12-10 21:21:12.482467: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
"To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"WARNING:tensorflow:Please fix your imports. Module tensorflow.python.training.tracking.data_structures has been moved to tensorflow.python.trackable.data_structures. The old module will be deleted in version 2.11.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"model\"\n",
"__________________________________________________________________________________________________\n",
" Layer (type) Output Shape Param # Connected to \n",
"==================================================================================================\n",
" text (InputLayer) [(None,)] 0 [] \n",
" \n",
" preprocessing (KerasLayer) {'input_type_ids': 0 ['text[0][0]'] \n",
" (None, 128), \n",
" 'input_word_ids': \n",
" (None, 128), \n",
" 'input_mask': (Non \n",
" e, 128)} \n",
" \n",
" BERT_encoder (KerasLayer) {'default': (None, 109482241 ['preprocessing[0][0]', \n",
" 768), 'preprocessing[0][1]', \n",
" 'pooled_output': ( 'preprocessing[0][2]'] \n",
" None, 768), \n",
" 'sequence_output': \n",
" (None, 128, 768), \n",
" 'encoder_outputs': \n",
" [(None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768), \n",
" (None, 128, 768)]} \n",
" \n",
" dropout (Dropout) (None, 768) 0 ['BERT_encoder[0][13]'] \n",
" \n",
" classifier (Dense) (None, 1) 769 ['dropout[0][0]'] \n",
" \n",
"==================================================================================================\n",
"Total params: 109,483,010\n",
"Trainable params: 109,483,009\n",
"Non-trainable params: 1\n",
"__________________________________________________________________________________________________\n",
"Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/2\n",
"Epoch 1/10\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-12-10 21:21:53.199015: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x424858d0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n",
"2022-12-10 21:21:53.199320: I tensorflow/compiler/xla/service/service.cc:181] StreamExecutor device (0): Host, Default Version\n",
"2022-12-10 21:21:53.212952: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n",
"2022-12-10 21:21:53.276686: I tensorflow/compiler/jit/xla_compilation_cache.cc:476] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"42/42 [==============================] - ETA: 0s - loss: 0.3651 - binary_accuracy: 0.8204"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-12-10 21:27:46.480163: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5625815040 exceeds 10% of free system memory.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"42/42 [==============================] - 378s 9s/step - loss: 0.3651 - binary_accuracy: 0.8204 - val_loss: 0.0910 - val_binary_accuracy: 0.9738\n",
"Epoch 2/10\n",
"42/42 [==============================] - ETA: 0s - loss: 0.0524 - binary_accuracy: 0.9850"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-12-10 21:33:39.322284: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5625815040 exceeds 10% of free system memory.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"42/42 [==============================] - 353s 8s/step - loss: 0.0524 - binary_accuracy: 0.9850 - val_loss: 0.0860 - val_binary_accuracy: 0.9843\n",
"Epoch 3/10\n",
"42/42 [==============================] - ETA: 0s - loss: 0.0194 - binary_accuracy: 0.9955"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-12-10 21:39:28.636886: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5625815040 exceeds 10% of free system memory.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"42/42 [==============================] - 348s 8s/step - loss: 0.0194 - binary_accuracy: 0.9955 - val_loss: 0.0950 - val_binary_accuracy: 0.9791\n",
"Epoch 4/10\n",
"42/42 [==============================] - ETA: 0s - loss: 0.0062 - binary_accuracy: 0.9985"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-12-10 21:45:18.897732: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5625815040 exceeds 10% of free system memory.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"42/42 [==============================] - 350s 8s/step - loss: 0.0062 - binary_accuracy: 0.9985 - val_loss: 0.0881 - val_binary_accuracy: 0.9860\n",
"Epoch 5/10\n",
"42/42 [==============================] - ETA: 0s - loss: 0.0037 - binary_accuracy: 0.9993"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2022-12-10 21:51:03.660678: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 5625815040 exceeds 10% of free system memory.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"42/42 [==============================] - 345s 8s/step - loss: 0.0037 - binary_accuracy: 0.9993 - val_loss: 0.0873 - val_binary_accuracy: 0.9843\n",
"Epoch 6/10\n",
"42/42 [==============================] - 349s 8s/step - loss: 0.0018 - binary_accuracy: 0.9993 - val_loss: 0.0896 - val_binary_accuracy: 0.9860\n",
"Epoch 7/10\n",
"42/42 [==============================] - 350s 8s/step - loss: 0.0017 - binary_accuracy: 0.9993 - val_loss: 0.0904 - val_binary_accuracy: 0.9825\n",
"Epoch 8/10\n",
"42/42 [==============================] - 347s 8s/step - loss: 9.7578e-04 - binary_accuracy: 1.0000 - val_loss: 0.0922 - val_binary_accuracy: 0.9843\n",
"Epoch 9/10\n",
"42/42 [==============================] - 350s 8s/step - loss: 7.7726e-04 - binary_accuracy: 1.0000 - val_loss: 0.0928 - val_binary_accuracy: 0.9843\n",
"Epoch 10/10\n",
"42/42 [==============================] - 349s 8s/step - loss: 6.2757e-04 - binary_accuracy: 1.0000 - val_loss: 0.0931 - val_binary_accuracy: 0.9843\n",
"18/18 [==============================] - 41s 2s/step - loss: 0.0931 - binary_accuracy: 0.9843\n",
"Loss: 0.09307406097650528\n",
"Accuracy: 0.9842932224273682\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:Found untraced functions such as restored_function_body, restored_function_body, restored_function_body, restored_function_body, restored_function_body while saving (showing 5 of 366). These functions will not be directly callable after loading.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Assets written to: ./bert_models/1670707256.1814783/assets\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:Assets written to: ./bert_models/1670707256.1814783/assets\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1/1 [==============================] - 2s 2s/step\n",
"Message: \"Greg, can you call me back once you get this?\"\n",
"Likeliness of spam in percentage: 0.00093\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Congrats on your new iPhone! Click here to claim your prize...\"\n",
"Likeliness of spam in percentage: 0.41126\n",
"Vote by AI: Not Spam\n",
"Model failed to predict correctly\n",
"\n",
"\n",
"Message: \"Really like that new photo of you\"\n",
"Likeliness of spam in percentage: 0.04146\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Did you hear the news today? Terrible what has happened...\"\n",
"Likeliness of spam in percentage: 0.00026\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Attend this free COVID webinar today: Book your session now...\"\n",
"Likeliness of spam in percentage: 0.99068\n",
"Vote by AI: Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Are you coming to the party tonight?\"\n",
"Likeliness of spam in percentage: 0.02573\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Your parcel has gone missing\"\n",
"Likeliness of spam in percentage: 0.00568\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Do not forget to bring friends!\"\n",
"Likeliness of spam in percentage: 0.01160\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"You have won a million dollars! Fill out your bank details here...\"\n",
"Likeliness of spam in percentage: 0.00105\n",
"Vote by AI: Not Spam\n",
"Model failed to predict correctly\n",
"\n",
"\n",
"Message: \"Looking forward to seeing you again\"\n",
"Likeliness of spam in percentage: 0.11693\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"oh wow https://github.com/MGCodesandStats/tensorflow-nlp/blob/master/spam%20detection%20tensorflow%20v2.ipynb works really good on spam detection. Guess I go with that as the base model then lol :D\"\n",
"Likeliness of spam in percentage: 0.53543\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"ayo\"\n",
"Likeliness of spam in percentage: 0.43455\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Almost all my spam is coming to my non-gmail address actually\"\n",
"Likeliness of spam in percentage: 0.04262\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Oh neat I think I found the sizing sweetspot for my data :D\"\n",
"Likeliness of spam in percentage: 0.00090\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"would never click on buttons in gmail :D always expecting there to be a bug in gmail that allows js to grab your google credentials :D XSS via email lol. I am too scared for touching spam in gmail\"\n",
"Likeliness of spam in percentage: 0.12625\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"back to cacophony \"\n",
"Likeliness of spam in percentage: 0.16637\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"Room version 11 when\"\n",
"Likeliness of spam in percentage: 0.38619\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"skip 11 and go straight to 12\"\n",
"Likeliness of spam in percentage: 0.15472\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"100 events should clear out any events that might be causing a request to fail lol\"\n",
"Likeliness of spam in percentage: 0.00092\n",
"Vote by AI: Not Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"Message: \"I'll help anyone interested on how to invest and earn $30k, $50k, $100k, $200k or more in just 72hours from the crypto market.But you will have to pay me my commission! when you receive your profit! if interested send me a direct message let's get started or via WhatsApp +1 (605) 9536801\"\n",
"Likeliness of spam in percentage: 0.99976\n",
"Vote by AI: Spam\n",
"Model predicted correctly\n",
"\n",
"\n",
"18 out of 20 are detected correctly\n",
"\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x600 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"!pip3 install --quiet tensorflow-text numpy pandas tf-models-official\n",
"%env TF_GPU_ALLOCATOR=cuda_malloc_async\n",
"%load_ext tensorboard\n",
"\n",
"import csv\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"tf.config.set_visible_devices([], 'GPU')\n",
"import tensorflow_hub as hub\n",
"import pandas as pd\n",
"import tensorflow as tf\n",
"import tensorflow_hub as hub\n",
"import tensorflow_models as tfm\n",
"#from official.nlp import optimization # to create AdamW optimizer\n",
"import tensorflow_text as text # needed even if unused\n",
"import time\n",
"import datetime\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def change_labels(x):\n",
"    \"\"\"Encode a raw label as an integer class: 1 for 'spam', 0 for any other value.\"\"\"\n",
"    return int(x == \"spam\")\n",
"\n",
"# Seed for the shuffle below, so the train/test split is identical on every run.\n",
"SHUFFLE_SEED = 42\n",
"# Share of the shuffled rows used for training; the remainder is held out.\n",
"TRAIN_FRACTION = 0.7\n",
"\n",
"raw_data = pd.read_csv(\n",
"    \"./input/MatrixData.tsv\", sep=\"\\t\", quoting=csv.QUOTE_NONE, encoding=\"utf-8\"\n",
")\n",
"\n",
"# Derive the cleaned frame in one expression instead of mutating it in place,\n",
"# so raw_data stays untouched and every step is visible in order.\n",
"data = (\n",
"    # Minimum length: keep messages with more than 18 whitespace-separated tokens\n",
"    raw_data[raw_data[\"message\"].str.split().str.len().gt(18)]\n",
"    # Remove rows that have a missing value in any column\n",
"    .dropna()\n",
"    # Encode the label column as integers (1 = spam, 0 = everything else)\n",
"    .assign(label=lambda frame: frame[\"label\"].apply(change_labels))\n",
"    # Shuffle the rows; random_state makes the order reproducible\n",
"    .sample(frac=1, random_state=SHUFFLE_SEED)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Split data into messages and label sets\n",
"sentences = data[\"message\"].tolist()\n",
"labels = data[\"label\"].tolist()\n",
"\n",
"# Separate out the sentences and labels into training and held-out sets\n",
"training_size = int(len(sentences) * TRAIN_FRACTION)\n",
"training_sentences = sentences[:training_size]\n",
"testing_sentences = sentences[training_size:]\n",
"training_labels = labels[:training_size]\n",
"testing_labels = labels[training_size:]\n",
"\n",
"# Convert to numpy arrays for use with tf.data later\n",
"train_examples = np.array(training_sentences)\n",
"train_labels = np.array(training_labels)\n",
"test_examples = np.array(testing_sentences)\n",
"test_labels = np.array(testing_labels)\n",
"\n",
"# Build the tf.data input pipelines\n",
"AUTOTUNE = tf.data.AUTOTUNE\n",
"batch_size = 32\n",
"\n",
"raw_train_ds = tf.data.Dataset.from_tensor_slices((train_examples, train_labels))\n",
"train_ds = raw_train_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)\n",
"\n",
"# Split the held-out examples into two disjoint halves. Previously val_ds and\n",
"# test_ds were built from exactly the same examples, so the final evaluate() call\n",
"# only repeated the last validation epoch instead of scoring unseen data.\n",
"raw_val_ds = tf.data.Dataset.from_tensor_slices((test_examples, test_labels))\n",
"val_size = len(test_examples) // 2\n",
"val_ds = raw_val_ds.take(val_size).batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)\n",
"test_ds = raw_val_ds.skip(val_size).batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)\n",
"\n",
"# TF Hub handles for the text preprocessing model and the BERT encoder.\n",
"# These are plain strings; build_classifier_model() hands them to hub.KerasLayer.\n",
"# Alternative https://tfhub.dev/google/electra_small/2\n",
"tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'\n",
"# Other encoder handles kept for reference; only the uncommented assignment is used.\n",
"#tfhub_handle_encoder = 'https://tfhub.dev/google/electra_small/2'\n",
"#tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2'\n",
"tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/2'\n",
"\n",
"def build_classifier_model(preprocess_handle=None, encoder_handle=None, dropout_rate=0.1):\n",
"    \"\"\"Build the spam classifier: raw text -> preprocessing -> encoder -> dropout -> sigmoid unit.\n",
"\n",
"    Args:\n",
"        preprocess_handle: TF Hub handle of the preprocessing model. Defaults to the\n",
"            module-level ``tfhub_handle_preprocess``.\n",
"        encoder_handle: TF Hub handle of the encoder, loaded with ``trainable=True``.\n",
"            Defaults to the module-level ``tfhub_handle_encoder``.\n",
"        dropout_rate: Dropout rate applied to the encoder's pooled output.\n",
"\n",
"    Returns:\n",
"        An uncompiled ``tf.keras.Model`` that maps a batch of strings to one sigmoid\n",
"        output per string.\n",
"    \"\"\"\n",
"    # Resolve the defaults at call time so later changes to the globals are picked up.\n",
"    if preprocess_handle is None:\n",
"        preprocess_handle = tfhub_handle_preprocess\n",
"    if encoder_handle is None:\n",
"        encoder_handle = tfhub_handle_encoder\n",
"\n",
"    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')\n",
"    preprocessing_layer = hub.KerasLayer(preprocess_handle, name='preprocessing')\n",
"    encoder_inputs = preprocessing_layer(text_input)\n",
"    encoder = hub.KerasLayer(encoder_handle, trainable=True, name='BERT_encoder')\n",
"    outputs = encoder(encoder_inputs)\n",
"    # Only the pooled output feeds the classification head.\n",
"    net = outputs['pooled_output']\n",
"    net = tf.keras.layers.Dropout(dropout_rate)(net)\n",
"    # Sigmoid activation: the model emits probabilities, matching the\n",
"    # BinaryCrossentropy(from_logits=False) loss it is compiled with.\n",
"    net = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(net)\n",
"    return tf.keras.Model(text_input, net)\n",
"\n",
"\n",
"# Instantiate the network and print its layer table.\n",
"classifier_model = build_classifier_model()\n",
"classifier_model.summary()\n",
"# NOTE(review): the value returned by plot_model is discarded (this is not the last\n",
"# expression of the cell), and no model diagram appears in the saved outputs.\n",
"# Confirm whether the diagram was meant to be displayed.\n",
"tf.keras.utils.plot_model(model=classifier_model, show_dtype=True)\n",
"\n",
"# The classifier head already applies a sigmoid, so the loss consumes probabilities\n",
"# (from_logits=False) rather than raw logits.\n",
"loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)\n",
"metrics = tf.metrics.BinaryAccuracy()\n",
"\n",
"# Training length: the step counts below size the learning-rate schedule.\n",
"epochs = 10\n",
"steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()\n",
"num_train_steps = steps_per_epoch * epochs\n",
"# The first 10% of all training steps are warmup steps.\n",
"num_warmup_steps = int(0.1 * num_train_steps)\n",
"\n",
"# Linear decay from init_lr down to 0 over the whole run ...\n",
"init_lr = 3e-5\n",
"linear_decay = tf.keras.optimizers.schedules.PolynomialDecay(\n",
"    initial_learning_rate=init_lr, end_learning_rate=0, decay_steps=num_train_steps\n",
")\n",
"# ... wrapped in a linear warmup that starts from a learning rate of 0.\n",
"warmup_schedule = tfm.optimization.lr_schedule.LinearWarmup(\n",
"    warmup_learning_rate=0,\n",
"    after_warmup_lr_sched=linear_decay,\n",
"    warmup_steps=num_warmup_steps,\n",
")\n",
"\n",
"# Plot the schedule sampled at 1001 evenly spaced points between 0 and num_train_steps.\n",
"x = tf.linspace(0, num_train_steps, 1001)\n",
"y = [warmup_schedule(xi) for xi in x]\n",
"plt.plot(x, y)\n",
"plt.xlabel('Train step')\n",
"plt.ylabel('Learning rate')\n",
"\n",
"\n",
"\n",
"# Adam driven by the warmup + decay schedule built above. An AdamW optimizer from\n",
"# official.nlp.optimization.create_optimizer was used here previously:\n",
"#optimizer = optimization.create_optimizer(init_lr=init_lr,\n",
"#                                          num_train_steps=num_train_steps,\n",
"#                                          num_warmup_steps=num_warmup_steps,\n",
"#                                          optimizer_type='adamw')\n",
"optimizer = tf.keras.optimizers.experimental.Adam(learning_rate=warmup_schedule)\n",
"\n",
"classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)\n",
"\n",
"print(f'Training model with {tfhub_handle_encoder}')\n",
"\n",
"# Each run logs to its own timestamped TensorBoard directory.\n",
"log_dir = \"logs/fit/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
"tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)\n",
"\n",
"history = classifier_model.fit(\n",
"    x=train_ds,\n",
"    validation_data=val_ds,\n",
"    epochs=epochs,\n",
"    callbacks=[tensorboard_callback],\n",
")\n",
"\n",
"# Final score on test_ds. Note that `loss` is rebound here from the loss object\n",
"# to the evaluated loss value.\n",
"loss, accuracy = classifier_model.evaluate(test_ds)\n",
"\n",
"print(f'Loss: {loss}')\n",
"print(f'Accuracy: {accuracy}')\n",
"\n",
"\n",
"# Per-epoch metrics recorded by fit().\n",
"history_dict = history.history\n",
"\n",
"# Distinct names, so the `loss` and `epochs` variables set earlier in the cell\n",
"# are not overwritten with objects of a different type.\n",
"train_loss = history_dict['loss']\n",
"val_loss = history_dict['val_loss']\n",
"train_acc = history_dict['binary_accuracy']\n",
"val_acc = history_dict['val_binary_accuracy']\n",
"epoch_numbers = range(1, len(train_acc) + 1)\n",
"\n",
"# Two stacked panels: loss on top, accuracy below.\n",
"fig, (ax_loss, ax_acc) = plt.subplots(2, 1, figsize=(10, 6))\n",
"\n",
"# r is for \"solid red line\", b is for \"solid blue line\"\n",
"ax_loss.plot(epoch_numbers, train_loss, 'r', label='Training loss')\n",
"ax_loss.plot(epoch_numbers, val_loss, 'b', label='Validation loss')\n",
"ax_loss.set_title('Training and validation loss')\n",
"ax_loss.set_ylabel('Loss')\n",
"ax_loss.legend()\n",
"\n",
"ax_acc.plot(epoch_numbers, train_acc, 'r', label='Training acc')\n",
"ax_acc.plot(epoch_numbers, val_acc, 'b', label='Validation acc')\n",
"ax_acc.set_title('Training and validation accuracy')\n",
"ax_acc.set_xlabel('Epochs')\n",
"ax_acc.set_ylabel('Accuracy')\n",
"ax_acc.legend(loc='lower right')\n",
"\n",
"# tight_layout only adjusts axes that already exist, so it has to run after the\n",
"# panels are drawn. It used to be called on the still-empty figure, where it did nothing.\n",
"fig.tight_layout()\n",
"\n",
"\n",
"# Export the trained model, without optimizer state, to a directory named after\n",
"# the current Unix timestamp.\n",
"saved_model_path = f'./bert_models/{time.time()}'\n",
"classifier_model.save(filepath=saved_model_path, include_optimizer=False)\n",
"\n",
"def test_model(model, threshold=0.8):\n",
"    \"\"\"Print the model's spam verdict for a fixed set of hand-labelled messages.\n",
"\n",
"    Args:\n",
"        model: Object whose ``predict`` accepts a string tensor and returns one row\n",
"            per message, with the spam score in column 0.\n",
"        threshold: Minimum score at which a message is voted spam. Defaults to 0.8,\n",
"            the cut-off that was previously hard-coded.\n",
"\n",
"    Returns:\n",
"        None. The per-message results and the final tally are printed.\n",
"    \"\"\"\n",
"    # (message, is_spam) pairs are kept together so a text can never drift out of\n",
"    # step with its expected label, as two parallel lists could.\n",
"    labelled_messages = [\n",
"        (\"Greg, can you call me back once you get this?\", False),\n",
"        (\"Congrats on your new iPhone! Click here to claim your prize...\", True),\n",
"        (\"Really like that new photo of you\", False),\n",
"        (\"Did you hear the news today? Terrible what has happened...\", False),\n",
"        (\"Attend this free COVID webinar today: Book your session now...\", True),\n",
"        (\"Are you coming to the party tonight?\", False),\n",
"        (\"Your parcel has gone missing\", False),\n",
"        (\"Do not forget to bring friends!\", False),\n",
"        (\"You have won a million dollars! Fill out your bank details here...\", True),\n",
"        (\"Looking forward to seeing you again\", False),\n",
"        (\"oh wow https://github.com/MGCodesandStats/tensorflow-nlp/blob/master/spam%20detection%20tensorflow%20v2.ipynb works really good on spam detection. Guess I go with that as the base model then lol :D\", False),\n",
"        (\"ayo\", False),\n",
"        (\"Almost all my spam is coming to my non-gmail address actually\", False),\n",
"        (\"Oh neat I think I found the sizing sweetspot for my data :D\", False),\n",
"        (\"would never click on buttons in gmail :D always expecting there to be a bug in gmail that allows js to grab your google credentials :D XSS via email lol. I am too scared for touching spam in gmail\", False),\n",
"        (\"back to cacophony \", False),\n",
"        (\"Room version 11 when\", False),\n",
"        (\"skip 11 and go straight to 12\", False),\n",
"        (\"100 events should clear out any events that might be causing a request to fail lol\", False),\n",
"        (\"I'll help anyone interested on how to invest and earn $30k, $50k, $100k, $200k or more in just 72hours from the crypto market.But you will have to pay me my commission! when you receive your profit! if interested send me a direct message let's get started or via WhatsApp +1 (605) 9536801\", True),\n",
"    ]\n",
"    text_messages = [message for message, _ in labelled_messages]\n",
"\n",
"    # One prediction row per message, in the same order as text_messages.\n",
"    results = model.predict(tf.constant(text_messages))\n",
"\n",
"    # The closer the score is to 1, the more likely that the message is spam\n",
"    correct = 0\n",
"    for (message, is_spam), row in zip(labelled_messages, results):\n",
"        score = row[0]\n",
"        print(f'Message: \"{message}\"')\n",
"        # The classifier built in this notebook ends in a sigmoid, so the score is a\n",
"        # 0-1 probability; the % format spec scales it by 100 so the printed number\n",
"        # really is the percentage the label promises.\n",
"        print(f\"Likeliness of spam in percentage: {score:.3%}\")\n",
"        voted_spam = bool(score >= threshold)\n",
"        print(\"Vote by AI: Spam\" if voted_spam else \"Vote by AI: Not Spam\")\n",
"\n",
"        if voted_spam == is_spam:\n",
"            correct += 1\n",
"            print(\"Model predicted correctly\")\n",
"        else:\n",
"            print(\"Model failed to predict correctly\")\n",
"        print(\"\\n\")\n",
"    print(f\"{correct} out of {len(labelled_messages)} are detected correctly\\n\")\n",
"\n",
"\n",
"# Run the hand-labelled example messages through the model trained above.\n",
"test_model(classifier_model)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.15 ('tf')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15 (main, Nov 24 2022, 14:31:59) \n[GCC 11.2.0]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "86eece18b6898e5d361741678d0e9a4298e9b9ab2411f93d35b863e6e254e93a"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}