Use a Rust server to serve the model via a high-level API
This commit is contained in:
parent
460b51bca7
commit
805ee41e43
|
@ -0,0 +1,33 @@
|
|||
# Builds the model server image and pushes it to the GitHub Container Registry
# on every push to the main branch.
name: Build Docker

on:
  push:
    branches:
      - "main"

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  docker:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      # The image build copies the saved model directory, whose files are
      # tracked with Git LFS. Check the repository out with LFS objects and
      # build from that checkout so the real files end up in the image.
      - name: Checkout
        uses: actions/checkout@v3
        with:
          lfs: true
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Login to Container registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push
        uses: docker/build-push-action@v3
        with:
          # Use the checked-out working tree instead of the implicit Git context.
          context: .
          push: true
          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
|
|
@ -2,4 +2,5 @@
|
|||
/hyper_tuning
|
||||
/hypertuner_logs
|
||||
/target
|
||||
/logs
|
||||
/logs
|
||||
/wandb
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,2 @@
|
|||
[workspace]
|
||||
members = ["crates/model_server"]
|
|
@ -0,0 +1,13 @@
|
|||
# Image that builds the model_server crate and runs it with the bundled model.
FROM rust:1.64 AS builder

WORKDIR /app
# Keep the crates/ directory level: the workspace manifest lists
# "crates/model_server" as its member, so the sources must live in /app/crates.
COPY ./crates /app/crates
COPY ./Cargo.toml /app
COPY ./Cargo.lock /app
RUN cargo build --release

# Directory the server loads the SavedModel from at startup.
ENV MODEL_PATH=/app/models/matrix_spam
# Copy the model files to the image
COPY ./models/spam_keras_1664303305.1441052 /app/models/matrix_spam

CMD ["./target/release/model_server"]
|
|
@ -0,0 +1,19 @@
|
|||
[package]
|
||||
edition = "2021"
|
||||
name = "model_server"
|
||||
publish = false
|
||||
version = "0.1.0"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
axum = "0.5.16"
|
||||
axum-auth = {version = "0.3", default-features = false, features = ["auth-bearer"]}
|
||||
color-eyre = "0.6.2"
|
||||
once_cell = "1.15.0"
|
||||
serde = {version = "1.0", features = ["derive"]}
|
||||
serde_json = "1.0"
|
||||
tensorflow = {version = "0.19.1", features = ["tensorflow_gpu"]}
|
||||
tokio = {version = "1.0", features = ["full"]}
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = {version = "0.3", features = ["env-filter"]}
|
|
@ -0,0 +1,165 @@
|
|||
use axum::{http::StatusCode, response::IntoResponse, routing::post, Json, Router};
|
||||
use axum_auth::AuthBearer;
|
||||
use color_eyre::eyre::{bail, Result};
|
||||
use once_cell::sync::OnceCell;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io::Write;
|
||||
use std::{fs::OpenOptions, net::SocketAddr};
|
||||
use tensorflow::{Graph, SavedModelBundle, SessionOptions, SessionRunArgs, Tensor};
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
static GRAPH: OnceCell<Graph> = OnceCell::new();
|
||||
static MODEL: OnceCell<SavedModelBundle> = OnceCell::new();
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
color_eyre::install()?;
|
||||
// initialize tracing
|
||||
tracing_subscriber::fmt::init();
|
||||
info!("Starting up");
|
||||
|
||||
let model_path = match std::env::var("MODEL_PATH") {
|
||||
Ok(val) => val,
|
||||
Err(_) => bail!("Missing MODEL_PATH env var"),
|
||||
};
|
||||
|
||||
let mut graph = Graph::new();
|
||||
let bundle =
|
||||
SavedModelBundle::load(&SessionOptions::new(), &["serve"], &mut graph, model_path)?;
|
||||
GRAPH.set(graph).unwrap();
|
||||
MODEL.set(bundle).unwrap();
|
||||
|
||||
// build our application with a route
|
||||
let app = Router::new()
|
||||
// `GET /test` goes to `test`
|
||||
.route("/test", post(test))
|
||||
// `POST /submit` goes to `submit`
|
||||
.route("/submit", post(submit))
|
||||
.route("/submit_review", post(submit_for_review));
|
||||
|
||||
let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
|
||||
info!("listening on {}", addr);
|
||||
axum::Server::bind(&addr)
|
||||
.serve(app.into_make_service())
|
||||
.await
|
||||
.unwrap();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn test(Json(payload): Json<TestData>) -> impl IntoResponse {
|
||||
let bundle = MODEL.get().unwrap();
|
||||
let graph = GRAPH.get().unwrap();
|
||||
let session = &bundle.session;
|
||||
let meta = bundle.meta_graph_def();
|
||||
debug!("Signatures: {:#?}", meta.signatures());
|
||||
let signature = meta
|
||||
.get_signature(tensorflow::DEFAULT_SERVING_SIGNATURE_DEF_KEY)
|
||||
.unwrap();
|
||||
debug!("Inputs: {:#?}", signature.inputs());
|
||||
debug!("Outputs: {:#?}", signature.outputs());
|
||||
let input_info = signature.get_input("input_1").unwrap();
|
||||
let output_info = signature.get_output("output_1").unwrap();
|
||||
|
||||
let input_op = graph
|
||||
.operation_by_name_required(&input_info.name().name)
|
||||
.unwrap();
|
||||
let output_op = graph
|
||||
.operation_by_name_required(&output_info.name().name)
|
||||
.unwrap();
|
||||
|
||||
let tensor: Tensor<String> = Tensor::from(&[payload.input_data.clone()]);
|
||||
let mut args = SessionRunArgs::new();
|
||||
args.add_feed(&input_op, 0, &tensor);
|
||||
|
||||
let out = args.request_fetch(&output_op, 0);
|
||||
|
||||
session
|
||||
.run(&mut args)
|
||||
.expect("Error occurred during calculations");
|
||||
let out_res: f32 = args.fetch(out).unwrap()[0];
|
||||
|
||||
let response = Prediction {
|
||||
input_data: payload.input_data,
|
||||
score: out_res,
|
||||
};
|
||||
|
||||
(StatusCode::OK, Json(response))
|
||||
}
|
||||
|
||||
async fn submit(
|
||||
Json(payload): Json<SubmitData>,
|
||||
AuthBearer(token): AuthBearer,
|
||||
) -> impl IntoResponse {
|
||||
let access_token = match std::env::var("ACCESS_TOKEN") {
|
||||
Ok(val) => val,
|
||||
Err(_) => {
|
||||
error!("Missing ACCESS_TOKEN env var");
|
||||
return StatusCode::INTERNAL_SERVER_ERROR;
|
||||
}
|
||||
};
|
||||
if token != access_token {
|
||||
return StatusCode::UNAUTHORIZED;
|
||||
}
|
||||
|
||||
// TODO implement
|
||||
StatusCode::NOT_IMPLEMENTED
|
||||
}
|
||||
|
||||
async fn submit_for_review(
|
||||
Json(payload): Json<SubmitReview>,
|
||||
AuthBearer(token): AuthBearer,
|
||||
) -> impl IntoResponse {
|
||||
let access_token = match std::env::var("ACCESS_TOKEN") {
|
||||
Ok(val) => val,
|
||||
Err(_) => {
|
||||
error!("Missing ACCESS_TOKEN env var");
|
||||
return StatusCode::INTERNAL_SERVER_ERROR;
|
||||
}
|
||||
};
|
||||
if token != access_token {
|
||||
return StatusCode::UNAUTHORIZED;
|
||||
}
|
||||
|
||||
std::fs::create_dir_all("./data/").unwrap();
|
||||
let file = OpenOptions::new()
|
||||
.write(true)
|
||||
.append(true)
|
||||
.create(true)
|
||||
.open("./data/review.txt");
|
||||
match file {
|
||||
Ok(mut file) => {
|
||||
if let Err(e) = writeln!(file, "{}", payload.input_data) {
|
||||
eprintln!("Couldn't write to file: {}", e);
|
||||
return StatusCode::INTERNAL_SERVER_ERROR;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("Couldn't open file: {}", e);
|
||||
return StatusCode::INTERNAL_SERVER_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
StatusCode::OK
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
struct TestData {
|
||||
input_data: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
struct Prediction {
|
||||
input_data: String,
|
||||
score: f32,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
struct SubmitData {
|
||||
input_data: String,
|
||||
spam: bool,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize)]
|
||||
struct SubmitReview {
|
||||
input_data: String,
|
||||
}
|
|
@ -5596,4 +5596,5 @@ ham Greg, can you call me back once you get this?
|
|||
ham I'm writing a bot. I want to be able to provide the bot with the cross signing recovery key from Element, then have it grab all the room keys from the server. Is that currently possible and if so how?
|
||||
ham O thanks, i'll take a look
|
||||
ham Hello all you Element experts, I'm setting up a homelab with Synapse and Element-web. I have read that there are security downsides in hosting both on the same domain. Can someone confirm whether it is still true, even on different subdomains ?
|
||||
ham Greg, can you call me back once you get this?
|
||||
ham Greg, can you call me back once you get this?
|
||||
spam You have won a million dollars! Fill out your bank details here...
|
286
model.py
286
model.py
|
@ -1,286 +0,0 @@
|
|||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import keras_tuner as kt
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import tensorflow as tf
|
||||
import tensorflow_addons as tfa
|
||||
from nltk.corpus import stopwords
|
||||
from tensorflow import keras
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
|
||||
print("[Step 1/9] Loading data")
|
||||
# Read data
|
||||
data = pd.read_csv('./input/MatrixData', sep='\t')
|
||||
|
||||
|
||||
def remove_stopwords(input_text):
    '''
    Remove English stopwords and single-character tokens from one message.

    The function works on a single string (it is applied element-wise to the
    message column), not on a whole Pandas Series.

    Parameters:
        input_text : text to clean; split on whitespace
    Output:
        the remaining words joined by single spaces
    '''
    # A set gives constant-time membership tests; the NLTK list would be
    # scanned once for every word of the message.
    stopwords_set = set(stopwords.words('english'))
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = {"n't", "not", "no"}
    clean_words = [
        word for word in input_text.split()
        if (word not in stopwords_set or word in whitelist) and len(word) > 1
    ]
    return " ".join(clean_words)
|
||||
|
||||
|
||||
# Remove unknown
|
||||
data.dropna(inplace=True)
|
||||
|
||||
# Convert label to something useful
|
||||
|
||||
|
||||
def change_labels(x):
    """Map a raw label to its numeric class: 1 for "spam", 0 for anything else."""
    if x == "spam":
        return 1
    return 0
|
||||
|
||||
|
||||
data['label'] = data['label'].apply(change_labels)
|
||||
|
||||
# Count by label
|
||||
# Module-level tallies updated by count_labels.
spam, ham = 0, 0


def count_labels(x):
    """Increment the module-level spam/ham tally for one label and return the label.

    A label equal to 1 counts as spam; every other value counts as ham. The
    label is returned unchanged so the function can be used with `.apply`.
    """
    global spam, ham
    if x != 1:
        ham = ham + 1
    else:
        spam = spam + 1
    return x
|
||||
# .apply(count_labels)
|
||||
#print("Spam: ", spam)
|
||||
#print("Ham: ", ham)
|
||||
|
||||
|
||||
# Remove stopwords
|
||||
data['message'] = data['message'].apply(
|
||||
remove_stopwords)
|
||||
|
||||
# Print unbalanced
|
||||
print(data.groupby('label').describe().T)
|
||||
|
||||
|
||||
#ham_msg = data[data.label == 0]
|
||||
#spam_msg = data[data.label == 1]
|
||||
|
||||
# randomly taking data from ham_msg
|
||||
#ham_msg = ham_msg.sample(n=len(spam_msg)*2, random_state=42)
|
||||
|
||||
#data = pd.concat([ham_msg, spam_msg]).reset_index(drop=True)
|
||||
|
||||
# Balanced
|
||||
print(data.groupby('label').describe().T)
|
||||
|
||||
# Shuffle data
|
||||
data = data.sample(frac=1).reset_index(drop=True)
|
||||
|
||||
# Split data into messages and label sets
|
||||
sentences = data['message'].tolist()
|
||||
labels = data['label'].tolist()
|
||||
|
||||
# Separate out the sentences and labels into training and test sets
|
||||
#training_size = int(len(sentences) * 0.8)
|
||||
training_size = int(len(sentences) * 0.7)
|
||||
training_sentences = sentences[0:training_size]
|
||||
testing_sentences = sentences[training_size:]
|
||||
training_labels = labels[0:training_size]
|
||||
testing_labels = labels[training_size:]
|
||||
|
||||
# Make labels into numpy arrays for use with the network later
|
||||
training_labels_final = np.array(training_labels)
|
||||
testing_labels_final = np.array(testing_labels)
|
||||
|
||||
print("[Step 2/9] Tokenizing data")
|
||||
vocab_size = 1000
|
||||
embedding_dim = 16
|
||||
#embedding_dim = 32
|
||||
#max_length = 120
|
||||
max_length = None
|
||||
trunc_type = 'post'
|
||||
padding_type = 'post'
|
||||
oov_tok = "<OOV>"
|
||||
|
||||
|
||||
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
|
||||
|
||||
tokenizer.fit_on_texts(training_sentences)
|
||||
word_index = tokenizer.word_index
|
||||
|
||||
sequences = tokenizer.texts_to_sequences(training_sentences)
|
||||
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type,
|
||||
truncating=trunc_type)
|
||||
|
||||
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
|
||||
testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
|
||||
padding=padding_type, truncating=trunc_type)
|
||||
|
||||
|
||||
print("[Step 3/9] Prepare callbacks")
|
||||
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
|
||||
|
||||
hypermodel_logdir = "logs/scalars/" + \
|
||||
datetime.now().strftime("%Y%m%d-%H%M%S") + "_hypermodel"
|
||||
hypermodel_tensorboard_callback = keras.callbacks.TensorBoard(
|
||||
log_dir=hypermodel_logdir)
|
||||
|
||||
hypertuner_logdir = "hypertuner_logs/scalars/" + \
|
||||
datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
hypertuner_tensorboard_callback = keras.callbacks.TensorBoard(
|
||||
log_dir=hypertuner_logdir)
|
||||
# Define the checkpoint directory to store the checkpoints.
|
||||
checkpoint_dir = './training_checkpoints'
|
||||
# Define the name of the checkpoint files.
|
||||
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
|
||||
|
||||
progress_bar = tf.keras.callbacks.ProgbarLogger()
|
||||
|
||||
#es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
|
||||
|
||||
print("[Step 4/9] Creating model")
|
||||
|
||||
|
||||
class SpamDectionModel(kt.HyperModel):
    """Keras Tuner hypermodel for the spam classifier.

    Each call to `build` registers the tunable hyperparameters on `hp` and
    returns a compiled Sequential network that uses the sampled values. The
    embedding size and sequence length come from the module-level
    `vocab_size`, `embedding_dim` and `max_length` settings.
    """

    def build(self, hp):
        """Create and compile one candidate model from the hyperparameters in `hp`."""
        # Search space: width of the hidden Dense layer (6-512), the dropout
        # rate shared by both Dropout layers, and the L2 penalty of the hidden layer.
        dense_units = hp.Int('units', min_value=6, max_value=512, step=12)
        dropout_rate = hp.Float('dropout', min_value=.1, max_value=.9, step=.01)
        l2_factor = hp.Float('l2', min_value=0.0001, max_value=0.001, step=0.0001)

        embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, input_length=max_length)
        pooling = tf.keras.layers.GlobalAveragePooling1D()
        dropout_before = tf.keras.layers.Dropout(dropout_rate,)
        hidden = tf.keras.layers.Dense(
            units=dense_units,
            activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(l2_factor))
        dropout_after = tf.keras.layers.Dropout(dropout_rate,)
        output = tf.keras.layers.Dense(1, activation='sigmoid')
        network = tf.keras.Sequential(
            [embedding, pooling, dropout_before, hidden, dropout_after, output])

        # Adam was best so far
        # tf.keras.optimizers.Nadam() has similar results to Adam but a bit worse. second best
        learning_rate = hp.Choice('learning_rate', values=[
            1e-2, 1e-3, 1e-4, 1e-5, ])
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        network.compile(
            loss=tf.keras.losses.BinaryCrossentropy(),
            optimizer=optimizer,
            metrics=['accuracy'])

        return network
|
||||
|
||||
|
||||
print("[Step 5/9] Tuning hypervalues")
|
||||
tuner = kt.Hyperband(SpamDectionModel(),
|
||||
objective='val_accuracy',
|
||||
max_epochs=350,
|
||||
factor=3,
|
||||
directory='hyper_tuning',
|
||||
project_name='spam-keras')
|
||||
|
||||
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
|
||||
tuner.search(padded, training_labels_final, epochs=400, verbose=0,
|
||||
validation_data=(testing_padded, testing_labels_final), callbacks=[hypertuner_tensorboard_callback, stop_early, progress_bar])
|
||||
# Get the optimal hyperparameters
|
||||
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
|
||||
|
||||
print(f"""
|
||||
The hyperparameter search is complete. The optimal number of units in the first densely-connected
|
||||
layer is {best_hps.get('units')} and the optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.
|
||||
The optimal dropout rate is {best_hps.get('dropout')} and the optimal l2 rate is {best_hps.get('l2')}.
|
||||
""")
|
||||
|
||||
|
||||
print("[Step 6/9] Fitting initial model")
|
||||
num_epochs = 200
|
||||
model = tuner.hypermodel.build(best_hps)
|
||||
history = model.fit(padded,
|
||||
training_labels_final,
|
||||
epochs=num_epochs,
|
||||
verbose=0,
|
||||
callbacks=[tensorboard_callback, progress_bar],
|
||||
validation_data=(testing_padded, testing_labels_final))
|
||||
|
||||
|
||||
val_acc_per_epoch = history.history['val_accuracy']
|
||||
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 5
|
||||
print('Best epoch: %d' % (best_epoch,))
|
||||
print("Average train loss: ", np.average(history.history['loss']))
|
||||
print("Average test loss: ", np.average(history.history['val_loss']))
|
||||
|
||||
print("[Step 7/9] Building final model")
|
||||
hypermodel = tuner.hypermodel.build(best_hps)
|
||||
hypermodel_history = hypermodel.fit(padded, training_labels_final, verbose=0,
|
||||
epochs=best_epoch,
|
||||
callbacks=[hypermodel_tensorboard_callback,
|
||||
tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
|
||||
save_weights_only=True), progress_bar,
|
||||
# es_callback
|
||||
], validation_data=(testing_padded, testing_labels_final)
|
||||
)
|
||||
|
||||
print("Average train loss(hypermodel_history): ",
|
||||
np.average(hypermodel_history.history['loss']))
|
||||
print("Average test loss(hypermodel_history): ",
|
||||
np.average(hypermodel_history.history['val_loss']))
|
||||
|
||||
|
||||
print("[Step 8/9] Saving final model")
|
||||
# Save model
|
||||
hypermodel.save(f"./models/spam_keras_{time.time()}")
|
||||
|
||||
print("[Step 9/9] Testing final model")
|
||||
# Use the model to predict whether a message is spam
|
||||
text_messages = ['Greg, can you call me back once you get this?',
|
||||
'Congrats on your new iPhone! Click here to claim your prize...',
|
||||
'Really like that new photo of you',
|
||||
'Did you hear the news today? Terrible what has happened...',
|
||||
'Attend this free COVID webinar today: Book your session now...',
|
||||
'Are you coming to the party tonight?',
|
||||
'Your parcel has gone missing',
|
||||
'Do not forget to bring friends!',
|
||||
'You have won a million dollars! Fill out your bank details here...',
|
||||
'Looking forward to seeing you again',
|
||||
'oh wow https://github.com/MGCodesandStats/tensorflow-nlp/blob/master/spam%20detection%20tensorflow%20v2.ipynb works really good on spam detection. Guess I go with that as the base model then lol :D',
|
||||
'ayo',
|
||||
'Almost all my spam is coming to my non-gmail address actually',
|
||||
'Oh neat I think I found the sizing sweetspot for my data :D',
|
||||
'would never click on buttons in gmail :D always expecting there to be a bug in gmail that allows js to grab your google credentials :D XSS via email lol. I am too scared for touching spam in gmail',
|
||||
'back to cacophony ',
|
||||
'Room version 11 when',
|
||||
'skip 11 and go straight to 12',
|
||||
'100 events should clear out any events that might be causing a request to fail lol']
|
||||
|
||||
# print(text_messages)
|
||||
|
||||
# Create the sequences
|
||||
padding_type = 'post'
|
||||
sample_sequences = tokenizer.texts_to_sequences(text_messages)
|
||||
fakes_padded = pad_sequences(
|
||||
sample_sequences, padding=padding_type, maxlen=max_length)
|
||||
|
||||
classes = hypermodel.predict(fakes_padded)
|
||||
|
||||
# The closer the class is to 1, the more likely that the message is spam
|
||||
for x in range(len(text_messages)):
|
||||
print(f"Message: \"{text_messages[x]}\"")
|
||||
print(f"Likeliness of spam in percentage: {classes[x][0]:.5f}")
|
||||
print('\n')
|
||||
|
||||
|
||||
#tf.keras.utils.plot_model(model, rankdir="LR", show_shapes=True)
|
120
model_v2.py
120
model_v2.py
|
@ -12,8 +12,9 @@ from nltk.corpus import stopwords
|
|||
from tensorflow import keras
|
||||
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||||
from tensorflow.keras.preprocessing.text import Tokenizer
|
||||
from wandb.keras import WandbCallback
|
||||
|
||||
print("TensorFlow version:", tf.__version__)
|
||||
import wandb
|
||||
|
||||
vocab_size = 1000
|
||||
embedding_dim = 16
|
||||
|
@ -46,10 +47,12 @@ progress_bar = tf.keras.callbacks.ProgbarLogger()
|
|||
|
||||
|
||||
class SpamDectionModel(tf.keras.Model):
|
||||
def __init__(self, vocab_size, embedding_dim, max_length, hp_units, hp_dropout, hp_l2):
|
||||
def __init__(self, vectorize_layer, vocab_size, embedding_dim, max_length, hp_units, hp_dropout, hp_l2):
|
||||
super(SpamDectionModel, self).__init__()
|
||||
#self.input_layer = tf.keras.Input(shape=(1,), dtype=tf.string)
|
||||
self.vectorize_layer = vectorize_layer
|
||||
self.embedding = tf.keras.layers.Embedding(
|
||||
vocab_size, embedding_dim, input_length=max_length, name="text_input")
|
||||
vocab_size + 1, embedding_dim, input_length=max_length, name="text_input")
|
||||
self.glob_average_pooling_1d = tf.keras.layers.GlobalAveragePooling1D()
|
||||
self.dropout = tf.keras.layers.Dropout(hp_dropout,)
|
||||
self.dense1 = tf.keras.layers.Dense(units=hp_units, activation='relu',
|
||||
|
@ -61,6 +64,8 @@ class SpamDectionModel(tf.keras.Model):
|
|||
|
||||
@tf.function
|
||||
def call(self, x, training=False):
|
||||
#x = self.input_layer(x)
|
||||
x = self.vectorize_layer(x)
|
||||
x = self.embedding(x)
|
||||
x = self.glob_average_pooling_1d(x)
|
||||
if training:
|
||||
|
@ -72,14 +77,21 @@ class SpamDectionModel(tf.keras.Model):
|
|||
|
||||
|
||||
class SpamDectionHyperModel(kt.HyperModel):
|
||||
def __init__(self, vectorize_layer, vocab_size, embedding_dim, max_length,):
|
||||
super(SpamDectionHyperModel, self).__init__()
|
||||
self.vectorize_layer = vectorize_layer
|
||||
self.vocab_size = vocab_size
|
||||
self.embedding_dim = embedding_dim
|
||||
self.max_length = max_length
|
||||
|
||||
def build(self, hp):
|
||||
# Tune the number of units in the first Dense layer
|
||||
# Choose an optimal value between 6-512
|
||||
hp_units = hp.Int('units', min_value=6, max_value=512, step=12)
|
||||
hp_dropout = hp.Float('dropout', min_value=.1, max_value=.9, step=.01)
|
||||
hp_l2 = hp.Float('l2', min_value=0.0001, max_value=0.001, step=0.0001)
|
||||
model = SpamDectionModel(
|
||||
vocab_size, embedding_dim, max_length, hp_units, hp_dropout, hp_l2)
|
||||
model = SpamDectionModel(self.vectorize_layer,
|
||||
self.vocab_size, self.embedding_dim, self.max_length, hp_units, hp_dropout, hp_l2)
|
||||
# Adam was best so far
|
||||
# tf.keras.optimizers.Nadam() has similar results to Adam but a bit worse. second best
|
||||
hp_learning_rate = hp.Choice('learning_rate', values=[
|
||||
|
@ -115,16 +127,16 @@ def remove_stopwords(input_text):
|
|||
def change_labels(x): return 1 if x == "spam" else 0
|
||||
|
||||
|
||||
def tokenize_data(data, training_sentences, testing_sentences):
|
||||
#tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
|
||||
def tokenize_data(data):
|
||||
# tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
|
||||
|
||||
# tokenizer.fit_on_texts(training_sentences)
|
||||
|
||||
#sequences = tokenizer.texts_to_sequences(training_sentences)
|
||||
# sequences = tokenizer.texts_to_sequences(training_sentences)
|
||||
# padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type,
|
||||
# truncating=trunc_type)
|
||||
|
||||
#testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
|
||||
# testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
|
||||
# testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
|
||||
# padding=padding_type, truncating=trunc_type)
|
||||
|
||||
|
@ -139,23 +151,23 @@ def tokenize_data(data, training_sentences, testing_sentences):
|
|||
vectorize_layer.adapt(data)
|
||||
|
||||
# Create the model that uses the vectorize text layer
|
||||
model = tf.keras.models.Sequential()
|
||||
# model = tf.keras.models.Sequential()
|
||||
# Start by creating an explicit input layer. It needs to have a shape of
|
||||
# (1,) (because we need to guarantee that there is exactly one string
|
||||
# input per batch), and the dtype needs to be 'string'.
|
||||
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
|
||||
# model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
|
||||
|
||||
# The first layer in our model is the vectorization layer. After this
|
||||
# layer, we have a tensor of shape (batch_size, max_len) containing vocab
|
||||
# indices.
|
||||
model.add(vectorize_layer)
|
||||
# model.add(vectorize_layer)
|
||||
|
||||
# Now, the model can map strings to integers, and you can add an embedding
|
||||
# layer to map these integers to learned embeddings.
|
||||
padded = model.predict(training_sentences)
|
||||
testing_padded = model.predict(testing_sentences)
|
||||
# padded = model.predict(training_sentences)
|
||||
# testing_padded = model.predict(testing_sentences)
|
||||
|
||||
return padded, testing_padded # , tokenizer
|
||||
return vectorize_layer # , tokenizer
|
||||
|
||||
|
||||
def load_data():
|
||||
|
@ -175,27 +187,18 @@ def load_data():
|
|||
sentences = data['message'].tolist()
|
||||
labels = data['label'].tolist()
|
||||
|
||||
# Separate out the sentences and labels into training and test sets
|
||||
# training_size = int(len(sentences) * 0.8)
|
||||
training_size = int(len(sentences) * 0.7)
|
||||
training_sentences = sentences[0:training_size]
|
||||
testing_sentences = sentences[training_size:]
|
||||
training_labels = labels[0:training_size]
|
||||
testing_labels = labels[training_size:]
|
||||
|
||||
# Make labels into numpy arrays for use with the network later
|
||||
training_labels_final = np.array(training_labels)
|
||||
testing_labels_final = np.array(testing_labels)
|
||||
padded, testing_padded = tokenize_data(
|
||||
sentences, training_sentences, testing_sentences)
|
||||
return padded, testing_padded, training_labels_final, testing_labels_final, sentences
|
||||
labels_final = np.array(labels)
|
||||
sentences_final = np.array(sentences)
|
||||
vectorize_layer = tokenize_data(sentences)
|
||||
return vectorize_layer, sentences_final, labels_final
|
||||
|
||||
|
||||
def train_hyperparamters(padded, training_labels_final, testing_padded, testing_labels_final, tuner):
|
||||
def train_hyperparamters(data, labels_final, tuner):
|
||||
stop_early = tf.keras.callbacks.EarlyStopping(
|
||||
monitor='val_loss', patience=5)
|
||||
tuner.search(padded, training_labels_final, epochs=5, verbose=1,
|
||||
validation_data=(testing_padded, testing_labels_final), callbacks=[hypertuner_tensorboard_callback, stop_early, progress_bar])
|
||||
tuner.search(data, labels_final, epochs=5, verbose=1, validation_split=0.3,
|
||||
callbacks=[hypertuner_tensorboard_callback, stop_early, progress_bar, WandbCallback()])
|
||||
|
||||
# Get the optimal hyperparameters
|
||||
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
|
||||
|
@ -209,15 +212,15 @@ def train_hyperparamters(padded, training_labels_final, testing_padded, testing_
|
|||
return best_hps
|
||||
|
||||
|
||||
def train_model(padded, training_labels_final, testing_padded, testing_labels_final, best_hps, tuner):
|
||||
def train_model(data, labels_final, best_hps, tuner):
|
||||
num_epochs = 200
|
||||
model = tuner.hypermodel.build(best_hps)
|
||||
history = model.fit(padded,
|
||||
training_labels_final,
|
||||
history = model.fit(data,
|
||||
labels_final,
|
||||
epochs=num_epochs,
|
||||
verbose=0,
|
||||
callbacks=[tensorboard_callback, progress_bar],
|
||||
validation_data=(testing_padded, testing_labels_final))
|
||||
verbose=1, validation_split=0.3,
|
||||
callbacks=[tensorboard_callback,
|
||||
progress_bar, WandbCallback()],)
|
||||
val_acc_per_epoch = history.history['val_accuracy']
|
||||
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 5
|
||||
print('Best epoch: %d' % (best_epoch,))
|
||||
|
@ -225,13 +228,13 @@ def train_model(padded, training_labels_final, testing_padded, testing_labels_fi
|
|||
print("Average test loss: ", np.average(history.history['val_loss']))
|
||||
|
||||
hypermodel = tuner.hypermodel.build(best_hps)
|
||||
hypermodel_history = hypermodel.fit(padded, training_labels_final, verbose=0,
|
||||
epochs=best_epoch,
|
||||
hypermodel_history = hypermodel.fit(data, labels_final, verbose=1,
|
||||
epochs=best_epoch, validation_split=0.3,
|
||||
callbacks=[hypermodel_tensorboard_callback,
|
||||
tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
|
||||
save_weights_only=True), progress_bar,
|
||||
save_weights_only=True), progress_bar, WandbCallback(),
|
||||
# es_callback
|
||||
], validation_data=(testing_padded, testing_labels_final)
|
||||
]
|
||||
)
|
||||
|
||||
print("Average train loss(hypermodel_history): ",
|
||||
|
@ -242,7 +245,7 @@ def train_model(padded, training_labels_final, testing_padded, testing_labels_fi
|
|||
return hypermodel
|
||||
|
||||
|
||||
def test_model(sentences, model):
|
||||
def test_model(vectorize_layer, model):
|
||||
# Use the model to predict whether a message is spam
|
||||
text_messages = ['Greg, can you call me back once you get this?',
|
||||
'Congrats on your new iPhone! Click here to claim your prize...',
|
||||
|
@ -268,19 +271,10 @@ def test_model(sentences, model):
|
|||
|
||||
# Create the sequences
|
||||
padding_type = 'post'
|
||||
#sample_sequences = tokenizer.texts_to_sequences(text_messages)
|
||||
# sample_sequences = tokenizer.texts_to_sequences(text_messages)
|
||||
# fakes_padded = pad_sequences(
|
||||
# sample_sequences, padding=padding_type, maxlen=max_length)
|
||||
|
||||
vectorize_layer = tf.keras.layers.TextVectorization(
|
||||
output_mode='int',
|
||||
output_sequence_length=max_length)
|
||||
vectorize_layer.adapt(sentences)
|
||||
vectorize_model = tf.keras.models.Sequential()
|
||||
vectorize_model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
|
||||
vectorize_model.add(vectorize_layer)
|
||||
sequences = vectorize_model.predict(text_messages)
|
||||
classes = model.predict(sequences)
|
||||
classes = model.predict(text_messages)
|
||||
|
||||
# The closer the class is to 1, the more likely that the message is spam
|
||||
for x in range(len(text_messages)):
|
||||
|
@ -290,22 +284,24 @@ def test_model(sentences, model):
|
|||
|
||||
|
||||
def main():
|
||||
print("TensorFlow version:", tf.__version__)
|
||||
# wandb.tensorboard.patch(root_logdir="logs/scalars/")
|
||||
wandb.init(project="matrix-spam", entity="mtrnord")
|
||||
print("[Step 1/6] Loading data")
|
||||
padded, testing_padded, training_labels_final, testing_labels_final, sentences = load_data()
|
||||
model = SpamDectionHyperModel()
|
||||
#print("[Step 2/6] Plotting model")
|
||||
#tf.keras.utils.plot_model(model, rankdir="LR", show_shapes=True)
|
||||
vectorize_layer, data, labels_final = load_data()
|
||||
model = SpamDectionHyperModel(
|
||||
vectorize_layer, vocab_size, embedding_dim, max_length)
|
||||
# print("[Step 2/6] Plotting model")
|
||||
# tf.keras.utils.plot_model(model, rankdir="LR", show_shapes=True)
|
||||
tuner = kt.Hyperband(model, hyperband_iterations=2,
|
||||
objective='val_accuracy',
|
||||
max_epochs=200,
|
||||
directory='hyper_tuning',
|
||||
project_name='spam-keras')
|
||||
print("[Step 3/6] Tuning hypervalues")
|
||||
best_hps = train_hyperparamters(
|
||||
padded, training_labels_final, testing_padded, testing_labels_final, tuner)
|
||||
best_hps = train_hyperparamters(data, labels_final, tuner)
|
||||
print("[Step 4/6] Training model")
|
||||
model = train_model(padded, training_labels_final,
|
||||
testing_padded, testing_labels_final, best_hps, tuner)
|
||||
model = train_model(data, labels_final, best_hps, tuner)
|
||||
|
||||
print("[Step 5/6] Saving model")
|
||||
export_path = f"./models/spam_keras_{time.time()}"
|
||||
|
@ -314,7 +310,7 @@ def main():
|
|||
model.save(export_path)
|
||||
|
||||
print("[Step 6/6] Testing model")
|
||||
test_model(sentences, model)
|
||||
test_model(vectorize_layer, model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:25c72070889b01d22f200a7341b3b1a468a8c09a1441d4ddda179328b4407cea
|
||||
size 6338
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:de3f320d86418f990246e6077f3679cc72ce7914e0ea47038e64e91c92849cfc
|
||||
size 248724
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:8e89306b74b3ea1d58b2b1368899ccb9b5dfc501ba15f76011c1c228b4ac0a46
|
||||
size 262040
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:487f6aa1025c284ed111aefc8ec9ff93dd0c04e2d24007fcbfd284c63a1d8955
|
||||
size 1625
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:dff883e7488707e6998153c9b2e04cabeadd2564a3626a23bf33b36265928222
|
||||
size 7976
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:526c09bfacee767ce2cd7fc527eafffdf114be3b2f77f45e00250539e859e11d
|
||||
size 278772
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:65b73789b297e3343d54f17222f61749de3a8afecc8b635016dcedb2c634cc3d
|
||||
size 263443
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:146ae93e2ec7ceec263dcb5dba9d1f97c18aa3efe3d5d5f9f4e327f1c10d53ef
|
||||
size 1733
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:dff883e7488707e6998153c9b2e04cabeadd2564a3626a23bf33b36265928222
|
||||
size 7976
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:524b845b4f7794dbbf77aa159be96dc4c78b13df0611e967ad58d8fc5c573d9d
|
||||
size 278772
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c18c6b667bf9e18f64496fd13f13114d62a39a21cabb69606ba5d49f32bca66b
|
||||
size 263443
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:8b1ed8b62f43ac290d290889c4a036a85afe193246a63472dd6842b91d1220c7
|
||||
size 1733
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:dff883e7488707e6998153c9b2e04cabeadd2564a3626a23bf33b36265928222
|
||||
size 7976
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:855c80af1b87df2469cc77280d9632a4a313e891c8c419eccaead8b4fb946185
|
||||
size 278772
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:92261311d426ede421f32f0a05fad35cacd6809b0b0bba0ffb0629ef84530864
|
||||
size 263443
|
|
@ -0,0 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f0a87ae580f8a44c2cb333e7ee476b26a04d21b1b27d9dcc96286c4182cb2309
|
||||
size 1733
|
674
spam-keras.ipynb
674
spam-keras.ipynb
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue