Sentiment Classifier

In this example, we'll train and evaluate a sentiment classifier on a sample extracted from a 1.6 million tweet dataset available here.

Pre-Requisites

We recommend running this example in a virtual environment. We also recommend using Anaconda for machine learning.

Please complete the setup of your machine.
Install pandas, scikit-learn, Keras, and TensorFlow:

pip install pandas scikit-learn keras tensorflow

Create a folder named data and download the sample file from here into it. Divide this file into two parts: twitter_train_dataset.csv and twitter_test_dataset.csv. The program below assumes an 80-20 train-to-test split.

Sample Code

import random
import time

import pandas as pd
from keras import Sequential, layers
from keras.layers import Dropout
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

import markov
from markov.api.schemas.model_recording import SingleTagInferenceRecord, RecordCustomMetric

# GET PROJECT
# A project was created ahead of time for this model; you can create your own
# from the MarkovML SDK or the web UI and substitute its id below.
# NOTE(review): the id passed below differs from project_id 4FzUBTJv8f9uPb that
# this sample's notes mention -- confirm which one is intended.
project = markov.Project.get_by_id("t86dPNewEhKwkT")

# GET DATASET
# The same train and test segments are also uploaded to markov with
# dataset id: 3vRT5Ut6mhPqFGc23
train_data_location = "./data/twitter_train_dataset.csv"  # location of your dataset
test_data_location = "./data/twitter_test_dataset.csv"  # location of your dataset

# read data into dataframes
train_df = pd.read_csv(train_data_location, encoding='latin')
test_df = pd.read_csv(test_data_location, encoding='latin')

# concatenate the data so the vectorizer's vocabulary covers both segments
data = pd.concat([train_df, test_df])

# train a count vectorizer on the tweet text
# Fit on the 'text' column: iterating a DataFrame yields its column names, so
# fitting on `data` itself would build the vocabulary from the headers only.
tf_vec = CountVectorizer()
tf_vec.fit(data['text'])
# NOTE(review): the model trained later ends in a sigmoid unit with binary
# cross-entropy, so 'target' presumably holds 0/1 labels -- confirm for your file.
x_train, y_train = train_df['text'].values.tolist(), train_df['target'].values
x_test, y_test = test_df['text'].values.tolist(), test_df['target'].values

# transform the raw text to count vectors
x_train_trans = tf_vec.transform(x_train)
x_test_trans = tf_vec.transform(x_test)

# BUILD MODEL
# Train your MODEL
# the timestamp suffix keeps the model name distinct from run to run
suffix = int(time.time())
MODEL_NAME = f"Keras Model for Twitter Sentiment Analysis {suffix}"


# build a Keras Network
def _build_model(input_dim):
    """Assemble and compile the feed-forward binary classifier.

    The stack is Dense(64) -> Dense(64) -> Dense(32) -> Dropout(0.5) ->
    Dense(16) -> Dropout(0.5) -> Dense(1, sigmoid). It is compiled with
    binary cross-entropy loss, the Adam optimizer and accuracy as its metric.

    :param input_dim: number of input features, forwarded to the first Dense layer
    :return: the compiled Sequential model
    """
    stack = [
        layers.Dense(64, input_dim=input_dim, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        Dropout(0.5),
        layers.Dense(16, activation='relu'),
        Dropout(0.5),
        layers.Dense(1, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model


# build Keras Model, sized to the column count of the vectorized training matrix
n_features = x_train_trans.shape[1]
model = _build_model(n_features)

# TRACK THE TRAINING EXPERIMENT USING markovml
# auto_record is called before fit() so that this experiment gets captured
recording_notes = f"Auto Recording Keras Model with Name: {MODEL_NAME} with Sentence Encoder"
markov.keras.auto_record(
    name=MODEL_NAME,
    notes=recording_notes,
    project_id=project.project_id,
    model_class=markov.ModelClass.TAGGING,
)

# Train the model (it will take some time to converge!)
model.fit(x_train_trans, y_train, epochs=50, batch_size=32, verbose=False)

# EVALUATE THE MODEL
y_pred = model.predict(x_test_trans)
# keep the raw predictions before they are overwritten with hard labels below
orig_copy = y_pred.tolist()

# threshold in place at 0.5; both masks are taken from the raw predictions
is_positive = y_pred > 0.5
is_negative = y_pred <= 0.5
y_pred[is_positive] = 1
y_pred[is_negative] = 0

# print test accuracy report
acc = accuracy_score(y_test, y_pred)
print("Test accuracy:", acc)

# Register with MarkovML Backend
# Record the results with MarkovML Evaluator
evaluation_notes = (
    "This model evaluation captures the performance of V1 model"
    " against baseline dataset for sentiment analysis"
)
evaluation_recorder = markov.EvaluationRecorder(
    name=f"Sentiment Analysis Keras Model Evaluation {suffix}",
    model_id=model.markov_model_id,
    notes=evaluation_notes,
    dataset_id="3vRT5Ut6mhPqFGc23",
)

evaluation_recorder.register()


def _get_cost(inferred, actual):
    if actual == inferred:
        return 0
    else:
        return random.randint(2, 5)


# Add one inference record per test example.
# urids are numbered from 1 via enumerate; a counter that is bumped before its
# first use would hand out 2 as the first id.
for urid, (prob, pred, orig) in enumerate(zip(orig_copy, y_pred, y_test), start=1):
    # convert once so the record fields and the custom metrics share the same values
    inferred = float(pred[0])
    actual = float(orig)
    score = float(prob[0])

    mi_record = SingleTagInferenceRecord(
        inferred=inferred,
        actual=actual,
        urid=urid,
        score=score,
        custom_metrics=[
            RecordCustomMetric(label="Cost", value=_get_cost(inferred, actual)),
            RecordCustomMetric(label="Probability", value=score),
        ],
    )
    evaluation_recorder.add_record(mi_record)

# close the recording and show what finish() returned
outcome = evaluation_recorder.finish()
print(outcome)

MNIST Data Classifier

In this example, we will create a Keras DNN model to classify the images in the MNIST dataset as numerical digits.
We will take the following steps:

Use an existing project we created for working with MNIST dataset.
Create an experiment to track the training process where we will record the hyper-parameters, loss curve, epoch time, CPU stats etc.
Evaluate the trained model against the test dataset

Pre-Requisites

We recommend running this example in a virtual environment. We also recommend using Anaconda for machine learning.

Please complete the setup of your machine.
Install pandas, scikit-learn, Keras, and TensorFlow:

pip install pandas scikit-learn keras tensorflow

Sample Code

import time

from keras.datasets import mnist
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.utils import to_categorical

import markov
from markov.api.schemas.model_recording import SingleTagInferenceRecord

# Look the project up by name; when it is not found, create and register it.
# Your projects are listed at `app.markovml.com/<workspace_id>/proj`.
# This step is optional: a project id can be passed straight to the
# experimentation and evaluation calls instead.
project_name = "MNIST Project"
try:
    mnist_project = markov.Project.get_by_name(project_name)
except markov.exceptions.ResourceNotFoundException:
    mnist_project = markov.Project(name=project_name)
    mnist_project.register()

# Load the dataset. The keras loader is used here, but any loading method works.
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

IMAGE_INPUT_SHAPE = 784

# Minor data pre-processing:
# flatten every image to IMAGE_INPUT_SHAPE values, cast to float32 and divide
# by 255 so the DNN receives standardized input
x_train = x_train.reshape(-1, IMAGE_INPUT_SHAPE)
x_train = x_train.astype("float32") / 255.0
x_test = x_test.reshape(-1, IMAGE_INPUT_SHAPE)
x_test = x_test.astype("float32") / 255.0

num_classes = 10  # one class per digit the images are classified into

# convert class vectors to binary class matrices
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)


# build the model that will be used to classify the MNIST images
def _build_model_graph(input_shape=(IMAGE_INPUT_SHAPE,)):
    """Build and compile the dense network used to classify the MNIST images.

    Two 512-unit ReLU layers feed a 10-unit softmax output. The model is
    compiled with categorical cross-entropy loss, the RMSprop optimizer and
    accuracy as its metric.

    :param input_shape: shape of one input sample, forwarded to the first
        Dense layer; defaults to ``(IMAGE_INPUT_SHAPE,)``
    :return: the compiled Sequential model
    """
    # The body is indented uniformly with spaces; mixing a tab-indented line
    # with space-indented ones makes Python refuse to parse the function.
    model = Sequential()
    model.add(Dense(512, activation="relu", input_shape=input_shape))
    model.add(Dense(512, activation="relu"))
    model.add(Dense(10, activation="softmax"))
    model.compile(
        loss="categorical_crossentropy", optimizer=RMSprop(), metrics=["accuracy"]
    )

    return model


# the timestamp keeps the model name distinct from run to run
run_stamp = int(time.time())
MODEL_NAME = f"Classification of MNIST Dataset using Keras DNN {run_stamp}"


# auto_record will automatically track the experiment - including its
# hyper-parameters, loss curve, epoch time etc.
experiment_notes = "This experiment is used to track the training process of the Keras DNN used for classification of MNIST."
markov.keras.auto_record(
    name=MODEL_NAME,
    notes=experiment_notes,
    project_id=mnist_project.project_id,  # you can simply paste the project_id here as well
)

model = _build_model_graph()

# The training process is tracked automatically because of the auto_record call above.
model.fit(x_train, y_train, batch_size=128, epochs=5)

# fetch the dataset to be used for evaluation
dataset = markov.dataset.get_by_name(dataset_name="paste_dataset_name_here")

# Now evaluate this model against (x_test, y_test)
evaluation_recorder = markov.EvaluationRecorder(
    name=f"Evaluate {MODEL_NAME}",
    model_id=model.markov_model_id,
    project_id=mnist_project.project_id,
    dataset_id=dataset.ds_id,  # or directly paste the dataset-id from UI
)

evaluation_recorder.register()

y_pred = model.predict(x_test)

# one record per test image; urid is a running id that starts at 1
urid = 1
for pred, actual in zip(y_pred, y_test):
    predicted_digit = pred.argmax().item()  # index of the highest class score
    true_digit = actual.argmax().item()  # index of the 1 in the one-hot label
    confidence = pred.max().item()  # the highest class score itself

    evaluation_record = SingleTagInferenceRecord(
        inferred=predicted_digit,
        actual=true_digit,
        score=confidence,
        urid=urid,
    )
    evaluation_recorder.add_record(evaluation_record)
    urid += 1

outcome = evaluation_recorder.finish()

Make sure to use your own project_name, and paste your own dataset_name in the above code sample.