Multi-class Text Classification using TensorFlow
3 min read · Sep 25, 2023
In this blog we will see, step by step, how to handle a large, imbalanced dataset and build a multi-class text classifier using TensorFlow.
Steps to Follow:
- Import the data, ensuring it’s accurate and reliable.
- Treat the imbalanced data using class weight scores.
- Use TensorFlow with a softmax activation for multi-class classification.
- Check accuracy and other metrics.
Import Libraries & Datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub

# Load the dataset with pandas
df = pd.read_csv('/content/drive/MyDrive/DataAnalysisProjects/topic_classification_data.csv')
df.label.value_counts()  # inspect the class distribution
Compute Class Weights to Treat the Imbalanced Dataset
from sklearn.utils.class_weight import compute_class_weight

# Weight order must match the integer label encoding defined below
# (Politics=0, ..., Science=5); sorting the weights would break that alignment.
class_labels = ['Politics', 'Health', 'Emotion', 'Financial', 'Sport', 'Science']
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.array(class_labels),
                                     y=df['label'])
weights = dict(enumerate(class_weights))  # Keras expects a {class_index: weight} dict
print(weights)
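For reference, the 'balanced' setting gives each class the weight n_samples / (n_classes * n_c), so the rarer a class, the larger its weight. A quick check with hypothetical counts:
# Hypothetical: 1200 rows, 6 classes, a minority class with 100 rows
print(1200 / (6 * 100))   # 2.0 -> the minority class is weighted 2x
# A majority class with 400 rows would get 1200 / (6 * 400) = 0.5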
Train Test Split 80:20
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df, test_size=0.2, random_state=111)
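The later steps reference dataset_train and dataset_test, so the split dataframes need to be wrapped as tf.data datasets. A minimal sketch, assuming the raw text lives in a column named text (adjust to your schema):
# Stream (text, label) string pairs; batching is applied later
dataset_train = tf.data.Dataset.from_tensor_slices(
    (X_train['text'].values, X_train['label'].values))
dataset_test = tf.data.Dataset.from_tensor_slices(
    (X_test['text'].values, X_test['label'].values))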
Convert Target Labels into a Numerical Representation using tf.lookup.StaticHashTable
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(['Politics', 'Health', 'Emotion', 'Financial', 'Sport', 'Science']),
        values=tf.constant([0, 1, 2, 3, 4, 5]),
    ),
    default_value=tf.constant(-1),  # unknown labels map to -1
    name="target_encoding"
)
@tf.function
def target(x):
    return table.lookup(x)
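A quick sanity check that the lookup follows the key order above:
print(target(tf.constant(['Sport', 'Health'])).numpy())  # [4 1]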
def show_batch(dataset, size=5):
    for batch, label in dataset.take(size):
        print(batch.numpy())
        print(target(label).numpy())

show_batch(dataset_test, 6)
One-Hot Encoding for Labels
def fetch(text, labels):
    # Map string labels to integer ids, then to length-6 one-hot vectors
    return text, tf.one_hot(target(labels), 6)

train_data_f = dataset_train.map(fetch)
test_data_f = dataset_test.map(fetch)

# Peek at a batch of five examples
train_data, train_labels = next(iter(train_data_f.batch(5)))
train_data, train_labels
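As an example of what fetch produces, the label 'Health' maps to id 1, which expands to the one-hot vector [0, 1, 0, 0, 0, 0]:
print(tf.one_hot(target(tf.constant('Health')), 6).numpy())
# [0. 1. 0. 0. 0. 0.]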
Model Creation using TensorFlow
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
# Token-based pre-trained embedding layer from Google, served via TF Hub.
# It tokenizes raw strings internally, so no separate tokenizer is needed.
hub_layer = hub.KerasLayer(embedding, output_shape=[128], input_shape=[],
                           dtype=tf.string, trainable=True)
hub_layer(train_data[:1])
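Each sentence comes out as a single 128-dimensional vector, so the layer maps a batch of raw strings straight to embeddings:
print(hub_layer(train_data[:1]).shape)  # (1, 128)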
model = tf.keras.Sequential()
model.add(hub_layer)  # the pre-trained embedding layer from above

# Four hidden layers, each followed by 30% dropout
for units in [128, 128, 64, 32]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.3))

# Softmax output: one probability for each of the 6 classes
model.add(tf.keras.layers.Dense(6, activation='softmax'))
model.summary()
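One step the code needs before fit is model.compile. A minimal sketch: Adam is an assumed optimizer choice here, while categorical cross-entropy is the loss that matches the one-hot labels:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])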
history = model.fit(train_data_f.shuffle(1000).batch(32),  # buffer/batch sizes are arbitrary choices
                    epochs=4,
                    validation_data=test_data_f.batch(32),
                    verbose=1,
                    class_weight=weights)
# class_weight=weights scales each example's loss by its class's weight,
# i.e. TensorFlow effectively optimizes a class-weighted cross-entropy,
# so minority classes count as much as majority ones during training.
from sklearn.metrics import classification_report, confusion_matrix

# Collect the full test set in one batch, then predict class probabilities
test_data, test_labels = next(iter(test_data_f.batch(len(X_test))))
y_pred = model.predict(test_data)
print(classification_report(test_labels.numpy().argmax(axis=1), y_pred.argmax(axis=1)))
print(confusion_matrix(test_labels.numpy().argmax(axis=1), y_pred.argmax(axis=1)))
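Since seaborn is already imported, the confusion matrix reads more easily as a heatmap. A small sketch, with tick labels in the same order as the integer encoding:
cm = confusion_matrix(test_labels.numpy().argmax(axis=1), y_pred.argmax(axis=1))
class_names = ['Politics', 'Health', 'Emotion', 'Financial', 'Sport', 'Science']
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()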