How to build a text classifier using NLTK?

Published on Aug. 22, 2023, 12:19 p.m.

To build a text classifier using NLTK in Python, you can follow these steps:

  1. Install the NLTK library if it’s not already installed on your system.
pip install nltk
  2. Import the necessary libraries and download the relevant corpora and datasets.
import nltk
# Download each NLTK resource used by this tutorial, by name and in order.
for resource in (
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
    'movie_reviews',
):
    nltk.download(resource)
  3. Prepare your data by creating labeled data and splitting it into training and testing sets.
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
# Pair each review's token list with its category label, one pair per file.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
stop_words = set(stopwords.words('english'))
# The list above is built category by category; shuffle so the later
# train/test slices contain a mix of labels.
random.shuffle(documents)

# Frequency count of every lower-cased corpus token that is not a stop word.
lowered_tokens = (w.lower() for w in movie_reviews.words())
all_words = nltk.FreqDist(w for w in lowered_tokens if w not in stop_words)

# Keep the 3000 most frequent tokens as the feature vocabulary.
# Slicing ``all_words.keys()`` would take the first 3000 keys in insertion
# order rather than the most frequent ones, so ask for them explicitly.
word_features = [word for word, _count in all_words.most_common(3000)]
def find_features(document, vocabulary=None):
    """Map each vocabulary word to whether it occurs in *document*.

    Args:
        document: Iterable of tokens making up one document.
        vocabulary: Iterable of words to use as feature names. When omitted,
            the module-level ``word_features`` list is used.

    Returns:
        A dict with one ``word -> bool`` entry per vocabulary word; the value
        is True when the word appears in ``document``.
    """
    if vocabulary is None:
        vocabulary = word_features
    # Build the set once so every membership test below is a hash lookup.
    words = set(document)
    return {w: (w in words) for w in vocabulary}
# Convert each (tokens, label) pair into a (feature dict, label) pair.
featuresets = [(find_features(tokens), label) for tokens, label in documents]
# The first 1900 examples train the model; the remainder is held out for testing.
training_set, testing_set = featuresets[:1900], featuresets[1900:]
  4. Train your text classifier using the selected algorithm from NLTK’s classification module.

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    """Ensemble classifier that returns the most common label among its members."""

    def __init__(self, *classifiers):
        """Store the classifiers that take part in the vote.

        Args:
            *classifiers: Objects exposing a ``classify(features)`` method.
        """
        self._classifiers = classifiers

    def classify(self, features):
        """Return the label predicted by the largest number of classifiers.

        Args:
            features: Feature set passed unchanged to every wrapped classifier.

        Returns:
            The most common label among the individual predictions.

        Raises:
            statistics.StatisticsError: If no classifiers were supplied.
        """
        votes = [c.classify(features) for c in self._classifiers]
        # NOTE: on a tie, ``statistics.mode`` returns the first label it saw
        # (Python 3.8+); older versions raise StatisticsError. Use an odd
        # number of classifiers to avoid ties.
        return mode(votes)

# Fit each scikit-learn model through NLTK's wrapper; ``train`` returns the
# wrapper itself, so construction and fitting can be chained.
MNB_classifier = SklearnClassifier(MultinomialNB()).train(training_set)
SVM_classifier = SklearnClassifier(SVC()).train(training_set)

# Combine both fitted models into a single voting ensemble.
voted_classifier = VoteClassifier(MNB_classifier, SVM_classifier)



5. Test your classifier and get