Wednesday, February 13, 2013

Machine Learning: Natural Language Processing Using NLTK - Part 2

In the previous post, I explored the use of the nltk Python module and a Naive Bayes classifier to classify (that is, predict the author of) documents. The results were quite good, with 11 out of 12 documents correctly classified. The one misclassified document was from George Bush Jr. and was labeled as being from Harry Truman. Naive Bayes arrives at its result through conditional probability, using the probabilities with which the different n-grams occur in each President's set of significant n-grams. 
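
For reference, the earlier Naive Bayes step boils down to something like the following minimal sketch. The toy featuresets and labels here are purely illustrative, not the actual data from the previous post:

from nltk.classify import NaiveBayesClassifier

# Toy labeled featuresets: keys are n-grams, values are True
train = [
    ({('economic', 'report'): True, ('full', 'employment'): True}, 'Truman'),
    ({('tax', 'relief'): True, ('economic', 'growth'): True}, 'BushJr'),
]
classifier = NaiveBayesClassifier.train(train)
print classifier.classify({('full', 'employment'): True})   # expected: 'Truman'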

Based on casual observation, I had noticed that the documents from Harry Truman were exceptionally long, so their n-gram feature sets were quite large and overlapped significantly. George Bush Jr.'s documents, on the other hand, were not as long and showed a wider variety of n-grams across documents. So I thought of taking a different approach: using the Jaccard similarity of the n-gram sets (see Mining of Massive Datasets). 

And that turned out to be the better approach, giving 12 out of 12 correct classifications. The program is similar to the previous one, with a very small change. 
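
The Jaccard similarity of two sets is simply the size of their intersection divided by the size of their union. A minimal sketch with toy n-gram sets (illustrative values only):

# Jaccard similarity = |A intersect B| / |A union B|
a = set([('economic', 'report'), ('full', 'employment'), ('price', 'controls')])
b = set([('economic', 'report'), ('full', 'employment'), ('tax', 'relief')])
print len(a & b) * 1.0 / len(a | b)   # 2 / 4 = 0.5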

Python Code

#-----------------------------------------------
# Import pre-amble
#-----------------------------------------------
import sys
import os
import re
import pickle
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
#from nltk.classify import NaiveBayesClassifier
#-----------------------------------------------

#-----------------------------------------------
def print_script_usage():
    ''' Prints usage of this script.

    Script requires two parameters.
    First parameter (referred to as training_data_dir)
    is a top-level directory name that contains 
    sub-directories of documents for "training/learning".
    The name of each sub-directory is considered a
    "label" (classification) and all documents 
    under that sub-directory are classified 
    as having that label (or classification).

    The second parameter is a directory that
    contains test documents that need to be
    classified. 
    '''
    print "\n\nUsage: " + script_name + " <input_train_data_dir> <test_data_dir>\n\n"
#-----------------------------------------------

#-----------------------------------------------
def validate_dir(dir_name):
    ''' Checks if input dir is a valid, existing directory.'''
    if not os.path.isdir(dir_name):
        print dir_name + " is not a directory."
        print_script_usage()
        sys.exit(1)
#-----------------------------------------------

#-----------------------------------------------
def get_featureset(file, single_words=False, bigrams=True, trigrams=True):
    '''Function to extract featureset from a file.

    This is the main function. It takes a file
    as input parameter and generates a featureset for
    the file. 
    The featureset consists of a set (dictionary) of
    key/value pairs where the key is a single word, 
    bigram or trigram from the text in the file and 
    the value is the boolean value "True".
    '''
    #-----------------------------------------------
    # Word tokenization of input file
    #-----------------------------------------------
    try:
        f = open(file)
        all_text = f.readlines()
        f.close()
    except IOError:
        print "Error in opening file " + file + "."
        print sys.exc_info()
        sys.exit(1)
    all_text = " ".join(all_text).lower()
    wp_tokenizer = WordPunctTokenizer()
    tokens = wp_tokenizer.tokenize(all_text)
    english_stopwords = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in english_stopwords and len(token) > 4]
    total_word_tokens = len(filtered_tokens)
    n_gram_limit = int(total_word_tokens*0.1)
    #-----------------------------------------------
    # Bigrams from word tokens
    #-----------------------------------------------
    if bigrams:
        bigram_finder = BigramCollocationFinder.from_words(filtered_tokens, 5)
        bigrams_chi_sq = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n_gram_limit)
        bigrams_raw_freq = bigram_finder.nbest(BigramAssocMeasures.raw_freq, n_gram_limit)
        bigrams_likelihood_ratio = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, n_gram_limit)
        bigrams_poisson_stirling = bigram_finder.nbest(BigramAssocMeasures.poisson_stirling, n_gram_limit)
    else:
        bigrams_chi_sq = []
        bigrams_raw_freq = []
        bigrams_likelihood_ratio = []
        bigrams_poisson_stirling = []
    #-----------------------------------------------
    # Trigrams from word tokens
    #-----------------------------------------------
    if trigrams:
        trigram_finder = TrigramCollocationFinder.from_words(filtered_tokens)
        trigrams_chi_sq = trigram_finder.nbest(TrigramAssocMeasures.chi_sq, n_gram_limit)
        trigrams_raw_freq = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, n_gram_limit)
        trigrams_likelihood_ratio = trigram_finder.nbest(TrigramAssocMeasures.likelihood_ratio, n_gram_limit)
        trigrams_poisson_stirling = trigram_finder.nbest(TrigramAssocMeasures.poisson_stirling, n_gram_limit)
    else:
        trigrams_chi_sq = []
        trigrams_raw_freq = []
        trigrams_likelihood_ratio = []
        trigrams_poisson_stirling = []
    #-----------------------------------------------
    # Consolidated list of words, bigrams and trigrams
    #-----------------------------------------------
    if not single_words:
        filtered_tokens = []
    
    all_tokens = list(set(filtered_tokens +
                          bigrams_chi_sq + bigrams_raw_freq +
                          bigrams_likelihood_ratio + bigrams_poisson_stirling +
                          trigrams_chi_sq + trigrams_raw_freq +
                          trigrams_likelihood_ratio + trigrams_poisson_stirling))
    #-----------------------------------------------
    # Featureset for the input file
    # The featureset consists of a key-value pair
    # where key is the word, bigram or trigram and 
    # the value is True. 
    #-----------------------------------------------
    return dict([(token, True) for token in all_tokens if token != tuple()])
#-----------------------------------------------
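#-----------------------------------------------
# Example (hypothetical file name): get_featureset("truman_1947.txt")
# would return a dict such as
#   {('economic', 'report'): True, ('full', 'employment', 'act'): True, ...}
# i.e. the selected bigrams/trigrams as keys, each mapped to True.
# This shape works for the nltk classifiers as well as for the
# set-based Jaccard computation further below.
#-----------------------------------------------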


#-----------------------------------------------
script_name = os.path.basename(sys.argv[0]) or 'this_script'

# Example directories used during development (now taken from the command line):
# training_data_dir = 'C:\\Users\\jthakrar\\Jayesh\\Reference\\Machine-Learning\\Presidential-Speeches\\economic_reports\\Train'
# test_data_dir = 'C:\\Users\\jthakrar\\Jayesh\\Reference\\Machine-Learning\\Presidential-Speeches\\economic_reports\\Test'
if len(sys.argv) < 3:
    print_script_usage()
    sys.exit(1)
training_data_dir, test_data_dir = sys.argv[1], sys.argv[2]
validate_dir(training_data_dir)
validate_dir(test_data_dir)
#-----------------------------------------------


#-----------------------------------------------
print "\n\n\n\n**********************************************************************"
print "\t\t\tT R A I N I N G"
print "**********************************************************************"
print "\n\nProcessing training data from:", training_data_dir

labels = [dir_name for dir_name in os.listdir(training_data_dir) if os.path.isdir(os.path.join(training_data_dir, dir_name))]

training_featureset = []

for label in labels:
    dir = os.path.join(training_data_dir, label)
    if not os.path.isdir(dir):
        print "\tUnexpected error in resolving directory - " + dir
        sys.exit(1)
    print "\nProcessing sub-directory: " + label 
    for file in os.listdir(dir):
        file_path = os.path.join(dir, file)
        if not os.path.isfile(file_path):
            print "\tfile " + file + "(" + file_path + ") does not seem to be a file"
            continue
        featureset = get_featureset(file_path)
        print "\tCompleted file: " + file
        #print "file = " + file + "  label = " + label + " featureset length = " + str(len(featureset))
        labeled_featureset = (featureset, label)
        training_featureset.append(labeled_featureset)

#-----------------------
# This is where we deviate from the NLTK approach. Instead of using the Naive Bayes algorithm,
# we combine the n-grams for the same label across different documents to get a single n-gram set per label.
# This becomes the "training_data" set. We then apply the Jaccard similarity test to each test document
# to predict its label.
#-----------------------
training_data = {}
for featureset, label in training_featureset:
    label_set = set(featureset)
    #-----------------------
    # Add each training document's tokens to the training data dictionary.
    # Note that we need to "append" the new training document's tokens to the label's existing tokenset
    #-----------------------
    if label in training_data:
        training_data[label] = training_data[label].union(label_set)
    else:
        training_data[label] = label_set
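
#-----------------------
# At this point training_data maps each label to one combined n-gram set,
# e.g. (labels shown are hypothetical):
#   {'Truman': set([('economic', 'report'), ...]), 'BushJr': set([...])}
#-----------------------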

#-----------------------------------------------

#-----------------------------------------------
print "\n\n\n\n**********************************************************************"
print "\t\t\tT E S T I N G"
print "**********************************************************************"
print "\nProcessing test data from:", test_data_dir
for file in os.listdir(test_data_dir):
    infile = os.path.join(test_data_dir, file)
    featureset = get_featureset(infile)
    label_set = set(featureset)
    print "\n\nProcessing test file", file
    max_value = 0.0
    max_value_label = ''
    file_score = {}
    #-----------------------
    # For each test file, we measure the overlap of n-grams between the test document
    # and each labeled training set using the Jaccard similarity.
    # This gives a file_score dictionary where the key is the label and the value is the
    # Jaccard similarity. We then select the label with the maximum value as the
    # prediction for the test file.
    # Note that without the "1.0" multiplier below, file_score[key] would always be zero,
    # since all the other values are integers and Python 2 integer division truncates.
    #-----------------------
    for key in training_data:
        file_score[key] = len(training_data[key].intersection(label_set))*1.0/(len(training_data[key].union(label_set)))
        if file_score[key] > max_value:
            max_value_label = key
            max_value = file_score[key]  
    print "\tLabel for", file, "is", max_value_label, "with score of", max_value

print "\n\n**********************************************************************"
#-----------------------------------------------
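

To run the script, pass the training and test directories on the command line, for example (the script and directory names below are just placeholders):

python classify_presidents.py economic_reports/Train economic_reports/Test

Each test file is then printed along with its predicted label and the corresponding Jaccard score.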





