Wednesday, February 13, 2013

Machine Learning: Natural Language Processing Using NLTK - Part 2

In the previous post, I explored the use of the nltk Python module and a Naive Bayes classifier to classify (that is, predict the author of) documents. The results were quite good, with 11 out of 12 documents correctly classified. The one misclassified document was from George Bush Jr. and was labeled as being from Harry Truman. Naive Bayes arrives at its result through conditional probability, using the probabilities with which the different n-grams occur in each President's set of significant n-grams. 
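
For reference, the earlier Naive Bayes step boils down to something like the following minimal sketch. The toy featuresets and labels here are purely illustrative, not the actual data from the previous post:

from nltk.classify import NaiveBayesClassifier

# Toy labeled featuresets: keys are n-grams, values are True
train = [
    ({('economic', 'report'): True, ('full', 'employment'): True}, 'Truman'),
    ({('tax', 'relief'): True, ('economic', 'growth'): True}, 'BushJr'),
]
classifier = NaiveBayesClassifier.train(train)
print classifier.classify({('full', 'employment'): True})   # expected: 'Truman'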

Based on casual observation, I had noticed that the documents from Harry Truman were exceptionally long, so their n-gram feature sets were quite large and overlapped significantly. George Bush Jr.'s documents, on the other hand, were not as long and showed a wider variety of n-grams across documents. So I thought of taking a different approach: using the Jaccard similarity of the n-gram sets (see Mining of Massive Datasets). 

And that turned out to be the better approach, giving 12 out of 12 correct classifications. The program is similar to the previous one, with a very small change. 
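
The Jaccard similarity of two sets is simply the size of their intersection divided by the size of their union. A minimal sketch with toy n-gram sets (illustrative values only):

# Jaccard similarity = |A intersect B| / |A union B|
a = set([('economic', 'report'), ('full', 'employment'), ('price', 'controls')])
b = set([('economic', 'report'), ('full', 'employment'), ('tax', 'relief')])
print len(a & b) * 1.0 / len(a | b)   # 2 / 4 = 0.5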

Python Code

#-----------------------------------------------
# Import pre-amble
#-----------------------------------------------
import sys
import os
import re
import pickle
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
#from nltk.classify import NaiveBayesClassifier
#-----------------------------------------------

#-----------------------------------------------
def print_script_usage():
    ''' Prints usage of this script.

    Script requires two parameters.
    First parameter (referred to as training_data_dir)
    is a top-level directory name that contains 
    sub-directories of documents for "training/learning".
    The name of each sub-directory is considered a
    "label" (classification) and all documents 
    under that sub-directory are classified 
    as having that label (or classification).

    The second parameter is a directory that
    contains test documents that need to be
    classified. 
    '''
    print "\n\nUsage: " + script_name + " <input_train_data_dir> <test_data_dir>\n\n"
#-----------------------------------------------

#-----------------------------------------------
def validate_dir(dir_name):
    ''' Checks if input dir is a valid, existing directory.'''
    if not os.path.isdir(dir_name):
        print dir_name + " is not a directory."
        print_script_usage()
        sys.exit(1)
#-----------------------------------------------

#-----------------------------------------------
def get_featureset(file, single_words=False, bigrams=True, trigrams=True):
    '''Function to extract featureset from a file.

    This is the main function. It takes a file
    as input parameter and generates a featureset for
    the file. 
    The featureset consists of a set (dictionary) of
    key/value pairs where the key is a single word, 
    bigram or trigram from the text in the file and 
    the value is the boolean value "True".
    '''
    #-----------------------------------------------
    # Word tokenization of input file
    #-----------------------------------------------
    try:
        f = open(file)
        all_text = f.readlines()
        f.close()
    except IOError:
        print "Error in opening file " + file + "."
        print sys.exc_info()
        sys.exit(1)
    all_text = " ".join(all_text).lower()
    wp_tokenizer = WordPunctTokenizer()
    tokens = wp_tokenizer.tokenize(all_text)
    english_stopwords = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in english_stopwords and len(token) > 4]
    total_word_tokens = len(filtered_tokens)
    n_gram_limit = int(total_word_tokens*0.1)
    #-----------------------------------------------
    # Bigrams from word tokens
    #-----------------------------------------------
    if bigrams:
        bigram_finder = BigramCollocationFinder.from_words(filtered_tokens, 5)
        bigrams_chi_sq = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n_gram_limit)
        bigrams_raw_freq = bigram_finder.nbest(BigramAssocMeasures.raw_freq, n_gram_limit)
        bigrams_likelihood_ratio = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, n_gram_limit)
        bigrams_poisson_stirling = bigram_finder.nbest(BigramAssocMeasures.poisson_stirling, n_gram_limit)
    else:
        bigrams_chi_sq = []
        bigrams_raw_freq = []
        bigrams_likelihood_ratio = []
        bigrams_poisson_stirling = []
    #-----------------------------------------------
    # Trigrams from word tokens
    #-----------------------------------------------
    if trigrams:
        trigram_finder = TrigramCollocationFinder.from_words(filtered_tokens)
        trigrams_chi_sq = trigram_finder.nbest(TrigramAssocMeasures.chi_sq, n_gram_limit)
        trigrams_raw_freq = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, n_gram_limit)
        trigrams_likelihood_ratio = trigram_finder.nbest(TrigramAssocMeasures.likelihood_ratio, n_gram_limit)
        trigrams_poisson_stirling = trigram_finder.nbest(TrigramAssocMeasures.poisson_stirling, n_gram_limit)
    else:
        trigrams_chi_sq = []
        trigrams_raw_freq = []
        trigrams_likelihood_ratio = []
        trigrams_poisson_stirling = []
    #-----------------------------------------------
    # Consolidated list of words, bigrams and trigrams
    #-----------------------------------------------
    if not single_words:
        filtered_tokens = []
    
    all_tokens = list(set(filtered_tokens +
                          bigrams_chi_sq + bigrams_raw_freq +
                          bigrams_likelihood_ratio + bigrams_poisson_stirling +
                          trigrams_chi_sq + trigrams_raw_freq +
                          trigrams_likelihood_ratio + trigrams_poisson_stirling))
    #-----------------------------------------------
    # Featureset for the input file
    # The featureset consists of a key-value pair
    # where key is the word, bigram or trigram and 
    # the value is True. 
    #-----------------------------------------------
    return dict([(token, True) for token in all_tokens if token != tuple()])
#-----------------------------------------------
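#-----------------------------------------------
# Example (hypothetical file name): get_featureset("truman_1947.txt")
# would return a dict such as
#   {('economic', 'report'): True, ('full', 'employment', 'act'): True, ...}
# i.e. the selected bigrams/trigrams as keys, each mapped to True.
# This shape works for the nltk classifiers as well as for the
# set-based Jaccard computation further below.
#-----------------------------------------------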


#-----------------------------------------------
script_name = os.path.basename(sys.argv[0]) or 'this_script'

# Example directories used during development (now taken from the command line):
# training_data_dir = 'C:\\Users\\jthakrar\\Jayesh\\Reference\\Machine-Learning\\Presidential-Speeches\\economic_reports\\Train'
# test_data_dir = 'C:\\Users\\jthakrar\\Jayesh\\Reference\\Machine-Learning\\Presidential-Speeches\\economic_reports\\Test'
if len(sys.argv) < 3:
    print_script_usage()
    sys.exit(1)
training_data_dir, test_data_dir = sys.argv[1], sys.argv[2]
validate_dir(training_data_dir)
validate_dir(test_data_dir)
#-----------------------------------------------


#-----------------------------------------------
print "\n\n\n\n**********************************************************************"
print "\t\t\tT R A I N I N G"
print "**********************************************************************"
print "\n\nProcessing training data from:", training_data_dir

labels = [dir_name for dir_name in os.listdir(training_data_dir) if os.path.isdir(os.path.join(training_data_dir, dir_name))]

training_featureset = []

for label in labels:
    dir = os.path.join(training_data_dir, label)
    if not os.path.isdir(dir):
        print "\tUnexpected error in resolving directory - " + dir
        sys.exit(1)
    print "\nProcessing sub-directory: " + label 
    for file in os.listdir(dir):
        file_path = os.path.join(dir, file)
        if not os.path.isfile(file_path):
            print "\tfile " + file + "(" + file_path + ") does not seem to be a file"
            continue
        featureset = get_featureset(file_path)
        print "\tCompleted file: " + file
        #print "file = " + file + "  label = " + label + " featureset length = " + str(len(featureset))
        labeled_featureset = (featureset, label)
        training_featureset.append(labeled_featureset)

#-----------------------
# This is where we deviate from the NLTK approach. Instead of using the Naive Bayes algorithm,
# we combine the n-grams for the same label across different documents to get a single n-gram set per label.
# This becomes the "training_data" set. We then apply the Jaccard similarity test to each test document
# to predict its label.
#-----------------------
training_data = {}
for featureset, label in training_featureset:
    label_set = set(featureset)
    #-----------------------
    # Add each training document's tokens to the training data dictionary.
    # Note that we need to "append" the new training document's tokens to the label's existing tokenset
    #-----------------------
    if label in training_data:
        training_data[label] = training_data[label].union(label_set)
    else:
        training_data[label] = label_set
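
#-----------------------
# At this point training_data maps each label to one combined n-gram set,
# e.g. (labels shown are hypothetical):
#   {'Truman': set([('economic', 'report'), ...]), 'BushJr': set([...])}
#-----------------------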

#-----------------------------------------------

#-----------------------------------------------
print "\n\n\n\n**********************************************************************"
print "\t\t\tT E S T I N G"
print "**********************************************************************"
print "\nProcessing test data from:", test_data_dir
for file in os.listdir(test_data_dir):
    infile = os.path.join(test_data_dir, file)
    featureset = get_featureset(infile)
    label_set = set(featureset)
    print "\n\nProcessing test file", file
    max_value = 0.0
    max_value_label = ''
    file_score = {}
    #-----------------------
    # For each test file, we measure the overlap of n-grams between the test document
    # and each labeled training set using the Jaccard similarity.
    # This gives a file_score dictionary where the key is the label and the value is the
    # Jaccard similarity. We then select the label with the maximum value as the
    # prediction for the test file.
    # Note that without the "1.0" multiplier below, file_score[key] would always be zero,
    # since all the other values are integers and Python 2 integer division truncates.
    #-----------------------
    for key in training_data:
        file_score[key] = len(training_data[key].intersection(label_set))*1.0/(len(training_data[key].union(label_set)))
        if file_score[key] > max_value:
            max_value_label = key
            max_value = file_score[key]  
    print "\tLabel for", file, "is", max_value_label, "with score of", max_value

print "\n\n**********************************************************************"
#-----------------------------------------------
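

To run the script, pass the training and test directories on the command line, for example (the script and directory names below are just placeholders):

python classify_presidents.py economic_reports/Train economic_reports/Test

Each test file is then printed along with its predicted label and the corresponding Jaccard score.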





