com.mathworks.xzheng.advsearching.CategorizerTest.java Source code

Java tutorial

Introduction

Here is the source code for com.mathworks.xzheng.advsearching.CategorizerTest.java

Source

package com.mathworks.xzheng.advsearching;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import junit.framework.TestCase;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

import com.mathworks.xzheng.common.TestUtil;

// From chapter 5
public class CategorizerTest extends TestCase {
    Map categoryMap;

    protected void setUp() throws Exception {
        categoryMap = new TreeMap();

        buildCategoryVectors();
        //    dumpCategoryVectors();
    }

    public void testCategorization() throws Exception {
        assertEquals("/technology/computers/programming/methodology", getCategory("extreme agile methodology"));
        assertEquals("/education/pedagogy", getCategory("montessori education philosophy"));
    }

    private void dumpCategoryVectors() {
        Iterator categoryIterator = categoryMap.keySet().iterator();
        while (categoryIterator.hasNext()) {
            String category = (String) categoryIterator.next();
            System.out.println("Category " + category);

            Map vectorMap = (Map) categoryMap.get(category);
            Iterator vectorIterator = vectorMap.keySet().iterator();
            while (vectorIterator.hasNext()) {
                String term = (String) vectorIterator.next();
                System.out.println("    " + term + " = " + vectorMap.get(term));
            }
        }
    }

    private void buildCategoryVectors() throws IOException {
        IndexReader reader = IndexReader.open(TestUtil.getBookIndexDirectory());

        int maxDoc = reader.maxDoc();

        for (int i = 0; i < maxDoc; i++) {
            if (!reader.document(i)) {
                Document doc = reader.document(i);
                String category = doc.get("category");

                Map vectorMap = (Map) categoryMap.get(category);
                if (vectorMap == null) {
                    vectorMap = new TreeMap();
                    categoryMap.put(category, vectorMap);
                }

                Terms terms = reader.getTermVector(i, "subject");

                addTermFreqToMap(vectorMap, terms);
            }
        }
    }

    private void addTermFreqToMap(Map vectorMap, Terms termss) {

        TermsEnum termsEnum = termss.iterator(null);
        String[] terms = termsEnum.getterms();
        int[] freqs = termss.getTermFrequencies();

        for (int i = 0; i < terms.length; i++) {
            String term = terms[i];

            if (vectorMap.containsKey(term)) {
                Integer value = (Integer) vectorMap.get(term);
                vectorMap.put(term, new Integer(value.intValue() + freqs[i]));
            } else {
                vectorMap.put(term, new Integer(freqs[i]));
            }
        }
    }

    private String getCategory(String subject) {
        String[] words = subject.split(" ");

        Iterator categoryIterator = categoryMap.keySet().iterator();
        double bestAngle = Double.MAX_VALUE;
        String bestCategory = null;

        while (categoryIterator.hasNext()) {
            String category = (String) categoryIterator.next();
            //      System.out.println(category);

            double angle = computeAngle(words, category);
            //      System.out.println(" -> angle = " + angle + " (" + Math.toDegrees(angle) + ")");
            if (angle < bestAngle) {
                bestAngle = angle;
                bestCategory = category;
            }
        }

        return bestCategory;
    }

    private double computeAngle(String[] words, String category) {
        Map vectorMap = (Map) categoryMap.get(category);

        int dotProduct = 0;
        int sumOfSquares = 0;
        for (String word : words) {
            int categoryWordFreq = 0;

            if (vectorMap.containsKey(word)) {
                categoryWordFreq = ((Integer) vectorMap.get(word)).intValue();
            }

            dotProduct += categoryWordFreq; //#1
            sumOfSquares += categoryWordFreq * categoryWordFreq;
        }

        double denominator;
        if (sumOfSquares == words.length) {
            denominator = sumOfSquares; // #2
        } else {
            denominator = Math.sqrt(sumOfSquares) * Math.sqrt(words.length);
        }

        double ratio = dotProduct / denominator;

        return Math.acos(ratio);
    }
    /*
      #1 Assume each word has frequency 1
      #2 Shortcut to prevent precision issue
    */
}