lia.chapter5.CategorizerTest.java Source code

Java tutorial

Introduction

Here is the source code for lia.chapter5.CategorizerTest.java

Source

package lia.chapter5;

/**
 * Copyright Manning Publications Co.
 * <p>
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;
import lia.common.Utils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

// From chapter 5
public class CategorizerTest extends TestCase {
    Map categoryMap;

    protected void setUp() throws Exception {
        categoryMap = new TreeMap();

        buildCategoryVectors();
        //dumpCategoryVectors();
    }

    public void testCategorization() throws Exception {
        assertEquals("technology/computers/programming/methodology", getCategory("extreme agile methodology"));
        assertEquals("education/pedagogy", getCategory("montessori education philosophy"));
    }

    private void dumpCategoryVectors() {
        Iterator categoryIterator = categoryMap.keySet().iterator();
        while (categoryIterator.hasNext()) {
            String category = (String) categoryIterator.next();
            System.out.println("Category " + category);

            Map vectorMap = (Map) categoryMap.get(category);
            Iterator vectorIterator = vectorMap.keySet().iterator();
            while (vectorIterator.hasNext()) {
                String term = (String) vectorIterator.next();
                System.out.println("    " + term + " = " + vectorMap.get(term));
            }
        }
    }

    private void buildCategoryVectors() throws IOException {
        IndexSearcher searcher = Utils.getBookIndexSearcher();
        IndexReader reader = searcher.getIndexReader();

        int maxDoc = reader.maxDoc();
        System.out.println(maxDoc);
        for (int i = 0; i < maxDoc; i++) {
            Document doc = reader.document(i);
            String category = doc.get("category");
            System.out.println("\n" + doc.get("subject") + "\n");
            Map vectorMap = (Map) categoryMap.get(category);
            if (vectorMap == null) {
                vectorMap = new TreeMap();
                categoryMap.put(category, vectorMap);
            }

            Terms termsVector = reader.getTermVector(i, "subject");

            addTermFreqToMap(vectorMap, termsVector);
        }
    }

    private void addTermFreqToMap(Map vectorMap, Terms termsVector) throws IOException {
        TermsEnum termsEnum = termsVector.iterator();
        BytesRef bytesRef = termsEnum.next();
        while (bytesRef != null) {
            String term = bytesRef.utf8ToString();
            System.out.println(term + " " + termsEnum.totalTermFreq());
            if (vectorMap.containsKey(term)) {
                Long value = (Long) vectorMap.get(term);
                vectorMap.put(term, new Long(value.intValue() + termsEnum.totalTermFreq()));
            } else {
                vectorMap.put(term, new Long(termsEnum.totalTermFreq()));
            }
            bytesRef = termsEnum.next();
        }
        System.out.println();
    }

    private String getCategory(String subject) {
        String[] words = subject.split(" ");

        Iterator categoryIterator = categoryMap.keySet().iterator();
        double bestAngle = Double.MAX_VALUE;
        String bestCategory = null;

        while (categoryIterator.hasNext()) {
            String category = (String) categoryIterator.next();
            //      System.out.println(category);

            double angle = computeAngle(words, category);
            //      System.out.println(" -> angle = " + angle + " (" + Math.toDegrees(angle) + ")");
            if (angle < bestAngle) {
                bestAngle = angle;
                bestCategory = category;
            }
        }

        return bestCategory;
    }

    private double computeAngle(String[] words, String category) {
        Map vectorMap = (Map) categoryMap.get(category);

        int dotProduct = 0;
        int sumOfSquares = 0;
        for (String word : words) {
            long categoryWordFreq = 0;

            if (vectorMap.containsKey(word)) {
                categoryWordFreq = ((Long) vectorMap.get(word));
            }

            dotProduct += categoryWordFreq; //#1
            sumOfSquares += categoryWordFreq * categoryWordFreq;
        }

        double denominator;
        if (sumOfSquares == words.length) {
            denominator = sumOfSquares; // #2
        } else {
            denominator = Math.sqrt(sumOfSquares) * Math.sqrt(words.length);
        }

        double ratio = dotProduct / denominator;

        return Math.acos(ratio);
    }
    /*
      #1 Assume each word has frequency 1
      #2 Shortcut to prevent precision issue
    */
}