edu.virginia.cs.index.AnswerIndexer.java Source code

Introduction

Here is the source code for edu.virginia.cs.index.AnswerIndexer.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package edu.virginia.cs.index;

import edu.virginia.cs.utility.SpecialAnalyzer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index from a tab-separated file of post records, keeping
 * only the rows whose type column (index 1) equals {@code 2}.
 *
 * @author Wasi
 */
public class AnswerIndexer {

    private static final Logger LOGGER = Logger.getLogger(AnswerIndexer.class.getName());

    /**
     * Value of column 1 that selects a row for indexing. Presumably the
     * post-type code for answers, given the class name -- confirm against the
     * format of the input dump.
     */
    private static final int ANSWER_TYPE_CODE = 2;

    /** Minimum number of columns an indexed row needs (columns 0-14 are read). */
    private static final int REQUIRED_COLUMNS = 15;

    /** A progress line is printed each time this many documents have been added. */
    private static final int PROGRESS_INTERVAL = 100;

    /**
     * Opens an {@link IndexWriter} on the given directory.
     *
     * The writer uses {@link SpecialAnalyzer}, is opened in
     * {@code OpenMode.CREATE} mode and has its RAM buffer size set to 2048 MB.
     *
     * @param indexPath file-system path of the index directory
     * @return a writer the caller is responsible for closing
     * @throws IOException if the directory or the writer cannot be opened
     */
    private static IndexWriter setupIndex(String indexPath) throws IOException {
        Analyzer analyzer = new SpecialAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        config.setRAMBufferSizeMB(2048.0);

        FSDirectory dir = FSDirectory.open(new File(indexPath));
        return new IndexWriter(dir, config);
    }

    /**
     * Reads a tab-separated file line by line and adds one Lucene document per
     * row whose column 1 parses to {@code 2}.
     *
     * Rows that are too short, or whose column 1 is not an integer, are
     * skipped with a logged warning instead of aborting the run. If indexing
     * fails part-way, the writer is rolled back so that no partial index is
     * committed.
     *
     * @param indexPath Where to create the index
     * @param prefix String prepended verbatim to {@code fileList} to form the
     *        input file path (no separator is inserted)
     * @param fileList Name of the input file; each line is one tab-separated
     *        record with at least 15 columns
     * @throws IOException if the input file cannot be read or the index
     *         cannot be written
     */
    public static void index(String indexPath, String prefix, String fileList) throws IOException {
        System.out.println("Creating Lucene index...");

        FieldType indexedAndStored = newStoredFieldType(true);
        FieldType storedOnly = newStoredFieldType(false);

        // Open the input before the writer so that a missing or unreadable
        // input file fails before the index directory is touched.
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that matches the encoding of the input file.
        BufferedReader br = new BufferedReader(new FileReader(prefix + fileList));
        try {
            IndexWriter writer = setupIndex(indexPath);
            boolean completed = false;
            try {
                int indexed = indexAnswers(br, writer, indexedAndStored, storedOnly);
                System.out.println(" -> indexed " + indexed + " total docs.");
                completed = true;
            } finally {
                if (completed) {
                    writer.close();
                } else {
                    // Release the writer without committing a half-built index.
                    writer.rollback();
                }
            }
        } finally {
            br.close();
        }
    }

    /**
     * Creates a stored field type that is either indexed or not indexed.
     */
    private static FieldType newStoredFieldType(boolean indexed) {
        FieldType type = new FieldType();
        type.setIndexed(indexed);
        type.setStored(true);
        return type;
    }

    /**
     * Adds one document per selected row of the reader and returns how many were added.
     */
    private static int indexAnswers(BufferedReader reader, IndexWriter writer,
            FieldType indexedAndStored, FieldType storedOnly) throws IOException {
        int indexed = 0;
        int lineNumber = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            ++lineNumber;
            // A negative limit keeps trailing empty columns; the one-argument
            // split() drops them, which made rows with empty last columns
            // shorter than expected.
            String[] columns = line.split("\t", -1);
            if (!isIndexableRow(columns, lineNumber)) {
                continue;
            }
            writer.addDocument(buildDocument(columns, indexedAndStored, storedOnly));

            ++indexed;
            if (indexed % PROGRESS_INTERVAL == 0) {
                System.out.println(" -> indexed " + indexed + " docs...");
            }
        }
        return indexed;
    }

    /**
     * Tells whether a row has the selected type code and enough columns, logging a warning for malformed rows.
     */
    private static boolean isIndexableRow(String[] columns, int lineNumber) {
        if (columns.length < 2) {
            LOGGER.log(Level.WARNING, "Skipping line {0}: no type column", lineNumber);
            return false;
        }
        int typeCode;
        try {
            typeCode = Integer.parseInt(columns[1]);
        } catch (NumberFormatException e) {
            LOGGER.log(Level.WARNING, "Skipping line {0}: non-numeric type column \"{1}\"",
                    new Object[]{lineNumber, columns[1]});
            return false;
        }
        if (typeCode != ANSWER_TYPE_CODE) {
            return false;
        }
        if (columns.length < REQUIRED_COLUMNS) {
            LOGGER.log(Level.WARNING, "Skipping line {0}: expected at least {1} columns but found {2}",
                    new Object[]{lineNumber, REQUIRED_COLUMNS, columns.length});
            return false;
        }
        return true;
    }

    /**
     * Maps the columns of one row onto the fields of a new document (column 1 is not stored).
     */
    private static Document buildDocument(String[] columns,
            FieldType indexedAndStored, FieldType storedOnly) {
        Document doc = new Document();
        doc.add(new Field("id", columns[0], indexedAndStored));
        doc.add(new Field("parentId", columns[2], storedOnly));
        doc.add(new Field("acceptedAnswerId", columns[3], storedOnly));
        doc.add(new Field("creationDate", columns[4], storedOnly));
        doc.add(new Field("score", columns[5], storedOnly));
        doc.add(new Field("viewCount", columns[6], storedOnly));
        doc.add(new Field("body", columns[7], indexedAndStored));
        doc.add(new Field("code", columns[8], indexedAndStored));
        doc.add(new Field("ownerId", columns[9], storedOnly));
        doc.add(new Field("title", columns[10], storedOnly));
        doc.add(new Field("tags", columns[11], storedOnly));
        doc.add(new Field("answerCount", columns[12], storedOnly));
        doc.add(new Field("commentCount", columns[13], storedOnly));
        doc.add(new Field("favoriteCount", columns[14], storedOnly));
        return doc;
    }
}