com.spike.text.lucene.util.LuceneTestBookIndexingUtil.java Source code

Introduction

Here is the source code for com.spike.text.lucene.util.LuceneTestBookIndexingUtil.java
Source

package com.spike.text.lucene.util;

/**
 * Copyright Manning Publications Co. Licensed under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
 * law or agreed to in writing, software distributed under the License is distributed on an "AS IS"
 * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License
 * for the specific lan
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.spike.text.lucene.util.anno.LuceneInAction2ndBook;

/**
 * #1 Get category <br/>
 * #2 Pull fields <br/>
 * #3 Add fields to Document instance <br/>
 * #4 Flag subject field <br/>
 * #5 Add catch-all contents field <br/>
 * #6 Custom analyzer to override multi-valued position increment
 */
@LuceneInAction2ndBook
public class LuceneTestBookIndexingUtil {

    public static void main(String[] args) throws IOException {
        String dataDir = LuceneAppConstants.BOOK_DATA_DIR;
        String indexDir = LuceneAppConstants.BOOK_INDEX_DIR;

        List<File> results = new ArrayList<File>();
        findFiles(results, new File(dataDir));
        System.out.println(results.size() + " books to index");
        Directory directory = FSDirectory.open(Paths.get(indexDir));

        IndexWriterConfig config = new IndexWriterConfig(new MyStandardAnalyzer());
        config.setCommitOnClose(true);
        IndexWriter indexWriter = new IndexWriter(directory, config);

        for (File file : results) {
            Document document = getDocument(dataDir, file);
            indexWriter.addDocument(document);
        }

        indexWriter.close();
        directory.close();
    }

    /**
     * field names used for indexing
     * @author zhoujiagen<br/>
     *         Sep 3, 2015 7:23:22 PM
     */
    public static interface FIELD_NAMES {
        String isbn = "isbn";
        String title = "title";
        String author = "author";
        String url = "url";
        String subject = "subject";
        String pubmonth = "pubmonth";
        String category = "category";
        String title2 = "title2";
        String pubmonthAsDay = "pubmonthAsDay";
        String contents = "contents";
    }

    public static Document getDocument(String rootDir, File file) throws IOException {
        Properties props = new Properties();
        props.load(new FileInputStream(file));

        Document doc = new Document();

        // category comes from relative path below the base directory
        String category = file.getParent().substring(rootDir.length());
        category = category.replace(File.separatorChar, '/');

        String isbn = props.getProperty("isbn");
        String title = props.getProperty("title");
        String author = props.getProperty("author");
        String url = props.getProperty("url");
        String subject = props.getProperty("subject");

        String pubmonth = props.getProperty("pubmonth");

        System.out.println(
                title + "\n" + author + "\n" + subject + "\n" + pubmonth + "\n" + category + "\n---------");

        // doc.add(new Field("isbn", isbn, Field.Store.YES,
        // Field.Index.NOT_ANALYZED));
        doc.add(LuceneAppUtil.createStringField("isbn", isbn, Store.YES, false, IndexOptions.DOCS, false));

        // doc.add(new Field("category", category, Field.Store.YES,
        // Field.Index.NOT_ANALYZED));
        doc.add(LuceneAppUtil.createStringField("category", category, Store.YES, false, IndexOptions.DOCS, false));

        // doc.add(new Field("title", title, Field.Store.YES,
        // Field.Index.ANALYZED,
        // Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(LuceneAppUtil.createStringField("title", title, Store.YES, true,
                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, false));

        // doc.add(new Field("title2", title.toLowerCase(), Field.Store.YES,
        // Field.Index.NOT_ANALYZED_NO_NORMS,
        // Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(LuceneAppUtil.createStringField("title2", title.toLowerCase(), Store.YES, false,
                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, true));

        // split multiple authors into unique field instances
        String[] authors = author.split(",");
        for (String a : authors) {
            // doc.add(new Field("author", a, Field.Store.YES,
            // Field.Index.NOT_ANALYZED,
            // Field.TermVector.WITH_POSITIONS_OFFSETS));
            doc.add(LuceneAppUtil.createStringField("author", a, Store.YES, false,
                    IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, false));
        }

        // doc.add(new Field("url", url, Field.Store.YES,
        // Field.Index.NOT_ANALYZED_NO_NORMS));
        doc.add(LuceneAppUtil.createStringField("url", url, Store.YES, false, IndexOptions.DOCS, true));
        // doc.add(new Field("subject", subject, Field.Store.YES,
        // Field.Index.ANALYZED,
        // Field.TermVector.WITH_POSITIONS_OFFSETS));
        doc.add(LuceneAppUtil.createStringField("subject", subject, Store.YES, true,
                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, true));

        Field pubmonthField = new IntField("pubmonth", Integer.parseInt(pubmonth), Store.YES);
        doc.add(pubmonthField);

        Date d;
        try {
            d = DateTools.stringToDate(pubmonth);
        } catch (ParseException pe) {
            throw new RuntimeException(pe);
        }

        Field pubmonthAsDayField = new IntField("pubmonthAsDay", (int) (d.getTime() / (1000 * 3600 * 24)),
                Store.YES);
        doc.add(pubmonthAsDayField);

        for (String text : new String[] { title, subject, author, category }) {
            // doc.add(new Field("contents", text, Field.Store.NO,
            // Field.Index.ANALYZED,
            // Field.TermVector.WITH_POSITIONS_OFFSETS));
            doc.add(LuceneAppUtil.createStringField("contents", text, Store.NO, true,
                    IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, false));

        }

        return doc;
    }

    /**
     * A Recursive method to find the dedicated files in directory
     * @param result
     * @param dir
     */
    private static void findFiles(List<File> result, File dir) {
        for (File file : dir.listFiles()) {
            if (file.getName().endsWith(".properties")) {
                result.add(file);
            } else if (file.isDirectory()) {
                findFiles(result, file);
            }
        }
    }

    private static class MyStandardAnalyzer extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            StandardTokenizer tokenizer = new StandardTokenizer();
            PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
            if ("contents".equals(fieldName)) {
                posIncrAtt.setPositionIncrement(100);
            } else {
                posIncrAtt.setPositionIncrement(0);
            }

            TokenStreamComponents result = new TokenStreamComponents(tokenizer);

            return result;
        }
    }

}