com.leavesfly.lia.commom.CreateTestIndex.java Source code

Introduction

Here is the source code for com.leavesfly.lia.commom.CreateTestIndex.java
Source

package com.leavesfly.lia.commom;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;
import java.util.Date;
import java.util.List;
import java.util.ArrayList;
import java.text.ParseException;

public class CreateTestIndex {

    public static Document getDocument(String rootDir, File file) throws IOException {
        Properties props = new Properties();
        props.load(new FileInputStream(file));

        Document doc = new Document();

        // category comes from relative path below the base directory
        String category = file.getParent().substring(rootDir.length()); //1
        category = category.replace(File.separatorChar, '/'); //1

        String isbn = props.getProperty("isbn"); //2
        String title = props.getProperty("title"); //2
        String author = props.getProperty("author"); //2
        String url = props.getProperty("url"); //2
        String subject = props.getProperty("subject"); //2

        String pubmonth = props.getProperty("pubmonth"); //2

        System.out.println(
                title + "\n" + author + "\n" + subject + "\n" + pubmonth + "\n" + category + "\n---------");

        doc.add(new Field("isbn", // 3
                isbn, // 3
                Field.Store.YES, // 3
                Field.Index.NOT_ANALYZED)); // 3
        doc.add(new Field("category", // 3
                category, // 3
                Field.Store.YES, // 3
                Field.Index.NOT_ANALYZED)); // 3
        doc.add(new Field("title", // 3
                title, // 3
                Field.Store.YES, // 3
                Field.Index.ANALYZED, // 3
                Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3
        doc.add(new Field("title2", // 3
                title.toLowerCase(), // 3
                Field.Store.YES, // 3
                Field.Index.NOT_ANALYZED_NO_NORMS, // 3
                Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3

        // split multiple authors into unique field instances
        String[] authors = author.split(","); // 3
        for (String a : authors) { // 3
            doc.add(new Field("author", // 3
                    a, // 3
                    Field.Store.YES, // 3
                    Field.Index.NOT_ANALYZED, // 3
                    Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3
        }

        doc.add(new Field("url", // 3
                url, // 3
                Field.Store.YES, // 3
                Field.Index.NOT_ANALYZED_NO_NORMS)); // 3
        doc.add(new Field("subject", // 3  //4
                subject, // 3  //4
                Field.Store.YES, // 3  //4
                Field.Index.ANALYZED, // 3  //4
                Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3  //4

        doc.add(new NumericField("pubmonth", // 3
                Field.Store.YES, // 3
                true).setIntValue(Integer.parseInt(pubmonth))); // 3

        Date d; // 3
        try { // 3
            d = DateTools.stringToDate(pubmonth); // 3
        } catch (ParseException pe) { // 3
            throw new RuntimeException(pe); // 3
        } // 3
        doc.add(new NumericField("pubmonthAsDay") // 3
                .setIntValue((int) (d.getTime() / (1000 * 3600 * 24)))); // 3

        for (String text : new String[] { title, subject, author, category }) { // 3 // 5
            doc.add(new Field("contents", text, // 3 // 5
                    Field.Store.NO, Field.Index.ANALYZED, // 3 // 5
                    Field.TermVector.WITH_POSITIONS_OFFSETS)); // 3 // 5
        }

        return doc;
    }

    private static String aggregate(String[] strings) {
        StringBuilder buffer = new StringBuilder();

        for (int i = 0; i < strings.length; i++) {
            buffer.append(strings[i]);
            buffer.append(" ");
        }

        return buffer.toString();
    }

    private static void findFiles(List<File> result, File dir) {
        for (File file : dir.listFiles()) {
            if (file.getName().endsWith(".properties")) {
                result.add(file);
            } else if (file.isDirectory()) {
                findFiles(result, file);
            }
        }
    }

    private static class MyStandardAnalyzer extends StandardAnalyzer { // 6
        public MyStandardAnalyzer(Version matchVersion) { // 6
            super(matchVersion); // 6
        } // 6

        public int getPositionIncrementGap(String field) { // 6
            if (field.equals("contents")) { // 6
                return 100; // 6
            } else { // 6
                return 0; // 6
            }
        }
    }

    public static void main(String[] args) throws IOException {
        String dataDir = args[0];
        String indexDir = args[1];
        List<File> results = new ArrayList<File>();
        findFiles(results, new File(dataDir));
        System.out.println(results.size() + " books to index");
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexWriter w = new IndexWriter(dir, new MyStandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        for (File file : results) {
            Document doc = getDocument(dataDir, file);
            w.addDocument(doc);
        }
        w.close();
        dir.close();
    }
}

/*
  #1 Get category
  #2 Pull fields
  #3 Add fields to Document instance
  #4 Flag subject field
  #5 Add catch-all contents field
  #6 Custom analyzer to override multi-valued position increment
*/