it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java Source code

Introduction

Here is the source code for it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java
Source

package it.unipd.dei.ims.lucene.clef.applications;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import it.unipd.dei.ims.lucene.clef.AnalyzerFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Properties;

/**
 * Command-line application that indexes a CLEF test collection with Apache Lucene.
 *
 * <p>Configuration is read from the file named by the {@code properties.path} system property or,
 * when that is not set, from the {@code lucene-clef.properties} classpath resource. System
 * properties are then layered on top, so they override values loaded from the file.
 *
 * <p>Properties read by {@link #main(String[])}:
 * <ul>
 *   <li>{@code language}, {@code stemmer}, {@code stopset.type} (required);</li>
 *   <li>{@code stopset.path} (required only when {@code stopset.type} is {@code CUSTOM});</li>
 *   <li>{@code corpora.path}, {@code index.path} (required);</li>
 *   <li>{@code <language>.corpus.size}: expected number of documents (required);</li>
 *   <li>{@code <language>.corpora}: {@code ;}-separated corpus directory names (required);</li>
 *   <li>{@code <corpus>.encoding}: optional per-corpus encoding, defaults to {@code UTF-8}.</li>
 * </ul>
 */
public class BuildIndex {

    static org.slf4j.Logger logger = LoggerFactory.getLogger(BuildIndex.class);

    // NOTE(review): the two field names below are mutable public statics; they are kept
    // non-final on purpose so that the public interface of this class stays untouched.

    /** Name of the stored field that holds the document identifier. */
    public static String ID_FIELD_NAME = "ID";

    /** Name of the indexed field that holds the document body. */
    public static String BODY_FIELD_NAME = "BODY";

    /** Classpath resource used when the {@code properties.path} system property is not set. */
    private static final String DEFAULT_PROPERTIES_RESOURCE = "lucene-clef.properties";

    /** Whether the body field is indexed with positions and offsets in addition to frequencies. */
    private static final boolean STORE_POSITIONS = true;

    /**
     * Builds the Lucene index for the configured language and checks the resulting index size.
     *
     * <p>Errors raised while indexing are logged; they do not propagate out of this method.
     *
     * @param args ignored; all configuration comes from properties
     * @throws IllegalArgumentException if a required property is missing, or if the corpus size
     *         property is not a valid integer
     */
    public static void main(String[] args) {

        Properties properties = loadProperties();

        String language = requireProperty(properties, "language");
        String stemmer = requireProperty(properties, "stemmer");
        String stopsetType = requireProperty(properties, "stopset.type");

        // A stop-list path is only needed for CUSTOM stop sets; otherwise it stays null.
        String stopsetPath = null;
        if (stopsetType.equalsIgnoreCase("CUSTOM")) {
            stopsetPath = requireProperty(properties, "stopset.path");
        }

        String corporaRootPath = requireProperty(properties, "corpora.path");
        int corpusSize = Integer.parseInt(requireProperty(properties, language + ".corpus.size").trim());
        String[] corpora = requireProperty(properties, language + ".corpora").split(";");
        Path indexPath = new File(requireProperty(properties, "index.path")).toPath();

        Properties configProps = new Properties();
        configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser");
        configProps.setProperty("content.source.verbose", "false");
        configProps.setProperty("content.source.forever", "false");
        configProps.setProperty("content.source.excludeIteration", "true");
        configProps.setProperty("work.dir", new File(".").getAbsolutePath());
        configProps.setProperty("language", language);
        configProps.setProperty("stemmer", stemmer);
        configProps.setProperty("stopset_type", stopsetType);
        // Properties rejects null values, so the path is only set when there is one.
        if (stopsetPath != null) {
            configProps.setProperty("stopset_path", stopsetPath);
        }

        try (TrecContentSource trecContentSource = new TrecContentSource();
                Directory directory = new SimpleFSDirectory(indexPath)) {

            // indexing configuration
            CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);
            Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset);

            IndexWriterConfig conf = new IndexWriterConfig(analyzer);
            conf.setSimilarity(new BM25Similarity());
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

            FieldType bodyFieldType = createBodyFieldType();

            // The writer is closed before the index is re-opened for the size check below.
            try (IndexWriter indexWriter = new IndexWriter(directory, conf)) {
                for (String corpusEntry : corpora) {
                    String corpus = corpusEntry.trim();
                    if (corpus.isEmpty()) {
                        // An empty name would make docs.dir point at the corpora root itself.
                        continue;
                    }

                    logger.info("... indexing corpus {}", corpus);

                    configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus);
                    configProps.setProperty("content.source.encoding",
                            properties.getProperty(corpus + ".encoding", "UTF-8"));
                    trecContentSource.setConfig(new Config(configProps));

                    int docCount = indexAllDocuments(trecContentSource, indexWriter, bodyFieldType);
                    logger.info("... {} documents indexed for corpus {}\n", docCount, corpus);
                }
            }

            verifyIndexSize(directory, corpusSize);

        } catch (Exception e) {
            // Top-level boundary: AnalyzerFactory and Lucene may raise arbitrary exceptions.
            logger.error("Indexing failed", e);
        }

    }

    /**
     * Converts a parsed document into a Lucene {@link Document} with two fields:
     * {@link #ID_FIELD_NAME} (stored, taken from {@code docData.getName()}) and
     * {@link #BODY_FIELD_NAME} (taken from {@code docData.getBody()}).
     *
     * @param docData       the parsed document to convert
     * @param bodyFieldType the field type applied to the body field
     * @return a new document holding the identifier and body fields
     */
    public static Document getDocumentFromDocData(DocData docData, FieldType bodyFieldType) {

        Document doc = new Document();

        // add identifier field
        doc.add(new StringField(BuildIndex.ID_FIELD_NAME, docData.getName(), Field.Store.YES));

        // add body field
        doc.add(new Field(BuildIndex.BODY_FIELD_NAME, docData.getBody(), bodyFieldType));

        return doc;

    }

    /**
     * Loads the configuration file (if any) and overlays the JVM system properties on it.
     * A missing or unreadable file is logged and otherwise ignored, so that a run configured
     * purely through system properties still works.
     */
    private static Properties loadProperties() {
        Properties properties = new Properties();
        try (InputStream input = openPropertiesStream()) {
            if (input == null) {
                logger.warn("Default property file [{}] not found on the classpath; "
                        + "relying on system properties only", DEFAULT_PROPERTIES_RESOURCE);
            } else {
                properties.load(input);
            }
        } catch (IOException ex) {
            logger.error("Unable to load the property file", ex);
        }
        properties.putAll(System.getProperties());
        return properties;
    }

    /**
     * Opens the stream the configuration is read from.
     *
     * @return the stream, or {@code null} when the default classpath resource does not exist
     */
    private static InputStream openPropertiesStream() throws IOException {
        String path = System.getProperty("properties.path");
        if (path != null) {
            return new FileInputStream(path);
        }
        logger.info("Loading default property file [resources/lucene-clef.properties]");
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        return loader.getResourceAsStream(DEFAULT_PROPERTIES_RESOURCE);
    }

    /** Returns the value of {@code key}, failing with a descriptive message when it is absent. */
    private static String requireProperty(Properties properties, String key) {
        String value = properties.getProperty(key);
        if (value == null) {
            throw new IllegalArgumentException("Missing required property [" + key + "]");
        }
        return value;
    }

    /** Creates the field type used for the body field, honouring {@link #STORE_POSITIONS}. */
    private static FieldType createBodyFieldType() {
        FieldType bodyFieldType = new FieldType();
        if (STORE_POSITIONS) {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        } else {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        }
        return bodyFieldType;
    }

    /**
     * Adds every document produced by {@code source} to {@code indexWriter}.
     *
     * @return the number of documents added
     */
    private static int indexAllDocuments(TrecContentSource source, IndexWriter indexWriter,
            FieldType bodyFieldType) throws IOException {
        int docCount = 0;
        try {
            DocData docData = new DocData();
            while ((docData = source.getNextDocData(docData)) != null) {
                indexWriter.addDocument(getDocumentFromDocData(docData, bodyFieldType));
                docCount++;
            }
        } catch (NoMoreDataException expected) {
            // The content source signals that it is exhausted by throwing; nothing to handle.
        }
        return docCount;
    }

    /** Re-opens the index and logs whether it holds exactly {@code expectedSize} documents. */
    private static void verifyIndexSize(Directory directory, int expectedSize) throws IOException {
        try (DirectoryReader reader = DirectoryReader.open(directory)) {
            int indexed = reader.numDocs();
            if (indexed != expectedSize) {
                logger.error("The number of documents indexed is {}, but should be {}", indexed, expectedSize);
                return;
            }
            logger.info("Number of documents: {}", indexed);
        }
    }

}