Example usage for org.apache.lucene.benchmark.byTask.feeds TrecContentSource getNextDocData

List of usage examples for org.apache.lucene.benchmark.byTask.feeds TrecContentSource getNextDocData

Introduction

On this page you can find example usage for org.apache.lucene.benchmark.byTask.feeds TrecContentSource getNextDocData.

Prototype

@Override
    public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException 

Source Link

Usage

From source file: it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java

License:Apache License

/**
 * Builds a Lucene index over the corpora configured for one language and verifies its size.
 *
 * <p>Settings are loaded from the file named by the {@code properties.path} system property or,
 * when that is not set, from {@code lucene-clef.properties} on the context class path. System
 * properties are then layered on top, so they override values read from the file.
 *
 * <p>Every corpus listed in {@code <language>.corpora} (separated by {@code ;}) is read through a
 * {@code TrecContentSource} and added to a freshly created index at {@code index.path}. Afterwards
 * the number of indexed documents is compared with {@code <language>.corpus.size}.
 *
 * @param args command line arguments; not used
 * @throws IllegalArgumentException if a required property is not configured
 * @throws NumberFormatException if {@code <language>.corpus.size} is not an integer
 */
public static void main(String[] args) {

    Properties properties = new Properties();
    String propertiesPath = System.getProperty("properties.path");
    try {
        if (propertiesPath != null) {
            try (InputStream input = new FileInputStream(propertiesPath)) {
                properties.load(input);
            }
        } else {
            logger.info("Loading default property file [resources/lucene-clef.properties]");
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            try (InputStream input = loader.getResourceAsStream("lucene-clef.properties")) {
                if (input == null) {
                    // getResourceAsStream returns null for a missing resource; loading it would
                    // fail with a NullPointerException instead of a readable message.
                    logger.info("Default property file [lucene-clef.properties] not found on the class path");
                } else {
                    properties.load(input);
                }
            }
        }
    } catch (IOException ex) {
        // Best effort: the system properties merged below may still provide every setting.
        ex.printStackTrace();
    }

    properties.putAll(System.getProperties());

    String language = requireProperty(properties, "language");
    String stemmer = properties.getProperty("stemmer");
    String stopsetType = requireProperty(properties, "stopset.type");

    // The stop set path is only meaningful for a custom stop set.
    String stopsetPath = null;
    if (stopsetType.equalsIgnoreCase("CUSTOM")) {
        stopsetPath = properties.getProperty("stopset.path");
    }

    String corporaRootPath = requireProperty(properties, "corpora.path");
    int corpusSize = Integer.parseInt(requireProperty(properties, language + ".corpus.size").trim());
    String[] corpora = requireProperty(properties, language + ".corpora").split(";");
    Path indexPath = new File(requireProperty(properties, "index.path")).toPath();

    Properties configProps = new Properties();
    configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser");
    configProps.setProperty("content.source.verbose", "false");
    configProps.setProperty("content.source.forever", "false");
    configProps.setProperty("content.source.excludeIteration", "true");
    configProps.setProperty("work.dir", new File(".").getAbsolutePath());
    configProps.setProperty("language", language);
    configProps.setProperty("stopset_type", stopsetType);
    // Properties rejects null values, so optional settings are copied only when present.
    if (stemmer != null) {
        configProps.setProperty("stemmer", stemmer);
    }
    if (stopsetPath != null) {
        configProps.setProperty("stopset_path", stopsetPath);
    }

    try (TrecContentSource trecContentSource = new TrecContentSource();
            Directory directory = new SimpleFSDirectory(indexPath)) {

        // indexing configuration
        CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);
        Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset);

        IndexWriterConfig conf = new IndexWriterConfig(analyzer);
        conf.setSimilarity(new BM25Similarity());
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        FieldType bodyFieldType = new FieldType();
        bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

        // The writer is closed before the reader below is opened.
        try (IndexWriter indexWriter = new IndexWriter(directory, conf)) {
            for (String corpus : corpora) {

                int docCount = 0;

                logger.info("... indexing corpus " + corpus);

                configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus);
                configProps.setProperty("content.source.encoding",
                        properties.getProperty(corpus + ".encoding", "UTF-8"));

                trecContentSource.setConfig(new Config(configProps));

                try {
                    DocData docData = new DocData();
                    while ((docData = trecContentSource.getNextDocData(docData)) != null) {
                        docCount++;
                        Document doc = getDocumentFromDocData(docData, bodyFieldType);
                        indexWriter.addDocument(doc);
                    }
                } catch (NoMoreDataException ignored) {
                    // Treated as the end-of-corpus signal: move on to the summary below.
                }

                logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n");
            }
        }

        try (DirectoryReader ireader = DirectoryReader.open(directory)) {
            int indexedDocs = ireader.numDocs();
            if (corpusSize != indexedDocs) {
                throw new IllegalStateException("The number of documents indexed is " + indexedDocs
                        + ", but should be " + corpusSize);
            }
            logger.info("Number of documents: " + indexedDocs);
        }

    } catch (Exception e) {
        // Top-level boundary of the command line tool: report the failure and return.
        e.printStackTrace();
    }

}

/** Returns the value configured for {@code key}, or fails with a message naming the missing key. */
private static String requireProperty(Properties properties, String key) {
    String value = properties.getProperty(key);
    if (value == null) {
        throw new IllegalArgumentException("Missing required property [" + key + "]");
    }
    return value;
}