com.tamingtext.qa.WikipediaWexIndexer.java Source code

Java tutorial

Introduction

Here is the source code for com.tamingtext.qa.WikipediaWexIndexer.java

Source

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.qa;

import java.io.File;
import java.io.FilenameFilter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads documents from Wikipedia WEX dump files through a {@link WexWikiContentSource}
 * and sends them to a Solr server in batches.
 *
 * <p>Can be used programmatically via {@link #index(File, int, int)} or from the command
 * line via {@link #main(String[])}.
 */
public class WikipediaWexIndexer {
    private static final Logger log = LoggerFactory.getLogger(WikipediaWexIndexer.class);

    /** Solr URL used when none is supplied. */
    public static final String DEFAULT_SOLR_URL = "http://localhost:8983/solr";

    /** Prefix of the dump files picked up when a directory is given on the command line. */
    private static final String SEGMENT_FILE_PREFIX = "freebase-segment-";

    /** Separator between individual categories inside the "category" property. */
    private static final String CATEGORY_SEPARATOR = ";;";

    private final SolrServer server;

    /**
     * Creates an indexer that talks to Solr at {@link #DEFAULT_SOLR_URL}.
     *
     * @throws MalformedURLException if the default URL cannot be parsed
     */
    public WikipediaWexIndexer() throws MalformedURLException {
        this.server = new CommonsHttpSolrServer(DEFAULT_SOLR_URL);
    }

    /**
     * Creates an indexer that sends documents to the given server.
     *
     * @param server the Solr server to add documents to
     * @throws MalformedURLException never thrown here; kept so existing callers still compile
     */
    public WikipediaWexIndexer(SolrServer server) throws MalformedURLException {
        this.server = server;
    }

    /**
     * Indexes every document in the given file using a batch size of 1000.
     *
     * @param wikipediaWEX the dump file to index
     * @return the number of documents sent to Solr
     * @throws Exception if reading the file or talking to Solr fails
     */
    public int index(File wikipediaWEX) throws Exception {
        return index(wikipediaWEX, Integer.MAX_VALUE, 1000);
    }

    /**
     * Indexes up to {@code numDocs} documents from the given file, sending them to Solr
     * in batches of {@code batchSize}, then commits and optimizes the index.
     *
     * @param wikipediaWEX the dump file to index; if {@code null} or not a regular file,
     *                     a message is printed and nothing is indexed
     * @param numDocs      the maximum number of documents to index from this file
     * @param batchSize    the number of documents per add request; must be positive
     * @return the number of documents sent to Solr for this file
     * @throws IllegalArgumentException if {@code batchSize} is not positive
     * @throws Exception if reading the file or talking to Solr fails
     */
    public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception {
        if (batchSize <= 0) {
            // A zero batch size would otherwise fail with a division by zero inside the loop.
            throw new IllegalArgumentException("batchSize must be positive: " + batchSize);
        }
        if (wikipediaWEX == null || !wikipediaWEX.isFile()) {
            System.out.println("Can't find file: " + wikipediaWEX);
            return 0;
        }

        String filePath = wikipediaWEX.getAbsolutePath();
        Properties properties = new Properties();
        properties.setProperty("docs.file", filePath);
        properties.setProperty("doc.maker.forever", "false");

        WexWikiContentSource contentSource = new WexWikiContentSource();
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();

        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(Math.min(batchSize, 1000));
        int indexed = 0;
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();
            // Check the limit first so no document is fetched once the limit is reached.
            while (indexed < numDocs && (docData = contentSource.getNextDocData(docData)) != null) {
                docs.add(toSolrDocument(docData, filePath, indexed));
                indexed++;

                if (docs.size() >= batchSize) {
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + indexed);
                    server.add(docs);
                    docs.clear();
                }
            }
        } catch (NoMoreDataException ignored) {
            // The content source signals that it has run out of data; this ends the loop normally.
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }

        // Flush the final, partially filled batch.
        if (!docs.isEmpty()) {
            server.add(docs);
        }
        server.commit();
        server.optimize();

        // 'indexed' already counts the documents of the final batch; do not add docs.size() again.
        return indexed;
    }

    /**
     * Converts one {@link DocData} into a Solr input document.
     *
     * @param docData  the source document
     * @param filePath absolute path of the dump file, used to build the "file" field
     * @param position zero-based position of the document within the file
     * @return the populated Solr document
     */
    private static SolrInputDocument toSolrDocument(DocData docData, String filePath, int position) {
        SolrInputDocument sDoc = new SolrInputDocument();
        sDoc.addField("file", filePath + "_" + position);
        sDoc.addField("docid", String.valueOf(docData.getID()));
        sDoc.addField("body", docData.getBody());
        sDoc.addField("doctitle", docData.getTitle());
        sDoc.addField("name_s", docData.getName());

        // Documents without properties or without a category are indexed without the field
        // instead of failing with a NullPointerException.
        Properties props = docData.getProps();
        String categoryValue = props == null ? null : props.getProperty("category");
        if (categoryValue != null) {
            for (String category : categoryValue.split(CATEGORY_SEPARATOR)) {
                sDoc.addField("category", category);
            }
        }
        return sDoc;
    }

    /**
     * Resolves the command line path into the list of dump files to index.
     *
     * @param path a single dump file, or a directory containing files whose names start
     *             with {@code freebase-segment-}
     * @return the files to index, or {@code null} if the directory could not be listed
     */
    private static File[] resolveDumpFiles(File path) {
        if (!path.isDirectory()) {
            return new File[] { path };
        }
        return path.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.startsWith(SEGMENT_FILE_PREFIX);
            }
        });
    }

    /**
     * Command line entry point. Options: {@code --wikiFile/-w} (required),
     * {@code --numDocs/-n}, {@code --solrURL/-s}, {@code --batch/-b}, {@code --help/-h}.
     *
     * @param args the command line arguments
     * @throws Exception if indexing fails or a numeric option cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        Option wikipediaFileOpt = obuilder.withLongName("wikiFile").withRequired(true)
                .withArgument(abuilder.withName("wikiFile").withMinimum(1).withMaximum(1).create())
                .withDescription("The path to the wikipedia dump file. "
                        + "May be a directory containing wikipedia dump files. "
                        + "If a directory is specified, files starting with the prefix "
                        + "freebase-segment- are used.")
                .withShortName("w").create();

        Option numDocsOpt = obuilder.withLongName("numDocs").withRequired(false)
                .withArgument(abuilder.withName("numDocs").withMinimum(1).withMaximum(1).create())
                .withDescription("The number of docs to index").withShortName("n").create();

        Option solrURLOpt = obuilder.withLongName("solrURL").withRequired(false)
                .withArgument(abuilder.withName("solrURL").withMinimum(1).withMaximum(1).create())
                .withDescription("The URL where Solr lives").withShortName("s").create();

        Option solrBatchOpt = obuilder.withLongName("batch").withRequired(false)
                .withArgument(abuilder.withName("batch").withMinimum(1).withMaximum(1).create())
                .withDescription("The number of docs to include in each indexing batch").withShortName("b")
                .create();

        Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
                .create();

        Group group = gbuilder.withName("Options").withOption(wikipediaFileOpt).withOption(numDocsOpt)
                .withOption(solrURLOpt).withOption(solrBatchOpt).withOption(helpOpt).create();

        Parser parser = new Parser();
        parser.setGroup(group);

        try {
            CommandLine cmdLine = parser.parse(args);

            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return;
            }

            File file = new File(cmdLine.getValue(wikipediaFileOpt).toString());
            File[] dumpFiles = resolveDumpFiles(file);
            if (dumpFiles == null) {
                // File.listFiles returns null when the directory cannot be read.
                log.error("Unable to list files in directory: " + file);
                return;
            }

            int numDocs = Integer.MAX_VALUE;
            if (cmdLine.hasOption(numDocsOpt)) {
                numDocs = Integer.parseInt(cmdLine.getValue(numDocsOpt).toString());
            }
            String url = DEFAULT_SOLR_URL;
            if (cmdLine.hasOption(solrURLOpt)) {
                url = cmdLine.getValue(solrURLOpt).toString();
            }
            int batch = 100;
            if (cmdLine.hasOption(solrBatchOpt)) {
                batch = Integer.parseInt(cmdLine.getValue(solrBatchOpt).toString());
            }

            WikipediaWexIndexer indexer = new WikipediaWexIndexer(new CommonsHttpSolrServer(url));
            int total = 0;
            for (int i = 0; i < dumpFiles.length && total < numDocs; i++) {
                File dumpFile = dumpFiles[i];
                // Log the file actually being processed, not the (possibly directory) argument.
                log.info("Indexing: " + dumpFile + " Num docs to index: " + (numDocs - total));
                long start = System.currentTimeMillis();
                int totalFile = indexer.index(dumpFile, numDocs - total, batch);
                long finish = System.currentTimeMillis();
                if (log.isInfoEnabled()) {
                    log.info("Indexing " + dumpFile + " took " + (finish - start) + " ms");
                }
                total += totalFile;
                log.info("Done Indexing: " + dumpFile + ". Indexed " + totalFile
                        + " docs for that file and " + total + " overall.");
            }
            log.info("Indexed " + total + " docs overall.");
        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(group);
            return;
        }
    }
}