Example usage for org.apache.lucene.benchmark.byTask.feeds EnwikiContentSource EnwikiContentSource

List of usage examples for the EnwikiContentSource constructor of org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource

Introduction

On this page you can find example usages of the EnwikiContentSource constructor of org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource.

Prototype

EnwikiContentSource

Source Link

Usage

From source file:com.datastax.dse.demos.solr.Wikipedia.java

License:Open Source License

/**
 * Reads documents from the Wikipedia dump named by {@code wikifile} through an
 * {@link EnwikiContentSource} and adds them to the Solr server at {@code url}.
 *
 * <p>At most {@code limit} documents are read. Reading also stops when the source
 * signals {@link NoMoreDataException}, or when a document with the same name as the
 * first one is seen again (treated as the source having looped). When both
 * {@code user} and {@code password} are non-null, a {@code PreEmptiveBasicAuthenticator}
 * request interceptor is registered on the client's HTTP client.
 *
 * <p>Failures are reported on standard error and never propagated. Whatever happens,
 * the method attempts to commit the Solr client and to close the content source,
 * each independently of the other.
 */
public static void indexWikipedia() {

    HttpSolrServer solrClient = null;
    try {
        Properties p = new Properties();
        p.setProperty("keep.image.only.docs", "false");
        p.setProperty("docs.file", wikifile);

        Config config = new Config(p);

        source = new EnwikiContentSource();
        source.setConfig(config);
        source.resetInputs();
        solrClient = new HttpSolrServer(url);

        if (null != user && null != password) {
            AbstractHttpClient httpClient = (AbstractHttpClient) solrClient.getHttpClient();
            httpClient.addRequestInterceptor(new PreEmptiveBasicAuthenticator(user, password));
        }

        DocData docData = new DocData();
        String firstName = null;
        // NOTE(review): the same SolrInputDocument instance is handed to addDoc on every
        // iteration; presumably addDoc resets it before filling it in - confirm.
        SolrInputDocument doc = new SolrInputDocument();
        int indexed = 0;
        for (int x = 0; x < limit; x++) {
            docData = source.getNextDocData(docData);

            if (firstName == null) {
                firstName = docData.getName();
            } else if (firstName.equals(docData.getName())) {
                break; // the first document came around again: the source has looped
            }

            if (addDoc(doc, docData)) {
                solrClient.add(doc);
                indexed++;
                // Report progress once per 1000 documents actually indexed. Printing must
                // not modify the counter, and must not repeat while documents are skipped.
                if (indexed % 1000 == 0) {
                    System.out.println("Indexed " + indexed);
                }
            }
        }
    } catch (NoMoreDataException endOfInput) {
        // Expected: the content source ran out of documents before the limit was reached.
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Commit and close are attempted separately so that a failed commit cannot
        // prevent the content source from being closed.
        try {
            if (solrClient != null) {
                solrClient.commit();
            }
        } catch (Throwable t) {
            t.printStackTrace();
        }

        try {
            if (source != null) {
                source.close();
            }
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }

}

From source file:com.tamingtext.qa.WikipediaIndexer.java

License:Apache License

/**
 * Indexes up to {@code numDocs} documents from a Wikipedia XML dump into {@code server},
 * sending them in batches, then commits and optimizes.
 *
 * <p>Each document gets the fields {@code file} (absolute path plus {@code _<n>}),
 * {@code docid}, {@code body}, {@code doctitle} and {@code name_s}. Reading stops when
 * {@code numDocs} documents have been read, or when the content source returns
 * {@code null} or throws {@link NoMoreDataException}.
 *
 * @param wikipediaXML the dump file; when {@code null} or missing, a message is printed
 *                     and nothing is indexed
 * @param numDocs      maximum number of documents to index
 * @param batchSize    number of documents per {@code server.add} call; values below 1
 *                     cause every document to be sent on its own
 * @return the number of documents handed to the server (0 if the file was not found)
 * @throws Exception if configuring or reading the source, or talking to the server, fails
 */
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception {
    if (wikipediaXML == null || !wikipediaXML.exists()) {
        System.out.println("Can't find file: " + wikipediaXML);
        return 0;
    }

    String filePath = wikipediaXML.getAbsolutePath();
    Properties properties = new Properties();
    properties.setProperty("docs.file", filePath);
    properties.setProperty("doc.maker.forever", "false");

    EnwikiContentSource contentSource = new EnwikiContentSource();
    List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
    int i = 0;
    long start = System.currentTimeMillis();
    try {
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();

        DocData docData = new DocData();
        // Check the limit first so no extra document is pulled from the source
        // once numDocs has been reached.
        while (i < numDocs && (docData = contentSource.getNextDocData(docData)) != null) {
            SolrInputDocument sDoc = new SolrInputDocument();
            sDoc.addField("file", filePath + "_" + i);
            sDoc.addField("docid", String.valueOf(i));
            sDoc.addField("body", docData.getBody());
            sDoc.addField("doctitle", docData.getTitle());
            sDoc.addField("name_s", docData.getName());
            docs.add(sDoc);
            i++;

            // docs is emptied after every send, so its size is the fill level of the
            // current batch; comparing sizes also avoids a modulo by zero.
            if (docs.size() >= batchSize) {
                log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i);
                server.add(docs);
                docs.clear();
            }
        }
    } catch (NoMoreDataException endOfInput) {
        // Expected: the dump contained fewer than numDocs documents.
    } finally {
        // Release the underlying input even when reading or sending failed.
        contentSource.close();
    }
    long finish = System.currentTimeMillis();
    if (log.isInfoEnabled()) {
        log.info("Indexing took " + (finish - start) + " ms");
    }

    // Flush the final, partially filled batch.
    if (docs.size() > 0) {
        server.add(docs);
    }
    server.commit();
    server.optimize();

    // i already counts every document placed into a batch, including the final
    // partial one, so it is the total on its own.
    return i;
}