Example usage for org.apache.lucene.benchmark.byTask.feeds DocData getBody

List of usage examples for org.apache.lucene.benchmark.byTask.feeds DocData getBody

Introduction

On this page you can find example usages of org.apache.lucene.benchmark.byTask.feeds DocData getBody.

Prototype

public String getBody() 

Source Link

Usage

From source file:com.datastax.dse.demos.solr.Wikipedia.java

License:Open Source License

/**
 * Copies the fields of a Wikipedia {@link DocData} record into the given
 * (reused) {@link SolrInputDocument}.
 *
 * @param doc the Solr document to populate; it is cleared first so it can be reused
 * @param d   the parsed Wikipedia page data
 * @return {@code false} when the title contains a ':' past position 0
 *         (namespaced pages such as "Category:..." are skipped), {@code true} otherwise
 */
public static boolean addDoc(SolrInputDocument doc, DocData d) {
    // Skip namespaced pages; a colon at index 0 intentionally does NOT skip.
    if (d.getTitle().indexOf(":") > 0) {
        return false;
    }

    doc.clear();
    doc.addField("id", d.getName());
    doc.addField("title", d.getTitle());
    doc.addField("body", d.getBody());
    doc.addField("date", d.getDate());
    return true;
}

From source file:com.grantingersoll.intell.index.Indexer.java

License:Apache License

/**
 * Streams documents out of a Wikipedia XML dump and indexes them into Solr
 * in batches of {@code batchSize}.
 *
 * @param wikipediaXML the enwiki XML dump file; a message is printed and 0
 *                     returned when it is null or missing
 * @param numDocs      upper bound on the number of documents to index
 * @param batchSize    number of documents sent to Solr per {@code server.add} call
 * @return the number of documents actually indexed
 * @throws Exception propagated from the content source or the Solr client
 */
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaXML != null && wikipediaXML.exists()) {
        EnwikiContentSource contentSource = new EnwikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaXML.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0;
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) {
                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", docData.getName());
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("docnum_i", String.valueOf(i));

                // Flush a full batch to Solr and start accumulating the next one.
                if (i % batchSize == batchSize - 1) {
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i);
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException ignored) {
            // Expected terminal condition: the dump was exhausted before numDocs.
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        // Flush the final partial batch, if any.
        if (docs.size() > 0) {
            server.add(docs);
        }
        // Bug fix: the leftover docs were already counted in i, so the old
        // "result = i + docs.size()" double-counted the final partial batch.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaXML);
    }
    return result;
}

From source file:com.tamingtext.qa.WikipediaIndexer.java

License:Apache License

/**
 * Streams documents out of a Wikipedia XML dump and indexes them into Solr
 * in batches of {@code batchSize}.
 *
 * @param wikipediaXML the enwiki XML dump file; a message is printed and 0
 *                     returned when it is null or missing
 * @param numDocs      upper bound on the number of documents to index
 * @param batchSize    number of documents sent to Solr per {@code server.add} call
 * @return the number of documents actually indexed
 * @throws Exception propagated from the content source or the Solr client
 */
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaXML != null && wikipediaXML.exists()) {
        EnwikiContentSource contentSource = new EnwikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaXML.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0;
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) {
                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", String.valueOf(i));
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("name_s", docData.getName());

                // Flush a full batch to Solr and start accumulating the next one.
                if (i % batchSize == batchSize - 1) {
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i);
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException ignored) {
            // Expected terminal condition: the dump was exhausted before numDocs.
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        // Flush the final partial batch, if any.
        if (docs.size() > 0) {
            server.add(docs);
        }
        // Bug fix: the leftover docs were already counted in i, so the old
        // "result = i + docs.size()" double-counted the final partial batch.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaXML);
    }
    return result;
}

From source file:com.tamingtext.qa.WikipediaWexIndexer.java

License:Apache License

/**
 * Streams documents out of a Wikipedia WEX dump and indexes them into Solr
 * in batches of {@code batchSize}, including per-document category fields.
 *
 * @param wikipediaWEX the WEX dump file; a message is printed and 0
 *                     returned when it is null or not a regular file
 * @param numDocs      upper bound on the number of documents to index
 * @param batchSize    number of documents sent to Solr per {@code server.add} call
 * @return the number of documents actually indexed
 * @throws Exception propagated from the content source or the Solr client
 */
public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaWEX != null && wikipediaWEX.isFile()) {
        WexWikiContentSource contentSource = new WexWikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaWEX.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0;
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) {
                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", String.valueOf(docData.getID()));
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("name_s", docData.getName());

                // Bug fix: getProperty("category") returns null when a page
                // has no categories; the old code NPE'd on .split(";;").
                String categoryProp = docData.getProps().getProperty("category");
                if (categoryProp != null) {
                    for (String c : categoryProp.split(";;")) {
                        sDoc.addField("category", c);
                    }
                }

                // Flush a full batch to Solr and start accumulating the next one.
                if (i % batchSize == batchSize - 1) {
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i);
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException ignored) {
            // Expected terminal condition: the dump was exhausted before numDocs.
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        // Flush the final partial batch, if any.
        if (docs.size() > 0) {
            server.add(docs);
        }
        // Bug fix: the leftover docs were already counted in i, so the old
        // "result = i + docs.size()" double-counted the final partial batch.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaWEX);
    }
    return result;
}

From source file:io.anserini.index.transform.NekoStringTransform.java

License:Apache License

/**
 * Parses the raw HTML string through the Neko-backed DemoHTMLParser and
 * returns the extracted title and body joined by a newline.
 * Any parse failure degrades to the empty string rather than propagating.
 */
@Override
public String apply(String s) {
    try {
        DocData parsed = dhp.parse(new DocData(), "", null, new StringReader(s), null);
        return parsed.getTitle() + "\n" + parsed.getBody();
    } catch (Exception e) {
        // Best-effort transform: unparseable input yields an empty document.
        return "";
    }
}

From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java

License:Apache License

/**
 * Builds a Lucene {@link Document} from a CLEF {@link DocData} record.
 *
 * @param docData       source record; its name becomes the stored ID field
 *                      and its body the body field
 * @param bodyFieldType field type (indexing/storage options) applied to the body
 * @return a new two-field Lucene document
 */
public static Document getDocumentFromDocData(DocData docData, FieldType bodyFieldType) {
    Document result = new Document();
    // Identifier: stored verbatim so it can be retrieved from search results.
    result.add(new StringField(BuildIndex.ID_FIELD_NAME, docData.getName(), Field.Store.YES));
    // Body: indexing/storage behavior is controlled by the caller-supplied type.
    result.add(new Field(BuildIndex.BODY_FIELD_NAME, docData.getBody(), bodyFieldType));
    return result;
}