Example usage for org.apache.lucene.benchmark.byTask.feeds DocData DocData

List of usage examples for org.apache.lucene.benchmark.byTask.feeds DocData DocData

Introduction

On this page you can find example usage for org.apache.lucene.benchmark.byTask.feeds DocData DocData.

Prototype

DocData

Source Link

Usage

From source file:com.datastax.dse.demos.solr.Wikipedia.java

License:Open Source License

/**
 * Streams Wikipedia articles from the configured dump file ({@code wikifile})
 * into Solr at {@code url}, stopping after {@code limit} iterations or when the
 * content source wraps around to the first article again.
 *
 * <p>Side effects: commits the Solr client and closes the shared {@code source}
 * in the finally block. Progress is printed every 1000 indexed documents.
 */
public static void indexWikipedia() {

    HttpSolrServer solrClient = null;
    try {
        // Configure the Lucene benchmark content source to read the dump file.
        Properties p = new Properties();
        p.setProperty("keep.image.only.docs", "false");
        p.setProperty("docs.file", wikifile);

        Config config = new Config(p);

        source = new EnwikiContentSource();
        source.setConfig(config);
        source.resetInputs();
        solrClient = new HttpSolrServer(url);

        // Optional HTTP basic auth, applied pre-emptively to every request.
        if (null != user && null != password) {
            AbstractHttpClient httpClient = (AbstractHttpClient) solrClient.getHttpClient();
            httpClient.addRequestInterceptor(new PreEmptiveBasicAuthenticator(user, password));
        }

        DocData docData = new DocData();
        String firstName = null;
        SolrInputDocument doc = new SolrInputDocument();
        int i = 0; // number of documents actually sent to Solr
        for (int x = 0; x < limit; x++) {
            if (i > 0 && i % 1000 == 0)
                // Fixed: was "i++" inside the print, which inflated the indexed
                // count as a side effect of logging progress.
                System.out.println("Indexed " + i);

            docData = source.getNextDocData(docData);

            // The benchmark content source can loop over the file forever;
            // stop once the first article's name comes around again.
            if (firstName == null)
                firstName = docData.getName();
            else if (firstName.equals(docData.getName()))
                break; //looped

            if (addDoc(doc, docData)) {
                solrClient.add(doc);
                i++;
            }
        }
    } catch (NoMoreDataException e) {
        // Expected when the dump file is exhausted; not an error.
    } catch (Exception e) {
        e.printStackTrace();
    } finally {

        try {
            if (solrClient != null)
                solrClient.commit();

            source.close();
        } catch (Throwable t) {
            // Best-effort cleanup: report the failure instead of hiding it,
            // but never propagate out of finally.
            t.printStackTrace();
        }
    }

}

From source file:com.grantingersoll.intell.index.Indexer.java

License:Apache License

/**
 * Indexes articles from a Wikipedia XML dump into Solr in batches.
 *
 * @param wikipediaXML the dump file to read; must be non-null and exist
 * @param numDocs      maximum number of documents to index
 * @param batchSize    number of documents per {@code server.add} call
 * @return the number of documents sent to Solr (0 if the file is missing)
 * @throws Exception on Solr or content-source failures
 */
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaXML != null && wikipediaXML.exists()) {
        EnwikiContentSource contentSource = new EnwikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaXML.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        // Do not loop over the file forever; stop at end of input.
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0; // documents created so far (== documents counted toward result)
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) {
                int mod = i % batchSize;

                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", docData.getName());
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("docnum_i", String.valueOf(i));

                // Flush a full batch to Solr.
                if (mod == batchSize - 1) {
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i);
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException e) {
            // Expected when the dump is exhausted; fall through to the final flush.
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        // Send any leftover partial batch.
        if (!docs.isEmpty()) {
            server.add(docs);
            docs.clear();
        }
        // Fixed: was "i + docs.size()", which double-counted the final partial
        // batch — every document in docs had already been counted via i.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaXML);
    }
    return result;
}

From source file:com.tamingtext.qa.WikipediaIndexer.java

License:Apache License

/**
 * Indexes articles from a Wikipedia XML dump into Solr in batches.
 *
 * @param wikipediaXML the dump file to read; must be non-null and exist
 * @param numDocs      maximum number of documents to index
 * @param batchSize    number of documents per {@code server.add} call
 * @return the number of documents sent to Solr (0 if the file is missing)
 * @throws Exception on Solr or content-source failures
 */
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaXML != null && wikipediaXML.exists()) {
        EnwikiContentSource contentSource = new EnwikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaXML.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        // Do not loop over the file forever; stop at end of input.
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0; // documents created so far (== documents counted toward result)
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) {
                int mod = i % batchSize;

                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", String.valueOf(i));
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("name_s", docData.getName());

                // Flush a full batch to Solr.
                if (mod == batchSize - 1) {
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i);
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException e) {
            // Expected when the dump is exhausted; fall through to the final flush.
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        // Send any leftover partial batch.
        if (!docs.isEmpty()) {
            server.add(docs);
            docs.clear();
        }
        // Fixed: was "i + docs.size()", which double-counted the final partial
        // batch — every document in docs had already been counted via i.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaXML);
    }
    return result;
}

From source file:com.tamingtext.qa.WikipediaWexIndexer.java

License:Apache License

/**
 * Indexes articles from a Wikipedia WEX dump into Solr in batches, including
 * each article's ";;"-separated category list when present.
 *
 * @param wikipediaWEX the WEX file to read; must be non-null and a regular file
 * @param numDocs      maximum number of documents to index
 * @param batchSize    number of documents per {@code server.add} call
 * @return the number of documents sent to Solr (0 if the file is missing)
 * @throws Exception on Solr or content-source failures
 */
public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaWEX != null && wikipediaWEX.isFile()) {
        WexWikiContentSource contentSource = new WexWikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaWEX.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        // Do not loop over the file forever; stop at end of input.
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0; // documents created so far (== documents counted toward result)
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) {
                int mod = i % batchSize;

                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", String.valueOf(docData.getID()));
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("name_s", docData.getName());

                // Fixed: getProperty("category") can return null for documents
                // without categories, which previously caused an NPE on split().
                String categoryProp = docData.getProps().getProperty("category");
                if (categoryProp != null) {
                    for (String c : categoryProp.split(";;")) {
                        sDoc.addField("category", c);
                    }
                }

                // Flush a full batch to Solr.
                if (mod == batchSize - 1) {
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i);
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException e) {
            // Expected when the dump is exhausted; fall through to the final flush.
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        // Send any leftover partial batch.
        if (!docs.isEmpty()) {
            server.add(docs);
            docs.clear();
        }
        // Fixed: was "i + docs.size()", which double-counted the final partial
        // batch — every document in docs had already been counted via i.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaWEX);
    }
    return result;
}

From source file:io.anserini.index.transform.NekoStringTransform.java

License:Apache License

/**
 * Extracts plain text from the raw input by delegating to the {@code dhp}
 * parser, returning the parsed title and body joined by a newline. Any
 * failure during parsing yields the empty string, dropping the document.
 */
@Override
public String apply(String s) {
    DocData parsed = new DocData();
    try {
        parsed = dhp.parse(parsed, "", null, new StringReader(s), null);
        return parsed.getTitle() + "\n" + parsed.getBody();
    } catch (Exception e) {
        // Unparseable input is silently discarded.
        return "";
    }
}

From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java

License:Apache License

/**
 * Builds a Lucene index over the CLEF corpora described by the lucene-clef
 * properties file (overridable via {@code -Dproperties.path=...}). After
 * indexing, verifies that the number of indexed documents matches the
 * expected "&lt;language&gt;.corpus.size" property.
 */
public static void main(String[] args) {

    Properties properties = new Properties();
    try {
        InputStream raw;
        if (System.getProperty("properties.path") != null) {
            raw = new FileInputStream(System.getProperty("properties.path"));
        } else {
            logger.info("Loading default property file [resources/lucene-clef.properties]");
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            raw = loader.getResourceAsStream("lucene-clef.properties");
        }
        // try-with-resources replaces the original manual finally/close block.
        try (InputStream input = raw) {
            properties.load(input);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }

    // System properties override anything loaded from the file.
    properties.putAll(System.getProperties());

    String language = properties.getProperty("language");

    String stemmer = properties.getProperty("stemmer");

    String stopsetType = properties.getProperty("stopset.type");

    // Constant-first comparison avoids an NPE when "stopset.type" is absent.
    String stopsetPath = null;
    if ("CUSTOM".equalsIgnoreCase(stopsetType)) {
        stopsetPath = properties.getProperty("stopset.path");
    }

    String corporaRootPath = properties.getProperty("corpora.path");

    int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size"));

    String[] corpora = properties.getProperty(language + ".corpora").split(";");

    TrecContentSource trecContentSource = new TrecContentSource();

    try {

        Properties configProps = new Properties();
        configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser");
        configProps.setProperty("content.source.verbose", "false");
        configProps.setProperty("content.source.forever", "false");
        configProps.setProperty("content.source.excludeIteration", "true");
        configProps.setProperty("work.dir", new File(".").getAbsolutePath());
        configProps.setProperty("language", language);
        configProps.setProperty("stemmer", stemmer);
        configProps.setProperty("stopset_type", stopsetType);
        // Fixed: setProperty(key, null) throws NPE; only set the path when the
        // stopset type is CUSTOM and a path was actually configured.
        if (stopsetPath != null) {
            configProps.setProperty("stopset_path", stopsetPath);
        }

        // set lucene index directory
        Path indexPath = new File(properties.getProperty("index.path")).toPath();
        Directory directory = new SimpleFSDirectory(indexPath);

        // indexing configuration

        CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);

        Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset);

        IndexWriterConfig conf = new IndexWriterConfig(analyzer);
        conf.setSimilarity(new BM25Similarity());
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        boolean storePositions = true;
        FieldType bodyFieldType = new FieldType();
        if (storePositions) {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        } else {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        }

        // try-with-resources guarantees the writer is closed even if a corpus fails.
        try (IndexWriter indexWriter = new IndexWriter(directory, conf)) {

            for (String corpus : corpora) {

                int docCount = 0;

                logger.info("... indexing corpus " + corpus);

                try {

                    configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus);

                    configProps.setProperty("content.source.encoding",
                            properties.getProperty(corpus + ".encoding", "UTF-8"));

                    trecContentSource.setConfig(new Config(configProps));

                    DocData docData = new DocData();
                    while ((docData = trecContentSource.getNextDocData(docData)) != null) {
                        docCount++;
                        Document doc = getDocumentFromDocData(docData, bodyFieldType);
                        indexWriter.addDocument(doc);
                    }

                } catch (NoMoreDataException e) {
                    // Thrown when a corpus is exhausted: log the per-corpus total.
                    logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n");
                }

            }

        }

        // Sanity check: the index must contain exactly the expected document count.
        try (DirectoryReader ireader = DirectoryReader.open(directory)) {
            if (corpusSize != ireader.numDocs()) {
                throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be "
                        + corpusSize);
            }
            logger.info("Number of documents: " + ireader.numDocs());
        }

    } catch (IOException e) {
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }

}