Example usage for the org.apache.lucene.benchmark.byTask.utils.Config constructor Config(Properties)

List of usage examples for the org.apache.lucene.benchmark.byTask.utils.Config constructor Config(Properties)

Introduction

On this page you can find example usages of the org.apache.lucene.benchmark.byTask.utils.Config constructor Config(Properties).

Prototype

public Config(Properties props) 

Source Link

Document

Create config without algorithm - useful for a programmatic perf test.

Usage

From source file:com.datastax.dse.demos.solr.Wikipedia.java

License:Open Source License

/**
 * Reads documents from a Wikipedia dump through an {@link EnwikiContentSource}
 * and adds them to a Solr server, one document at a time.
 *
 * <p>The method relies on members declared outside this snippet
 * ({@code wikifile}, {@code url}, {@code user}, {@code password},
 * {@code limit}, {@code source} and {@code addDoc}); they are presumably
 * static members of the enclosing class.
 *
 * <p>Reading stops after {@code limit} iterations, when the source throws
 * {@link NoMoreDataException}, or when the first document name is seen again
 * (the source has wrapped around). Any other failure is printed and indexing
 * ends. In all cases the pending Solr changes are committed and the content
 * source is closed, each independently of the other.
 */
public static void indexWikipedia() {

    HttpSolrServer solrClient = null;
    try {
        Properties p = new Properties();
        p.setProperty("keep.image.only.docs", "false");
        p.setProperty("docs.file", wikifile);

        Config config = new Config(p);

        source = new EnwikiContentSource();
        source.setConfig(config);
        source.resetInputs();
        solrClient = new HttpSolrServer(url);

        if (null != user && null != password) {
            AbstractHttpClient httpClient = (AbstractHttpClient) solrClient.getHttpClient();
            httpClient.addRequestInterceptor(new PreEmptiveBasicAuthenticator(user, password));
        }

        DocData docData = new DocData();
        String firstName = null;
        SolrInputDocument doc = new SolrInputDocument();
        int i = 0; // number of documents actually added to Solr
        for (int x = 0; x < limit; x++) {
            docData = source.getNextDocData(docData);

            if (firstName == null)
                firstName = docData.getName();
            else if (firstName.equals(docData.getName()))
                break; //looped

            if (addDoc(doc, docData)) {
                solrClient.add(doc);
                i++;
                // Report progress right after the counter changes, so each
                // milestone is printed exactly once and the counter is never
                // modified by the reporting itself.
                if (i % 1000 == 0)
                    System.out.println("Indexed " + i);
            }
        }
    } catch (NoMoreDataException e) {
        // The source signals the end of its input by throwing; nothing to do.
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Commit and close separately: a failing commit must not prevent the
        // content source from being released.
        try {
            if (solrClient != null)
                solrClient.commit();
        } catch (Throwable t) {
            t.printStackTrace();
        }

        try {
            if (source != null)
                source.close();
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }

}

From source file:com.grantingersoll.intell.index.Indexer.java

License:Apache License

/**
 * Indexes up to {@code numDocs} documents from a Wikipedia XML dump into Solr,
 * sending them in batches of {@code batchSize}.
 *
 * <p>Uses the {@code server} and {@code log} members declared outside this
 * snippet. After all documents are sent the index is committed and optimized.
 *
 * @param wikipediaXML the dump file; when {@code null} or missing a message is
 *                     printed and nothing is indexed
 * @param numDocs      maximum number of documents to index
 * @param batchSize    number of documents sent to Solr per request
 * @return the number of documents sent to Solr, or 0 if the file was not found
 * @throws Exception if reading the dump or talking to Solr fails
 */
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaXML != null && wikipediaXML.exists()) {
        EnwikiContentSource contentSource = new EnwikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaXML.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0; // number of documents read so far
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            // Test the limit first so no extra document is parsed once
            // numDocs has been reached.
            while (i < numDocs && (docData = contentSource.getNextDocData(docData)) != null) {
                int mod = i % batchSize;

                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", docData.getName());
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("docnum_i", String.valueOf(i));

                if (mod == batchSize - 1) {
                    // i is zero-based, so i + 1 documents have been handed over
                    // once this batch is sent.
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + (i + 1));
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException e) {
            // The source signals the end of its input by throwing; fall through
            // and flush whatever is still buffered.
        } finally {
            // Release the underlying dump file even when indexing fails.
            contentSource.close();
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        if (docs.size() > 0) {
            server.add(docs);
        }
        // i already counts the documents of the final partial batch; adding
        // docs.size() again would count them twice.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaXML);
    }
    return result;
}

From source file:com.tamingtext.qa.WikipediaIndexer.java

License:Apache License

/**
 * Indexes up to {@code numDocs} documents from a Wikipedia XML dump into Solr,
 * sending them in batches of {@code batchSize}. Each document gets its
 * running number as {@code docid} and the source document name as
 * {@code name_s}.
 *
 * <p>Uses the {@code server} and {@code log} members declared outside this
 * snippet. After all documents are sent the index is committed and optimized.
 *
 * @param wikipediaXML the dump file; when {@code null} or missing a message is
 *                     printed and nothing is indexed
 * @param numDocs      maximum number of documents to index
 * @param batchSize    number of documents sent to Solr per request
 * @return the number of documents sent to Solr, or 0 if the file was not found
 * @throws Exception if reading the dump or talking to Solr fails
 */
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaXML != null && wikipediaXML.exists()) {
        EnwikiContentSource contentSource = new EnwikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaXML.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0; // number of documents read so far
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            // Test the limit first so no extra document is parsed once
            // numDocs has been reached.
            while (i < numDocs && (docData = contentSource.getNextDocData(docData)) != null) {
                int mod = i % batchSize;

                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", String.valueOf(i));
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("name_s", docData.getName());

                if (mod == batchSize - 1) {
                    // i is zero-based, so i + 1 documents have been handed over
                    // once this batch is sent.
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + (i + 1));
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException e) {
            // The source signals the end of its input by throwing; fall through
            // and flush whatever is still buffered.
        } finally {
            // Release the underlying dump file even when indexing fails.
            contentSource.close();
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        if (docs.size() > 0) {
            server.add(docs);
        }
        // i already counts the documents of the final partial batch; adding
        // docs.size() again would count them twice.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaXML);
    }
    return result;
}

From source file:com.tamingtext.qa.WikipediaWexIndexer.java

License:Apache License

/**
 * Indexes up to {@code numDocs} documents from a Wikipedia WEX file into Solr,
 * sending them in batches of {@code batchSize}. The {@code category} document
 * property is split on {@code ";;"} and each part is added as a separate
 * {@code category} field value.
 *
 * <p>Uses the {@code server} and {@code log} members declared outside this
 * snippet. After all documents are sent the index is committed and optimized.
 *
 * @param wikipediaWEX the WEX file; when {@code null} or not a regular file a
 *                     message is printed and nothing is indexed
 * @param numDocs      maximum number of documents to index
 * @param batchSize    number of documents sent to Solr per request
 * @return the number of documents sent to Solr, or 0 if the file was not found
 * @throws Exception if reading the file or talking to Solr fails
 */
public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception {
    int result = 0;
    if (wikipediaWEX != null && wikipediaWEX.isFile()) {
        WexWikiContentSource contentSource = new WexWikiContentSource();
        Properties properties = new Properties();
        String filePath = wikipediaWEX.getAbsolutePath();
        properties.setProperty("docs.file", filePath);
        properties.setProperty("doc.maker.forever", "false");
        contentSource.setConfig(new Config(properties));
        contentSource.resetInputs();
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000);
        int i = 0; // number of documents read so far
        long start = System.currentTimeMillis();
        try {
            DocData docData = new DocData();

            // Test the limit first so no extra document is parsed once
            // numDocs has been reached.
            while (i < numDocs && (docData = contentSource.getNextDocData(docData)) != null) {
                int mod = i % batchSize;

                SolrInputDocument sDoc = new SolrInputDocument();
                docs.add(sDoc);
                sDoc.addField("file", filePath + "_" + i);

                sDoc.addField("docid", String.valueOf(docData.getID()));
                sDoc.addField("body", docData.getBody());
                sDoc.addField("doctitle", docData.getTitle());
                sDoc.addField("name_s", docData.getName());

                // A document without properties or without a category must not
                // abort the whole run; it is simply indexed without categories.
                Properties docProps = docData.getProps();
                String categoryList = docProps == null ? null : docProps.getProperty("category");
                if (categoryList != null) {
                    for (String c : categoryList.split(";;")) {
                        sDoc.addField("category", c);
                    }
                }

                if (mod == batchSize - 1) {
                    // i is zero-based, so i + 1 documents have been handed over
                    // once this batch is sent.
                    log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + (i + 1));
                    server.add(docs);
                    docs.clear();
                }
                i++;
            }
        } catch (NoMoreDataException e) {
            // The source signals the end of its input by throwing; fall through
            // and flush whatever is still buffered.
        } finally {
            // Release the underlying file even when indexing fails.
            contentSource.close();
        }
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("Indexing took " + (finish - start) + " ms");
        }
        if (docs.size() > 0) {
            server.add(docs);
        }
        // i already counts the documents of the final partial batch; adding
        // docs.size() again would count them twice.
        result = i;
        server.commit();
        server.optimize();
    } else {
        System.out.println("Can't find file: " + wikipediaWEX);
    }
    return result;
}

From source file:info.boytsov.lucene.CreateIndex.java

License:Open Source License

/**
 * Command-line entry point that builds a Lucene index from a content source.
 *
 * <p>Arguments: {@code args[0]} is the index type, {@code args[1]} the index
 * source, {@code args[2]} the output directory and the optional
 * {@code args[3]} the number of documents between intermediate commits
 * (default 1000000). The meaning of type and source is defined by
 * {@code CreateSource}, which is declared outside this snippet.
 *
 * <p>The content source is created and validated before the index writer is
 * opened: the writer uses {@code OpenMode.CREATE}, so opening it first would
 * overwrite an existing index even when the run is about to abort because of
 * an unsupported source.
 *
 * @param args command-line arguments as described above
 * @throws Exception if the commit interval is not a number, or if reading the
 *                   source or writing the index fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 3 && args.length != 4) {
        printUsage();
        System.exit(1);
    }
    String indexType = args[0];
    String indexSource = args[1];
    int commitInterval = 1000000;

    if (args.length >= 4) {
        commitInterval = Integer.parseInt(args[3]);
    }
    // A zero interval would make "count % commitInterval" divide by zero on
    // the first document; a negative one is meaningless.
    if (commitInterval <= 0) {
        System.err.println("Commit interval must be a positive number, but was " + commitInterval);
        printUsage();
        System.exit(1);
    }

    System.out.println("Commiting after indexing " + commitInterval + " docs");

    File outputDir = new File(args[2]);
    if (!outputDir.exists()) {
        if (!outputDir.mkdirs()) {
            System.out.println("couldn't create " + outputDir.getAbsolutePath());
            return;
        }
    }
    if (!outputDir.isDirectory()) {
        System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
        return;
    }
    if (!outputDir.canWrite()) {
        System.out.println("Can't write to " + outputDir.getAbsolutePath());
        return;
    }

    Properties properties = new Properties();
    // Parse each document only once.
    properties.setProperty("content.source.forever", "false");
    properties.setProperty("doc.index.props", "true");
    // We want to store small-size fields like URL or even title  ...
    properties.setProperty("doc.stored", "true");
    // but not the large one (great savings, 3x reduction in space)!
    properties.setProperty("doc.body.stored", "false");

    ContentSource source = CreateSource(indexType, indexSource, properties);

    if (source == null) {
        System.err.println("Failed to create a source: " + indexType + "(" + indexSource + ")");
        printUsage();
        System.exit(1);
    }

    FSDirectory dir = FSDirectory.open(outputDir);

    // Analyzer with the default stop words.
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    // Overwrites an existing index if needed.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter indexWriter = new IndexWriter(dir, config);

    DocMaker docMaker = new DocMaker();

    Config c = new Config(properties);
    source.setConfig(c);
    // Though this does not seem needed, it is (gets the file opened?).
    source.resetInputs();
    docMaker.setConfig(c, source);
    int count = 0;
    System.out.println("Starting Indexing of " + indexType + " source " + indexSource);

    long start = System.currentTimeMillis();
    Document doc;
    try {
        while ((doc = docMaker.makeDocument()) != null) {
            indexWriter.addDocument(doc);
            ++count;
            if (count % 5000 == 0) {
                System.out.println(
                        "Indexed " + count + " documents in " + (System.currentTimeMillis() - start) + " ms");
            }
            if (count % commitInterval == 0) {
                indexWriter.commit();
                System.out.println("Committed");
            }
        }
    } catch (org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException nmd) {
        System.out.println("Caught NoMoreDataException! -- Finishing"); // All done
    }
    long finish = System.currentTimeMillis();
    System.out.println("Indexing " + count + " documents took " + (finish - start) + " ms");
    System.out.println("Total data processed: " + source.getTotalBytesCount() + " bytes");
    System.out.println("Index should be located at " + dir.getDirectory().getAbsolutePath());
    docMaker.close();
    indexWriter.commit();
    indexWriter.close();

}

From source file:io.anserini.index.IndexGov2.java

License:Apache License

/**
 * Builds a {@link TrecContentSource} that reads documents from
 * {@code dataDir} using the {@code TrecGov2Parser}, with verbose output,
 * property printing and endless iteration switched off.
 *
 * <p>An {@link IOException} raised while resetting the inputs is printed and
 * the source is returned anyway.
 *
 * @param dataDir value for the {@code docs.dir} property
 * @return the configured content source
 */
private static TrecContentSource createGov2Source(String dataDir) {
    final TrecContentSource gov2Source = new TrecContentSource();

    final Properties settings = new Properties();
    settings.setProperty("docs.dir", dataDir);
    settings.setProperty("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
    settings.setProperty("content.source.forever", "false");
    settings.setProperty("content.source.excludeIteration", "true");
    settings.setProperty("content.source.verbose", "false");
    settings.setProperty("print.props", "false");

    final Config benchmarkConfig = new Config(settings);
    gov2Source.setConfig(benchmarkConfig);

    try {
        gov2Source.resetInputs();
    } catch (IOException resetFailure) {
        resetFailure.printStackTrace();
    }

    return gov2Source;
}

From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java

License:Apache License

/**
 * Command-line entry point that indexes the CLEF corpora of one language
 * into a Lucene index and verifies the resulting document count.
 *
 * <p>Settings are loaded from the file named by the {@code properties.path}
 * system property or, when that is not set, from the classpath resource
 * {@code lucene-clef.properties}; all system properties are then merged on
 * top. Uses the {@code logger} member and {@code getDocumentFromDocData},
 * both declared outside this snippet.
 *
 * <p>Optional settings ({@code stemmer}, {@code stopset.type},
 * {@code stopset.path}) are copied into the content-source configuration
 * only when they have a value, because {@link Properties} rejects
 * {@code null} values.
 *
 * @param args ignored
 */
public static void main(String[] args) {

    Properties properties = new Properties();
    InputStream input = null;
    try {
        if (System.getProperty("properties.path") != null) {
            input = new FileInputStream(System.getProperty("properties.path"));
            properties.load(input);
        } else {
            logger.info("Loading default property file [resources/lucene-clef.properties]");
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            input = loader.getResourceAsStream("lucene-clef.properties");
            if (input == null) {
                // getResourceAsStream returns null for a missing resource; fail
                // with a clear message instead of an NPE inside Properties.load.
                throw new IllegalStateException(
                        "Default property file lucene-clef.properties was not found on the classpath");
            }
            properties.load(input);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    properties.putAll(System.getProperties());

    String language = properties.getProperty("language");

    String stemmer = properties.getProperty("stemmer");

    String stopsetType = properties.getProperty("stopset.type");

    // The stopset path is only meaningful for a custom stopset. The constant
    // is the receiver so a missing stopset.type does not throw.
    String stopsetPath = null;
    if ("CUSTOM".equalsIgnoreCase(stopsetType)) {
        stopsetPath = properties.getProperty("stopset.path");
    }

    String corporaRootPath = properties.getProperty("corpora.path");

    int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size"));

    String[] corpora = properties.getProperty(language + ".corpora").split(";");

    TrecContentSource trecContentSource = new TrecContentSource();

    try {

        Properties configProps = new Properties();
        configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser");
        configProps.setProperty("content.source.verbose", "false");
        configProps.setProperty("content.source.forever", "false");
        configProps.setProperty("content.source.excludeIteration", "true");
        configProps.setProperty("work.dir", new File(".").getAbsolutePath());
        configProps.setProperty("language", language);
        // Properties.setProperty throws NullPointerException for a null value,
        // so optional settings are copied only when present.
        if (stemmer != null) {
            configProps.setProperty("stemmer", stemmer);
        }
        if (stopsetType != null) {
            configProps.setProperty("stopset_type", stopsetType);
        }
        if (stopsetPath != null) {
            configProps.setProperty("stopset_path", stopsetPath);
        }

        // set lucene index directory
        Path indexPath = new File(properties.getProperty("index.path")).toPath();
        Directory directory = new SimpleFSDirectory(indexPath);

        // indexing configuration

        CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);

        Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset);

        IndexWriterConfig conf = new IndexWriterConfig(analyzer);
        conf.setSimilarity(new BM25Similarity());
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        IndexWriter indexWriter = new IndexWriter(directory, conf);
        boolean storePositions = true;
        FieldType bodyFieldType = new FieldType();
        if (storePositions) {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        } else {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        }

        for (String corpus : corpora) {

            int docCount = 0;

            logger.info("... indexing corpus " + corpus);

            try {

                configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus);

                configProps.setProperty("content.source.encoding",
                        properties.getProperty(corpus + ".encoding", "UTF-8"));

                trecContentSource.setConfig(new Config(configProps));

                DocData docData = new DocData();
                while ((docData = trecContentSource.getNextDocData(docData)) != null) {
                    docCount++;
                    Document doc = getDocumentFromDocData(docData, bodyFieldType);
                    indexWriter.addDocument(doc);
                }

            } catch (NoMoreDataException e) {
                // The source signals the end of a corpus by throwing.
                logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n");
            }

        }

        indexWriter.close();

        // Re-open the finished index to verify the expected corpus size; the
        // reader is closed even when the check fails.
        DirectoryReader ireader = DirectoryReader.open(directory);
        try {
            if (corpusSize != ireader.numDocs()) {
                throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be "
                        + corpusSize);
            }
            logger.info("Number of documents: " + ireader.numDocs());
        } finally {
            ireader.close();
        }

    } catch (Exception e) {
        // Covers IOException as well; both were handled identically.
        e.printStackTrace();
    }

}

From source file:luceneingester.TrecIngester.java

License:Apache License

/**
 * Creates a {@link TrecContentSource} over the documents in {@code dataDir},
 * parsed with the {@code TrecGov2Parser}.
 *
 * <p>If resetting the inputs fails with an {@link IOException}, the stack
 * trace is printed and the source is still returned.
 *
 * @param dataDir value for the {@code docs.dir} property
 * @return the configured content source
 */
private static TrecContentSource createTrecSource(String dataDir) {
    TrecContentSource trecSource = new TrecContentSource();

    // Benchmark settings as key/value pairs, applied in order below.
    String[][] settings = {
            { "print.props", "false" },
            { "content.source.verbose", "false" },
            { "content.source.excludeIteration", "true" },
            { "docs.dir", dataDir },
            { "trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser" },
            { "content.source.forever", "false" },
    };
    Properties sourceProps = new Properties();
    for (String[] setting : settings) {
        sourceProps.setProperty(setting[0], setting[1]);
    }

    trecSource.setConfig(new Config(sourceProps));
    try {
        trecSource.resetInputs();
    } catch (IOException ioFailure) {
        ioFailure.printStackTrace();
    }
    return trecSource;
}

From source file:source.ContentSourceSource.java

License:Apache License

/**
 * Creates and configures the Lucene benchmark content source that matches
 * {@code indexType}, then resets its inputs.
 *
 * <p>The type is compared case-insensitively against the
 * {@code SOURCE_TYPE_*} constants declared outside this snippet (presumably
 * upper-case names). Upper-casing uses {@code Locale.ROOT} so the match does
 * not depend on the default locale; in a Turkish locale, for example,
 * {@code "wikipedia".toUpperCase()} yields a dotted capital I and would not
 * equal an ASCII constant.
 *
 * @param indexType   the kind of collection: Wikipedia, GOV2 or ClueWeb
 * @param indexSource the dump file for Wikipedia, otherwise the documents
 *                    directory
 * @throws Exception if the Wikipedia file is missing or unreadable, if the
 *                   index type is not supported, or if resetting the source
 *                   fails
 */
public ContentSourceSource(String indexType, String indexSource) throws Exception {
    String typeUC = indexType.toUpperCase(java.util.Locale.ROOT);
    mProperties = new Properties();

    // prevent an infinite parsing loop, which is a strange default here
    mProperties.setProperty("content.source.forever", "false");

    if (typeUC.equals(SOURCE_TYPE_WIKIPEDIA)) {
        File wikipediafile = new File(indexSource);
        if (!wikipediafile.exists()) {
            throw new Exception("Can't find " + wikipediafile.getAbsolutePath());
        }
        if (!wikipediafile.canRead()) {
            throw new Exception("Can't read " + wikipediafile.getAbsolutePath());
        }

        mProperties.setProperty("docs.file", wikipediafile.getAbsolutePath());
        mProperties.setProperty("keep.image.only.docs", "false");

        mSource = new EnwikiContentSource();
    } else if (typeUC.equals(SOURCE_TYPE_GOV2)) {
        String parserTREC = "parsers.TrecGov2Parser";

        mProperties.setProperty("html.parser", "parsers.DemoHTMLParser");
        mProperties.setProperty("trec.doc.parser", parserTREC);
        mProperties.setProperty("docs.dir", indexSource);
        mProperties.setProperty("work.dir", "/tmp");

        mSource = new TrecContentSource();
    } else if (typeUC.equals(SOURCE_TYPE_CLUEWEB)) {
        // NOTE(review): an earlier comment said parsers.DemoHTMLParser fails on
        // this collection and a LeoHTMLParser alternative was commented out,
        // yet DemoHTMLParser is what is configured here - confirm which parser
        // is intended.
        mProperties.setProperty("html.parser", "parsers.DemoHTMLParser");
        mProperties.setProperty("docs.dir", indexSource);
        mProperties.setProperty("work.dir", "/tmp");

        mSource = new ClueWebContentSource();
    } else {
        throw new Exception("Unsupported index type: " + indexType);
    }

    mConfig = new Config(mProperties);
    mSource.setConfig(mConfig);
    mSource.resetInputs(); // not clear if this is 100% needed, but let's keep it
}