Example usage for org.apache.lucene.index IndexWriterConfig setOpenMode

Introduction

On this page you can find example usages of org.apache.lucene.index IndexWriterConfig setOpenMode.

Prototype

public IndexWriterConfig setOpenMode(OpenMode openMode)

Source Link

Document

Specifies OpenMode of the index.

Usage

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

/**
 * Copies the first leaf of the index stored in {@code tempDir1} into a newly created index in
 * {@code tempDir2} (wrapped in a {@code MyFilterCodecReader}), then re-opens the copy and checks
 * its document count, its segment count and the document frequency of several "text" terms.
 */
@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory sourceDir = FSDirectory.open(tempDir1);
    IndexReader indexReader = DirectoryReader.open(sourceDir);

    Directory cloneDir = FSDirectory.open(tempDir2);
    // CREATE: any index already present in the target directory is replaced.
    IndexWriterConfig writerConfig = new IndexWriterConfig(new EnglishAnalyzer())
            .setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter cloneWriter = new IndexWriter(cloneDir, writerConfig);

    LeafReader firstLeaf = indexReader.leaves().get(0).reader();
    CodecReader wrappedLeaf = SlowCodecReaderWrapper.wrap(firstLeaf);
    cloneWriter.addIndexes(new MyFilterCodecReader(wrappedLeaf));
    cloneWriter.commit();
    cloneWriter.forceMerge(1);
    cloneWriter.close();

    indexReader.close();

    // Re-point the reader at the clone and verify its contents.
    indexReader = DirectoryReader.open(cloneDir);
    assertEquals(3, indexReader.numDocs());
    assertEquals(1, indexReader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(indexReader);

    // Expected document frequency for each term of the "text" field, checked in this order.
    String[] terms = { "here", "more", "some", "test", "text" };
    int[] expectedDocFreqs = { 2, 2, 1, 1, 2 };
    for (int i = 0; i < terms.length; i++) {
        assertEquals(expectedDocFreqs[i], indexReader.docFreq(new Term("text", terms[i])));
    }

    indexReader.close();
}

From source file:io.bfscan.clueweb12.BuildWarcTrecIdMapping.java

License:Apache License

/**
 * Command-line entry point: creates a Lucene index at the location given by {@code INDEX_OPTION}
 * from the bzip2-compressed file given by {@code INPUT_OPTION}.
 *
 * <p>Every line of the decompressed input is handed to an {@code AddDocumentRunnable} executed on
 * a fixed-size thread pool; all workers share a single {@link IndexWriter} opened in
 * {@code CREATE} mode. {@code MAX_OPTION} caps the number of lines submitted,
 * {@code THREADS_OPTION} sets the pool size (default {@code DEFAULT_NUM_THREADS}) and
 * {@code OPTIMIZE_OPTION} force-merges the finished index into one segment.
 *
 * <p>Failures while reading or indexing are printed and do not propagate; the writer and the
 * directory are always closed, and a failure to close them does propagate.
 *
 * @param args command-line arguments
 * @throws Exception if the options are malformed numbers, or the index cannot be opened or closed
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file")
            .create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("maximum number of documents to index").create(MAX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads")
            .create(THREADS_OPTION));

    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX_OPTION);
    int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION))
            : Integer.MAX_VALUE;
    int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION))
            : DEFAULT_NUM_THREADS;

    long startTime = System.currentTimeMillis();

    String path = cmdline.getOptionValue(INPUT_OPTION);

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    LOG.info("Creating index at " + indexPath);
    LOG.info("Indexing with " + threads + " threads");

    FileInputStream fis = null;
    BufferedReader br = null;
    ExecutorService executor = null;

    try {
        fis = new FileInputStream(new File(path));
        byte[] ignoreBytes = new byte[2];
        fis.read(ignoreBytes); // "B", "Z" bytes from commandline tools
        br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8"));

        executor = Executors.newFixedThreadPool(threads);
        int cnt = 0;
        String s;
        while ((s = br.readLine()) != null) {
            Runnable worker = new AddDocumentRunnable(writer, s);
            executor.execute(worker);

            cnt++;
            if (cnt % 1000000 == 0) {
                LOG.info(cnt + " articles added");
            }
            if (cnt >= maxdocs) {
                break;
            }
        }

        executor.shutdown();
        // Block until every queued task has run instead of spinning on isTerminated().
        executor.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);

        LOG.info("Total of " + cnt + " articles indexed.");

        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // After a normal run the pool is already terminated and this is a no-op; after a failure
        // it stops the workers so their non-daemon threads cannot keep the JVM alive.
        if (executor != null) {
            executor.shutdownNow();
        }
        try {
            // Closing the reader also closes the streams it wraps; fall back to the raw stream
            // when the reader was never created. Either may be null if opening the input failed.
            if (br != null) {
                br.close();
            } else if (fis != null) {
                fis.close();
            }
        } catch (java.io.IOException e) {
            e.printStackTrace();
        } finally {
            // Nested so that the directory is closed even when closing the writer fails.
            try {
                writer.close();
            } finally {
                dir.close();
            }
        }
    }
}

From source file:io.datalayer.lucene.index.IndexerMain.java

License:Apache License

/**
 * Index all text files under a directory.
 *
 * <p>Recognized arguments: {@code -index INDEX_PATH} (defaults to {@code "index"}),
 * {@code -docs DOCS_PATH} (required) and {@code -update} (open the index with
 * {@code CREATE_OR_APPEND} instead of {@code CREATE}). The process exits with status 1 when an
 * option is missing its value, when no document path is given, or when the document directory
 * does not exist or is not readable.
 *
 * @param args command-line arguments
 */
public static void main(String... args) {

    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
            + " in INDEX_PATH that can be searched with SearchFiles";

    String indexPath = "index";
    String docsPath = null;
    boolean create = true;

    for (int i = 0; i < args.length; i++) {
        boolean isIndexOption = "-index".equals(args[i]);
        if (isIndexOption || "-docs".equals(args[i])) {
            // Both options take a value; a trailing option without one is a usage error
            // rather than an ArrayIndexOutOfBoundsException.
            if (i + 1 >= args.length) {
                System.err.println("Usage: " + usage);
                System.exit(1);
            }
            if (isIndexOption) {
                indexPath = args[i + 1];
            } else {
                docsPath = args[i + 1];
            }
            i++;
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        LOGGER.info("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    long startMillis = System.currentTimeMillis();

    try {

        LOGGER.info("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(new File(indexPath));
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_46, analyzer);

            if (create) {
                // Create a new index in the directory, removing any previously
                // indexed documents:
                iwc.setOpenMode(OpenMode.CREATE);
            } else {
                // Add new documents to an existing index:
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            }

            // Optional: for better indexing performance, if you
            // are indexing many documents, increase the RAM
            // buffer. But if you do this, increase the max heap
            // size to the JVM (eg add -Xmx512m or -Xmx1g):
            //
            // iwc.setRAMBufferSizeMB(256.0);

            IndexWriter writer = new IndexWriter(dir, iwc);
            boolean closedCleanly = false;
            try {
                indexDocs(writer, docDir);

                // NOTE: if you want to maximize search performance,
                // you can optionally call forceMerge here. This can be
                // a terribly costly operation, so generally it's only
                // worth it when your index is relatively static (ie
                // you're done adding documents to it):
                //
                // writer.forceMerge(1);

                writer.close();
                closedCleanly = true;
            } finally {
                if (!closedCleanly) {
                    // Indexing or closing failed: close the writer without committing the
                    // partial run so its resources and write lock are released.
                    writer.rollback();
                }
            }
        } finally {
            dir.close();
        }

        LOGGER.info((System.currentTimeMillis() - startMillis) + " total milliseconds");

    } catch (IOException e) {
        LOGGER.info(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:io.druid.extension.lucene.LuceneDruidSegment.java

License:Apache License

/**
 * Creates an {@link IndexWriter} on the given in-memory directory.
 *
 * <p>The writer opens the index in {@code CREATE_OR_APPEND} mode, writes non-compound segments,
 * commits on close, and is configured with {@code NoDeletionPolicy}, {@code NoMergePolicy} and
 * {@code NoMergeScheduler}.
 *
 * @param dir               directory the writer operates on
 * @param analyzer          analyzer handed to the writer configuration
 * @param maxDocsPerSegment the buffered-document limit is set to twice this value
 * @return a new writer over {@code dir}
 * @throws IOException if the writer cannot be opened
 */
private static IndexWriter buildRamWriter(RAMDirectory dir, Analyzer analyzer, int maxDocsPerSegment)
        throws IOException {
    IndexWriterConfig ramConfig = new IndexWriterConfig(analyzer)
            .setOpenMode(OpenMode.CREATE_OR_APPEND)
            // buffer limits: some arbitrary large numbers
            .setMaxBufferedDocs(maxDocsPerSegment * 2)
            .setRAMBufferSizeMB(5000)
            .setUseCompoundFile(false)
            .setCommitOnClose(true)
            .setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE)
            .setMergePolicy(NoMergePolicy.INSTANCE)
            .setMergeScheduler(NoMergeScheduler.INSTANCE);
    return new IndexWriter(dir, ramConfig);
}

From source file:io.druid.extension.lucene.LuceneDruidSegment.java

License:Apache License

/**
 * Creates an {@link IndexWriter} on the given directory for persisting.
 *
 * <p>The configuration is built with a {@code null} analyzer, opens the index in
 * {@code CREATE_OR_APPEND} mode, writes non-compound segments and uses {@code NoMergePolicy}
 * and {@code NoMergeScheduler}.
 *
 * @param dir directory the writer operates on
 * @return a new writer over {@code dir}
 * @throws IOException if the writer cannot be opened
 */
private static IndexWriter buildPersistWriter(Directory dir) throws IOException {
    IndexWriterConfig persistConfig = new IndexWriterConfig(null)
            .setUseCompoundFile(false)
            .setOpenMode(OpenMode.CREATE_OR_APPEND)
            .setMergePolicy(NoMergePolicy.INSTANCE)
            .setMergeScheduler(NoMergeScheduler.INSTANCE);
    return new IndexWriter(dir, persistConfig);
}

From source file:io.github.infolis.algorithm.Indexer.java

License:Apache License

/**
 * Indexes every input file of the current execution into a Lucene index.
 *
 * <p>The index is written to the execution's configured index directory or, when none is set,
 * to a new temporary directory that is scheduled for deletion on JVM exit. The chosen path is
 * stored as the execution's output directory. If any input file cannot be retrieved, the
 * execution is marked {@code FAILED}, persisted, and nothing is indexed. Otherwise all files are
 * added to a freshly created index ({@code OpenMode.CREATE}), the index is merged into a single
 * segment and the execution is marked {@code FINISHED}.
 *
 * @throws IOException      if the index directory or writer cannot be opened, written or closed
 * @throws RuntimeException if adding a document fails with a {@link FileNotFoundException}
 */
@Override
public void execute() throws IOException {
    File indexDir;
    if (null != getExecution().getIndexDirectory() && !getExecution().getIndexDirectory().isEmpty()) {
        indexDir = new File(getExecution().getIndexDirectory());
    } else {
        indexDir = new File(
                Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), INDEX_DIR_PREFIX)
                        .toString());
        FileUtils.forceDeleteOnExit(indexDir);
    }
    log.debug("Indexing to: " + indexDir.getAbsolutePath());
    getExecution().setOutputDirectory(indexDir.getAbsolutePath());

    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35, createAnalyzer());
    indexWriterConfig.setOpenMode(OpenMode.CREATE);

    // Resolve all inputs before opening the index so the early return below holds no open
    // directory handle.
    List<InfolisFile> files = new ArrayList<>();
    for (String fileUri : getExecution().getInputFiles()) {
        try {
            files.add(this.getInputDataStoreClient().get(InfolisFile.class, fileUri));
        } catch (Exception e) {
            error(log, "Could not retrieve file " + fileUri + ": " + e.getMessage());
            getExecution().setStatus(ExecutionStatus.FAILED);
            persistExecution();
            return;
        }
    }

    long startMillis = System.currentTimeMillis();
    log.debug("Starting to index");
    // Writer is closed first, then the directory, whether or not indexing succeeds.
    try (FSDirectory fsIndexDir = FSDirectory.open(indexDir);
            IndexWriter writer = new IndexWriter(fsIndexDir, indexWriterConfig)) {
        try {
            int counter = 0;
            for (InfolisFile file : files) {
                counter++;
                log.trace("Indexing file " + file);
                writer.addDocument(toLuceneDocument(getInputFileResolver(), file));
                updateProgress(counter, files.size());
            }
        } catch (FileNotFoundException fnfe) {
            // NOTE: at least on windows, some temporary files raise this
            // exception with an "access denied" message checking if the
            // file can be read doesn't help
            throw new RuntimeException("Could not write index entry: " + fnfe, fnfe);
        }
        // Merge only after every document was added; merging on the failure path could mask
        // the original error and prevent the writer from being closed.
        log.debug("Merging all Lucene segments ...");
        writer.forceMerge(1);
    }
    getExecution().setStatus(ExecutionStatus.FINISHED);
    log.debug(String.format("Indexing %s documents took %s ms", files.size(),
            System.currentTimeMillis() - startMillis));
}

From source file:io.jpress.searcher.LuceneSearcher.java

License:LGPL

/**
 * Creates an {@link IndexWriter} over the index stored at {@code mIndexFilePath}.
 *
 * <p>Text is analysed with a {@code JcsegAnalyzer5X} in {@code COMPLEX_MODE} whose task
 * configuration has CJK synonym and CJK pinyin appending switched on. The index is opened in
 * {@code CREATE_OR_APPEND} mode with a buffered-document limit of 1000.
 *
 * @return a new writer; the caller is responsible for closing it
 * @throws NullPointerException if {@code mIndexFilePath} has not been set
 * @throws IOException          if the directory or the writer cannot be opened
 */
public IndexWriter createIndexWriter() throws IOException {
    if (mIndexFilePath == null) {
        throw new NullPointerException("please invoke init() method first!");
    }

    JcsegAnalyzer5X jcsegAnalyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);

    // Optional tuning of the segmenter through its task configuration.
    // NOTE(review): the original comments here were mis-encoded; they appear to say that these
    // switches only take effect when jcseg.loadsyn=1 / jcseg.loadpinyin=1 are set in
    // jcseg.properties - confirm against the Jcseg documentation.
    JcsegTaskConfig taskConfig = jcsegAnalyzer.getTaskConfig();
    taskConfig.setAppendCJKSyn(true);
    taskConfig.setAppendCJKPinyin(true);

    Directory indexDirectory = FSDirectory.open(Paths.get(mIndexFilePath));
    IndexWriterConfig writerConfig = new IndexWriterConfig(jcsegAnalyzer);
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writerConfig.setMaxBufferedDocs(1000);
    return new IndexWriter(indexDirectory, writerConfig);
}

From source file:io.puntanegra.fhir.index.lucene.LuceneService.java

License:Apache License

/**
 * Builds a new {@link FSIndex}./* w ww .ja v a2  s  .co m*/
 *
 * @param name
 *            the index name
 * @param mbeanName
 *            the JMX MBean object name
 * @param path
 *            the directory path
 * @param analyzer
 *            the index writer analyzer
 * @param refresh
 *            the index reader refresh frequency in seconds
 * @param ramBufferMB
 *            the index writer RAM buffer size in MB
 * @param maxMergeMB
 *            the directory max merge size in MB
 * @param maxCachedMB
 *            the directory max cache size in MB
 * @param refreshTask
 *            action to be done during refresh
 */
public void init(String name, String mbeanName, Path path, Analyzer analyzer, double refresh, int ramBufferMB,
        int maxMergeMB, int maxCachedMB, Runnable refreshTask) {
    try {

        this.path = path;
        this.name = name;

        // Open or create directory
        FSDirectory fsDirectory = FSDirectory.open(path);
        this.directory = new NRTCachingDirectory(fsDirectory, maxMergeMB, maxCachedMB);

        // Setup index writer
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        indexWriterConfig.setRAMBufferSizeMB(ramBufferMB);
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        indexWriterConfig.setUseCompoundFile(true);
        indexWriterConfig.setMergePolicy(new TieredMergePolicy());
        this.indexWriter = new IndexWriter(this.directory, indexWriterConfig);

        // Setup NRT search
        SearcherFactory searcherFactory = new SearcherFactory() {
            @Override
            public IndexSearcher newSearcher(IndexReader reader, IndexReader previousReader) {
                if (refreshTask != null) {
                    refreshTask.run();
                }
                IndexSearcher searcher = new IndexSearcher(reader);
                searcher.setSimilarity(new NoIDFSimilarity());
                return searcher;
            }
        };
        TrackingIndexWriter trackingWriter = new TrackingIndexWriter(this.indexWriter);
        this.searcherManager = new SearcherManager(this.indexWriter, true, searcherFactory);
        this.searcherReopener = new ControlledRealTimeReopenThread<>(trackingWriter, this.searcherManager,
                refresh, refresh);
        this.searcherReopener.start();

        // Register JMX MBean
        // mbean = new ObjectName(mbeanName);
        // ManagementFactory.getPlatformMBeanServer().registerMBean(service,
        // this.mbean);

    } catch (Exception e) {
        throw new FhirIndexException(e, "Error while creating index %s", name);
    }
}

From source file:is.hi.bok.deduplicator.DigestIndexer.java

License:Open Source License

/**
 * Each instance of this class wraps one Lucene index for writing deduplication information to it.
 *
 * @param indexLocation The location of the index (path).
 * @param indexingMode Index {@link #MODE_URL}, {@link #MODE_HASH} or {@link #MODE_BOTH}.
 * @param includeNormalizedURL Should a normalized version of the URL be added to the index. See
 * {@link #stripURL(String)}./*from w w  w  .ja  va2s  . c o  m*/
 * @param includeTimestamp Should a timestamp be included in the index.
 * @param includeEtag Should an Etag be included in the index.
 * @param addToExistingIndex Are we opening up an existing index. Setting this to false will cause any index at
 * <code>indexLocation</code> to be overwritten.
 * @throws IOException If an error occurs opening the index.
 */
public DigestIndexer(String indexLocation, String indexingMode, boolean includeNormalizedURL,
        boolean includeTimestamp, boolean includeEtag, boolean addToExistingIndex) throws IOException {

    this.etag = includeEtag;
    this.equivalent = includeNormalizedURL;
    this.timestamp = includeTimestamp;

    if (indexingMode.equals(MODE_URL)) {
        indexDigest = false;
    } else if (indexingMode.equals(MODE_HASH)) {
        indexURL = false;
    }

    // Set up the index writer
    IndexWriterConfig config = new IndexWriterConfig(Constants.LUCENE_VERSION,
            new WhitespaceAnalyzer(Constants.LUCENE_VERSION));
    // TODO Possibly change the default MergePolicy, see NAS-2119
    if (!addToExistingIndex) {
        config.setOpenMode(OpenMode.CREATE);
    } else {
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    }
    luceneDirectory = FSDirectory.open(new File(indexLocation));
    index = new IndexWriter(luceneDirectory, config);
}

From source file:is.landsbokasafn.deduplicator.indexer.IndexBuilder.java

License:Apache License

/**
 * Each instance of this class wraps one Lucene index for writing
 * deduplication information to it.
 *
 * @param indexLocation The location of the index (path).
 * @param indexURL Index the URL field in the index.
 * @param includeCanonicalizedURL Should a normalized version of the URL be
 *                             added to the index.
 *                             See {@link #stripURL(String)}.
 * @param includeEtag Should an Etag be included in the index.
 * @param addToExistingIndex Are we opening up an existing index. Setting
 *                           this to false will cause any index at
 *                           <code>indexLocation</code> to be overwritten.
 * @throws IOException If an error occurs opening the index.
 */
public IndexBuilder(String indexLocation, boolean indexURL, boolean includeCanonicalizedURL,
        boolean includeEtag, boolean addToExistingIndex) throws IOException {

    this.includeCanonicalizedURL = includeCanonicalizedURL;
    this.includeEtag = includeEtag;
    this.indexURL = indexURL;

    // Append to an existing index only when asked to; otherwise start from scratch.
    IndexWriterConfig.OpenMode openMode = addToExistingIndex
            ? IndexWriterConfig.OpenMode.CREATE_OR_APPEND
            : IndexWriterConfig.OpenMode.CREATE;
    IndexWriterConfig writerConfig = new IndexWriterConfig(LUCENE_VER, new WhitespaceAnalyzer(LUCENE_VER));
    writerConfig.setOpenMode(openMode);

    // Set up the index writer
    index = new IndexWriter(FSDirectory.open(new File(indexLocation)), writerConfig);

}