Example usage for org.apache.lucene.index IndexWriter forceMerge

List of usage examples for org.apache.lucene.index IndexWriter forceMerge

Introduction

On this page you can find example usages for org.apache.lucene.index IndexWriter forceMerge.

Prototype

public void forceMerge(int maxNumSegments) throws IOException 

Source Link

Document

Forces the merge policy to merge segments until there are <= maxNumSegments remaining.
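Before the full examples below, here is a minimal, self-contained sketch of the call in isolation, assuming the Lucene 5.x+ API that most examples on this page use; the index path, analyzer, and field name are placeholders chosen for illustration.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeExample {
    public static void main(String[] args) throws Exception {
        // "example-index" is a placeholder path for this sketch.
        Directory dir = FSDirectory.open(Paths.get("example-index"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());

        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            doc.add(new TextField("text", "hello world", Field.Store.YES));
            writer.addDocument(doc);
            writer.commit();

            // Merge the index down to at most one segment. This call blocks
            // and can be expensive on large indexes, which is why the examples
            // below typically guard it with an "optimize" flag.
            writer.forceMerge(1);
        }
        dir.close();
    }
}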

Usage

From source file:io.anserini.index.IndexCollection.java

License:Apache License

public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;

    final Directory dir = FSDirectory.open(indexPath);
    final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
            : new EnglishAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final List<Path> segmentPaths = collection.getFileSegmentPaths();

    final int segmentCnt = segmentPaths.size();
    LOG.info(segmentCnt + " files found in " + collectionPath.toString());
    for (int i = 0; i < segmentCnt; i++) {
        executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
    }

    executor.shutdown();

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            LOG.info(String.format("%.2f percent completed",
                    (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount = "
                + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (args.optimize)
            writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}

From source file:io.anserini.index.IndexGov2.java

License:Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {

    IndexArgs indexArgs = new IndexArgs();

    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: IndexGov2" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    final String dirPath = indexArgs.index;
    final String dataDir = indexArgs.input;
    final int docCountLimit = indexArgs.doclimit;
    final int numThreads = indexArgs.threads;

    final boolean positions = indexArgs.positions;
    final boolean optimize = indexArgs.optimize;

    final Analyzer a = new EnglishAnalyzer();
    final TrecContentSource trecSource = createGov2Source(dataDir);
    final Directory dir = FSDirectory.open(Paths.get(dirPath));

    LOG.info("Index path: " + dirPath);
    LOG.info("Doc limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    LOG.info("Threads: " + numThreads);
    LOG.info("Positions: " + positions);
    LOG.info("Optimize (merge segments): " + optimize);

    final IndexWriterConfig config = new IndexWriterConfig(a);

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    final IndexWriter writer = new IndexWriter(dir, config);
    Gov2IndexThreads threads = new Gov2IndexThreads(writer, positions, trecSource, numThreads, docCountLimit);
    LOG.info("Indexer: start");

    final long t0 = System.currentTimeMillis();

    threads.start();

    while (!threads.done()) {
        Thread.sleep(100);
    }
    threads.stop();

    final long t1 = System.currentTimeMillis();
    LOG.info("Indexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + writer.maxDoc() + " docs");
    if (docCountLimit != -1 && writer.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + writer.maxDoc() + " but expected " + docCountLimit);
    }
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }

    final long t2;
    t2 = System.currentTimeMillis();

    final Map<String, String> commitData = new HashMap<String, String>();
    commitData.put("userData", "multi");
    writer.setCommitData(commitData);
    writer.commit();
    final long t3 = System.currentTimeMillis();
    LOG.info("Indexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)");

    if (optimize) {
        LOG.info("Indexer: merging all segments");
        writer.forceMerge(1);
        final long t4 = System.currentTimeMillis();
        LOG.info("Indexer: segments merged (took " + (t4 - t3) / 1000.0 + " sec)");
    }

    // TODO: Doesn't compile for 5.3.1 - do we actually need this?
    //LOG.info("Indexer: at close: " + writer.segString());
    final long tCloseStart = System.currentTimeMillis();
    writer.close();
    LOG.info("Indexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec");
    dir.close();
    final long tFinal = System.currentTimeMillis();
    LOG.info("Indexer: finished (" + (tFinal - t0) / 1000.0 + " sec)");
    LOG.info("Indexer: net bytes indexed " + threads.getBytesIndexed());
    LOG.info("Indexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.))
            + " GB/hour plain text");
}

From source file:io.anserini.index.IndexTweets.java

License:Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory")
            .create(COLLECTION_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION)
            || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexTweets.class.getName(), options);
        System.exit(-1);
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);
    }

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    LongOpenHashSet deletes = null;
    if (cmdline.hasOption(DELETES_OPTION)) {
        deletes = new LongOpenHashSet();
        File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION));
        if (!deletesFile.exists()) {
            System.err.println("Error: " + deletesFile + " does not exist!");
            System.exit(-1);
        }
        LOG.info("Reading deletes from " + deletesFile);

        FileInputStream fin = new FileInputStream(deletesFile);
        byte[] ignoreBytes = new byte[2];
        fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools
        BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin)));

        String s;
        while ((s = br.readLine()) != null) {
            if (s.contains("\t")) {
                deletes.add(Long.parseLong(s.split("\t")[0]));
            } else {
                deletes.add(Long.parseLong(s));
            }
        }
        br.close();
        fin.close();
        LOG.info("Read " + deletes.size() + " tweetids from deletes file.");
    }

    long maxId = Long.MAX_VALUE;
    if (cmdline.hasOption(MAX_ID_OPTION)) {
        maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION));
        LOG.info("max id: " + maxId);
    }

    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    StatusStream stream = new JsonStatusCorpusReader(file);

    Directory dir = FSDirectory.open(Paths.get(indexPath));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {
            if (status.getText() == null) {
                continue;
            }

            // Skip deleted tweetids.
            if (deletes != null && deletes.contains(status.getId())) {
                continue;
            }

            if (status.getId() > maxId) {
                continue;
            }

            cnt++;
            Document doc = new Document();
            doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
            doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
            doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

            doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

            doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount(), Store.YES));
            doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount(), Store.YES));
            doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));

            long inReplyToStatusId = status.getInReplyToStatusId();
            if (inReplyToStatusId > 0) {
                doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId,
                        Field.Store.YES));
                doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(),
                        Field.Store.YES));
            }

            String lang = status.getLang();
            if (!lang.equals("unknown")) {
                doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
            }

            long retweetStatusId = status.getRetweetedStatusId();
            if (retweetStatusId > 0) {
                doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES));
                doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(),
                        Field.Store.YES));
                doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
                if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                    LOG.warn("Error parsing retweet fields of " + status.getId());
                }
            }

            writer.addDocument(doc);
            if (cnt % 100000 == 0) {
                LOG.info(cnt + " statuses indexed");
            }
        }

        LOG.info(String.format("Total of %s statuses added", cnt));

        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}

From source file:io.anserini.index.IndexWebCollection.java

License:Apache License

public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(512);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz";
    final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix);

    // If a file limit is set, drop files from the front until only doclimit remain.
    if (doclimit > 0)
        while (warcFiles.size() > doclimit)
            warcFiles.removeFirst();

    long totalWarcFiles = warcFiles.size();
    LOG.info(totalWarcFiles + " " + suffix + " files found under the docs path: " + docDir.toString());

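    // Submit an initial batch of up to 2000 files; the monitoring loop below
    // feeds the rest of the queue as tasks complete.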
    for (int i = 0; i < 2000; i++) {
        if (!warcFiles.isEmpty())
            executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
        else {
            if (!executor.isShutdown()) {
                Thread.sleep(30000);
                executor.shutdown();
            }
            break;
        }
    }

    long first = 0;
    // add some delay to let the scheduler spawn some threads
    Thread.sleep(30000);

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {

            final long completedTaskCount = executor.getCompletedTaskCount();

            LOG.info(String.format("%.2f percent completed",
                    (double) completedTaskCount / totalWarcFiles * 100.0d));

            if (!warcFiles.isEmpty())
                for (long i = first; i < completedTaskCount; i++) {
                    if (!warcFiles.isEmpty())
                        executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
                    else {
                        if (!executor.isShutdown())
                            executor.shutdown();
                    }
                }

            first = completedTaskCount;
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (totalWarcFiles != executor.getCompletedTaskCount())
        throw new RuntimeException("totalWarcFiles = " + totalWarcFiles
                + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

private void buildTestIndex() throws IOException {
    Directory dir = FSDirectory.open(tempDir1);

    Analyzer analyzer = new EnglishAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);

    FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);
    textOptions.setStoreTermVectorPositions(true);

    Document doc1 = new Document();
    doc1.add(new StringField("docid", "doc1", Field.Store.YES));
    doc1.add(new Field("text", "here is some text here is some more text", textOptions));
    writer.addDocument(doc1);

    Document doc2 = new Document();
    doc2.add(new StringField("docid", "doc2", Field.Store.YES));
    doc2.add(new Field("text", "more text", textOptions));
    writer.addDocument(doc2);

    Document doc3 = new Document();
    doc3.add(new StringField("docid", "doc3", Field.Store.YES));
    doc3.add(new Field("text", "here is a test", textOptions));
    writer.addDocument(doc3);

    writer.commit();
    writer.forceMerge(1);
    writer.close();
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}

From source file:io.bfscan.clueweb12.BuildWarcTrecIdMapping.java

License:Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file")
            .create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("maximum number of documents to index").create(MAX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads")
            .create(THREADS_OPTION));

    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(BuildWarcTrecIdMapping.class.getCanonicalName(), options);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX_OPTION);
    int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION))
            : Integer.MAX_VALUE;
    int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION))
            : DEFAULT_NUM_THREADS;

    long startTime = System.currentTimeMillis();

    String path = cmdline.getOptionValue(INPUT_OPTION);
    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    LOG.info("Creating index at " + indexPath);
    LOG.info("Indexing with " + threads + " threads");

    FileInputStream fis = null;
    BufferedReader br = null;

    try {
        fis = new FileInputStream(new File(path));
        byte[] ignoreBytes = new byte[2];
        fis.read(ignoreBytes); // "B", "Z" bytes from commandline tools
        br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fis), "UTF8"));

        ExecutorService executor = Executors.newFixedThreadPool(threads);
        int cnt = 0;
        String s;
        while ((s = br.readLine()) != null) {
            Runnable worker = new AddDocumentRunnable(writer, s);
            executor.execute(worker);

            cnt++;
            if (cnt % 1000000 == 0) {
                LOG.info(cnt + " articles added");
            }
            if (cnt >= maxdocs) {
                break;
            }
        }

        executor.shutdown();
        // Block until all queued indexing tasks have finished.
        executor.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);

        LOG.info("Total of " + cnt + " articles indexed.");

        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        out.close();
        if (br != null)
            br.close();
        if (fis != null)
            fis.close();
    }
}

From source file:io.datalayer.lucene.index.LuceneLifecycleTest.java

License:Apache License

/**
 * #1 Index contains deletions
 * 
 * #2 1 indexed document, 1 deleted document
 * 
 * #3 Optimize compacts deletes
 */
@Test
public void testDeleteAfterOptimize() throws IOException {
    IndexWriter writer = getWriter();
    assertEquals(2, writer.numDocs());
    writer.deleteDocuments(new Term("id", "1"));
    writer.forceMerge(1);
    writer.commit();
    assertFalse(writer.hasDeletions());
    assertEquals(1, writer.maxDoc());
    assertEquals(1, writer.numDocs());
    writer.close();
}

From source file:io.datalayer.lucene.read.LuceneReaderTest.java

License:Apache License

private static void addDocuments(Directory dir) throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, getAnalyzer());
    IndexWriter writer = new IndexWriter(dir, config);
    for (int i = 0; i < keywords.length; i++) {
        Document doc = new Document();
        doc.add(new Field("id", keywords[i], AosFieldType.INDEXED_STORED_TERMVECTORS));
        doc.add(new Field("city", text[i], AosFieldType.INDEXED_STORED_TERMVECTORS));
        doc.add(new Field("country", unindexed[i], AosFieldType.INDEXEDNOT_STORED_TERMVECTORSNOT));
        doc.add(new Field("contents", unstored[i], AosFieldType.INDEXED_STOREDNOT_TERMVECTORS));
        writer.addDocument(doc);
    }
    writer.forceMerge(1);
    writer.close();
}

From source file:io.github.infolis.algorithm.Indexer.java

License:Apache License

@Override
public void execute() throws IOException {
    File indexDir;
    if (null != getExecution().getIndexDirectory() && !getExecution().getIndexDirectory().isEmpty()) {
        indexDir = new File(getExecution().getIndexDirectory());
    } else {
        indexDir = new File(
                Files.createTempDirectory(InfolisConfig.getTmpFilePath().toAbsolutePath(), INDEX_DIR_PREFIX)
                        .toString());
        FileUtils.forceDeleteOnExit(indexDir);
    }
    log.debug("Indexing to: " + indexDir.getAbsolutePath());
    getExecution().setOutputDirectory(indexDir.getAbsolutePath());

    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35, createAnalyzer());
    indexWriterConfig.setOpenMode(OpenMode.CREATE);
    FSDirectory fsIndexDir = FSDirectory.open(indexDir);

    List<InfolisFile> files = new ArrayList<>();
    for (String fileUri : getExecution().getInputFiles()) {
        try {
            files.add(this.getInputDataStoreClient().get(InfolisFile.class, fileUri));
        } catch (Exception e) {
            error(log, "Could not retrieve file " + fileUri + ": " + e.getMessage());
            getExecution().setStatus(ExecutionStatus.FAILED);
            persistExecution();
            return;
        }
    }

    Date start = new Date();
    log.debug("Starting to index");
    IndexWriter writer = new IndexWriter(fsIndexDir, indexWriterConfig);
    try {
        int counter = 0;
        for (InfolisFile file : files) {
            counter++;
            log.trace("Indexing file " + file);
            writer.addDocument(toLuceneDocument(getInputFileResolver(), file));
            updateProgress(counter, files.size());

        }
    } catch (FileNotFoundException fnfe) {
        // NOTE: at least on Windows, some temporary files raise this
        // exception with an "access denied" message; checking whether the
        // file can be read doesn't help.
        throw new RuntimeException("Could not write index entry: " + fnfe);
    } finally {
        log.debug("Merging all Lucene segments ...");
        writer.forceMerge(1);
        writer.close();
    }
    getExecution().setStatus(ExecutionStatus.FINISHED);
    fsIndexDir.close();
    log.debug(String.format("Indexing %s documents took %s ms", files.size(),
            new Date().getTime() - start.getTime()));
}