List of usage examples for org.apache.lucene.index.IndexWriter.forceMerge
public void forceMerge(int maxNumSegments) throws IOException
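Before the project-specific examples below, a minimal self-contained sketch of the call, assuming the modern path-based Lucene API (5.x or later); the index path here is hypothetical. forceMerge(1) blocks until the index is merged down to a single segment, and can transiently need two to three times the index size in free disk space:

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical index path; adjust to your environment.
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            // Merge the whole index down to a single segment.
            writer.forceMerge(1);
            writer.commit();
        }
    }
}

Passing a larger maxNumSegments merges down to at most that many segments, which is proportionally cheaper than a full merge to one.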
From source file:org.opensolaris.opengrok.index.IndexDatabase.java
License:Open Source License
/**
 * Optimize the index database.
 */
public void optimize() {
    synchronized (lock) {
        if (running) {
            log.warning("Optimize terminated... Someone else is updating / optimizing it!");
            return;
        }
        running = true;
    }
    IndexWriter wrt = null;
    try {
        log.info("Optimizing the index ... ");
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig conf = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
        wrt = new IndexWriter(indexDirectory, conf);
        wrt.forceMerge(1); // replaces the deprecated optimize() call
        log.info("done");
        synchronized (lock) {
            if (dirtyFile.exists() && !dirtyFile.delete()) {
                log.log(Level.FINE, "Failed to remove \"dirty-file\": {0}", dirtyFile.getAbsolutePath());
            }
            dirty = false;
        }
    } catch (IOException e) {
        log.log(Level.SEVERE, "ERROR: optimizing index: {0}", e);
    } finally {
        if (wrt != null) {
            try {
                wrt.close();
            } catch (IOException e) {
                log.log(Level.WARNING, "An error occurred while closing writer", e);
            }
        }
        synchronized (lock) {
            running = false;
        }
    }
}
From source file:org.punksearch.crawler.IndexOperator.java
License:Open Source License
public static void optimize(String dir) {
    IndexWriter iw;
    try {
        iw = createIndexWriter(dir);
        iw.forceMerge(5);
        iw.close();
    } catch (IOException e) {
        log.error("Exception during optimizing index directory '" + dir + "': " + e.getMessage());
    }
}
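A hedged variant of the same idea: merge only when the index is actually fragmented. This sketch checks the committed segment count via SegmentInfos.readLatestCommit; the threshold of 5 mirrors the punksearch example above, and the class and method names are illustrative only:

import java.io.IOException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.Directory;

public class ConditionalMerge {
    // Only pay for the merge when the committed index has grown past
    // the target segment count.
    public static void mergeIfFragmented(Directory dir, IndexWriter writer) throws IOException {
        int segments = SegmentInfos.readLatestCommit(dir).size();
        if (segments > 5) {
            writer.forceMerge(5); // merge down to at most five segments
            writer.commit();
        }
    }
}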
From source file:org.scify.NewSumServer.Server.Searching.Indexer.java
License:Apache License
/**
 * The main method of the Indexer class. Traverses a directory and creates
 * the index files needed for the package to operate.
 * @throws CorruptIndexException
 * @throws LockObtainFailedException
 * @throws IOException
 */
public void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
    // The dir containing the files to index
    File docDir = new File(this.sFilesPath);
    Directory FSDir = FSDirectory.open(indexDir);
    // Init the analyzer, according to locale
    if (lLoc.toString().equals("el")) {
        anal = new GreekAnalyzer(Version.LUCENE_36);
    } else if (lLoc.toString().equals("en")) {
        // The standard analyzer
        Analyzer stdAnal = new StandardAnalyzer(Version.LUCENE_36);
        // In order to index all the text in a field,
        // however long that field may be
        anal = new LimitTokenCountAnalyzer(stdAnal, Integer.MAX_VALUE);
    }
    // The configuration for the index writer
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, anal);
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // The index writer
    IndexWriter indexWriter = new IndexWriter(FSDir, conf);
    // For each file in the dir, create a Document
    for (File file : getFilesFromFirstLeverSubdirs(docDir)) {
        String filename = file.getName();
        String fullFileName = file.getAbsolutePath();
        String tmpText = Utilities.readFromFile(fullFileName, " ");
        Document d = new Document(); // Lucene Document
        // Add the "filename" field
        d.add(new Field(FILE_FIELD, filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Add the "text" field
        d.add(new Field(TEXT_FIELD, tmpText, Field.Store.YES, Field.Index.ANALYZED));
        // Add the document to the writer
        indexWriter.addDocument(d);
    }
    int numDocs = indexWriter.numDocs();
    // The index will be merged down into a single segment, resulting in
    // a smaller index with better search performance. Costly operation;
    // DO NOT USE on large dirs or when disk space is low (needs (2-3)*DirSize)
    indexWriter.forceMerge(1);
    // Syncs all referenced index files.
    // At this point old indexes will be deleted, freeing up space
    indexWriter.commit();
    // Terminate the writer appropriately
    indexWriter.close();
    // LOGGER.log(Level.INFO, "Successfully closed indexWriter with {0}", anal.toString());
}
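The comment in the example above warns that forceMerge(1) needs roughly (2-3)*DirSize of free disk space. A small illustrative guard for that precondition (plain java.io, all names hypothetical) might look like:

import java.io.File;

public class MergeSpaceCheck {
    // Returns true when the filesystem holding the index has at least
    // three times the index's current size available, the worst-case
    // transient footprint of a full forceMerge(1).
    public static boolean enoughSpaceToMerge(File indexDir) {
        long indexSize = 0;
        File[] files = indexDir.listFiles();
        if (files == null) {
            return false; // not a directory, or not readable
        }
        for (File f : files) {
            indexSize += f.length();
        }
        return indexDir.getUsableSpace() > 3 * indexSize;
    }
}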
From source file:org.voyanttools.trombone.input.index.LuceneIndexer.java
License:Open Source License
private void indexStream(Collection<StoredDocumentSource> storedDocumentSourceForLucene, String corpusId)
        throws CorruptIndexException, LockObtainFailedException, IOException {
    // index documents (or at least add corpus to document if not already there); we need to get a new writer
    IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter(corpusId);
    DirectoryReader indexReader = DirectoryReader.open(indexWriter);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    boolean verbose = parameters.getParameterBooleanValue("verbose");
    try {
        storedDocumentSourceForLucene.parallelStream().forEach(storedDocumentSource -> {
            Runnable runnable;
            try {
                runnable = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher,
                        storedDocumentSource, corpusId, verbose);
                runnable.run();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        });
    } catch (Exception e) {
        e.printStackTrace();
    }
    if (parameters.containsKey("forceMerge")) {
        indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
    }
    indexReader = DirectoryReader.open(indexWriter);
    storage.getLuceneManager().setDirectoryReader(corpusId, indexReader); // make sure it's available afterwards
    // now determine which documents need to be analyzed
    Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
    for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
        if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
            storedDocumentSourceForAnalysis.add(storedDocumentSource);
        }
    }
    if (storedDocumentSourceForAnalysis.isEmpty() == false) {
        IndexSearcher indexSearcher2 = new IndexSearcher(indexReader);
        try {
            storedDocumentSourceForAnalysis.parallelStream().forEach(storedDocumentSource -> {
                if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                    Runnable worker;
                    try {
                        worker = new IndexedDocumentAnalyzer(storage, indexSearcher2, storedDocumentSource,
                                corpusId, verbose);
                        worker.run();
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            });
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
From source file:org.voyanttools.trombone.input.index.LuceneIndexer.java
License:Open Source License
private void indexExecutorService(Collection<StoredDocumentSource> storedDocumentSourceForLucene, String corpusId)
        throws CorruptIndexException, LockObtainFailedException, IOException {
    // index documents (or at least add corpus to document if not already there); we need to get a new writer
    IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter(corpusId);
    DirectoryReader indexReader = DirectoryReader.open(indexWriter);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    boolean verbose = parameters.getParameterBooleanValue("verbose");
    int processors = Runtime.getRuntime().availableProcessors();
    ExecutorService executor;
    // index
    executor = Executors.newFixedThreadPool(processors);
    for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
        Runnable worker = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher,
                storedDocumentSource, corpusId, verbose);
        try {
            executor.execute(worker);
        } catch (Exception e) {
            executor.shutdown();
            throw e;
        }
    }
    executor.shutdown();
    try {
        if (!executor.awaitTermination(parameters.getParameterIntValue("luceneIndexingTimeout", 60 * 10),
                TimeUnit.SECONDS)) { // default 10 minutes
            executor.shutdownNow();
            throw new InterruptedException("Lucene indexing has run out of time.");
        }
    } catch (InterruptedException e) {
        executor.shutdownNow();
        Thread.currentThread().interrupt();
        throw new RuntimeException("Lucene indexing has been interrupted.", e);
    } finally {
        try {
            indexWriter.commit();
        } catch (IOException e) {
            indexWriter.close(); // this may also throw an exception, but docs say to close on commit error
            throw e;
        }
    }
    if (parameters.containsKey("forceMerge")) {
        indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
    }
    indexReader = DirectoryReader.open(indexWriter);
    storage.getLuceneManager().setDirectoryReader(corpusId, indexReader); // make sure it's available afterwards
    // now determine which documents need to be analyzed
    Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
    for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
        if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
            storedDocumentSourceForAnalysis.add(storedDocumentSource);
        }
    }
    if (storedDocumentSourceForAnalysis.isEmpty() == false) {
        indexSearcher = new IndexSearcher(indexReader);
        executor = Executors.newFixedThreadPool(processors);
        for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForAnalysis) {
            if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                Runnable worker = new IndexedDocumentAnalyzer(storage, indexSearcher, storedDocumentSource,
                        corpusId, verbose);
                executor.execute(worker);
            }
        }
        executor.shutdown();
        try {
            if (!executor.awaitTermination(parameters.getParameterIntValue("luceneAnalysisTimeout", 60 * 10),
                    TimeUnit.SECONDS)) { // default 10 minutes
                throw new InterruptedException("Lucene analysis has run out of time.");
            }
        } catch (InterruptedException e) {
            throw new RuntimeException("Lucene document analysis ran out of time", e);
        }
    }
}
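Both Voyant examples above block on forceMerge. IndexWriter also offers a two-argument overload, forceMerge(int maxNumSegments, boolean doWait). A sketch of the non-blocking form (hypothetical index path; background merges handled by the default ConcurrentMergeScheduler):

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class BackgroundForceMerge {
    public static void main(String[] args) throws Exception {
        IndexWriter writer = new IndexWriter(
                FSDirectory.open(Paths.get("/tmp/example-index")), // hypothetical path
                new IndexWriterConfig(new StandardAnalyzer()));
        // doWait=false hands the merge to the scheduler's background
        // threads and returns immediately, so this thread can keep working.
        writer.forceMerge(1, false);
        // close() still commits and waits for in-flight merges by default.
        writer.close();
    }
}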
From source file:perf.IndexAndSearchOpenStreetMaps.java
License:Apache License
private static void createIndex(boolean fast, boolean doForceMerge, boolean doDistanceSort)
        throws IOException, InterruptedException {
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
    int BUFFER_SIZE = 1 << 16; // 64K
    InputStream is;
    if (SMALL) {
        is = Files.newInputStream(Paths.get(DATA_LOCATION, "latlon.subsetPlusAllLondon.txt"));
    } else {
        is = Files.newInputStream(Paths.get(DATA_LOCATION, "latlon.txt"));
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
    int NUM_THREADS;
    if (fast) {
        NUM_THREADS = 4;
    } else {
        NUM_THREADS = 1;
    }
    int CHUNK = 10000;
    long t0 = System.nanoTime();
    AtomicLong totalCount = new AtomicLong();
    for (int part = 0; part < NUM_PARTS; part++) {
        Directory dir = FSDirectory.open(Paths.get(getName(part, doDistanceSort)));
        IndexWriterConfig iwc = new IndexWriterConfig(null);
        iwc.setCodec(getCodec(fast));
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        if (fast) {
            ((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergedSegmentMB(Double.POSITIVE_INFINITY);
            iwc.setRAMBufferSizeMB(1024);
        } else {
            iwc.setMaxBufferedDocs(109630);
            iwc.setMergePolicy(new LogDocMergePolicy());
            iwc.setMergeScheduler(new SerialMergeScheduler());
        }
        iwc.setInfoStream(new PrintStreamInfoStream(System.out));
        IndexWriter w = new IndexWriter(dir, iwc);
        Thread[] threads = new Thread[NUM_THREADS];
        AtomicBoolean finished = new AtomicBoolean();
        Object lock = new Object();
        final int finalPart = part;
        for (int t = 0; t < NUM_THREADS; t++) {
            threads[t] = new Thread() {
                @Override
                public void run() {
                    String[] lines = new String[CHUNK];
                    int chunkCount = 0;
                    while (finished.get() == false) {
                        try {
                            int count = CHUNK;
                            synchronized (lock) {
                                for (int i = 0; i < CHUNK; i++) {
                                    String line = reader.readLine();
                                    if (line == null) {
                                        count = i;
                                        finished.set(true);
                                        break;
                                    }
                                    lines[i] = line;
                                }
                                if (finalPart == 0 && totalCount.get() + count >= 2000000000) {
                                    finished.set(true);
                                }
                            }
                            for (int i = 0; i < count; i++) {
                                String[] parts = lines[i].split(",");
                                //long id = Long.parseLong(parts[0]);
                                double lat = Double.parseDouble(parts[1]);
                                double lon = Double.parseDouble(parts[2]);
                                Document doc = new Document();
                                if (useGeoPoint) {
                                    doc.add(new GeoPointField("point", lat, lon, Field.Store.NO));
                                } else if (useGeo3D || useGeo3DLarge) {
                                    doc.add(new Geo3DPoint("point", lat, lon));
                                } else {
                                    doc.add(new LatLonPoint("point", lat, lon));
                                    if (doDistanceSort) {
                                        doc.add(new LatLonDocValuesField("point", lat, lon));
                                    }
                                }
                                w.addDocument(doc);
                                long x = totalCount.incrementAndGet();
                                if (x % 1000000 == 0) {
                                    System.out.println(x + "...");
                                }
                            }
                            chunkCount++;
                            if (false && SMALL == false && chunkCount == 20000) {
                                System.out.println("NOW BREAK EARLY");
                                break;
                            }
                        } catch (IOException ioe) {
                            throw new RuntimeException(ioe);
                        }
                    }
                }
            };
            threads[t].start();
        }
        for (Thread thread : threads) {
            thread.join();
        }
        System.out.println("Part " + part + " is done: w.maxDoc()=" + w.maxDoc());
        w.commit();
        System.out.println("done commit");
        long t1 = System.nanoTime();
        System.out.println(((t1 - t0) / 1000000000.0) + " sec to index part " + part);
        if (doForceMerge) {
            w.forceMerge(1);
            long t2 = System.nanoTime();
            System.out.println(((t2 - t1) / 1000000000.0) + " sec to force merge part " + part);
        }
        w.close();
    }
    //System.out.println(totalCount.get() + " total docs");
    //System.out.println("Force merge...");
    //w.forceMerge(1);
    //long t2 = System.nanoTime();
    //System.out.println(((t2-t1)/1000000000.0) + " sec to force merge");
    //w.close();
    //long t3 = System.nanoTime();
    //System.out.println(((t3-t2)/1000000000.0) + " sec to close");
}
From source file:perf.Indexer.java
License:Apache License
private static void _main(String[] clArgs) throws Exception {
    Args args = new Args(clArgs);

    // EG: -facets Date -facets characterCount ...
    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setHierarchical("Date", true);
    final Set<String> facetFields = new HashSet<String>();
    if (args.hasArg("-facets")) {
        for (String arg : args.getStrings("-facets")) {
            facetFields.add(arg);
        }
    }

    final String dirImpl = args.getString("-dirImpl");
    final String dirPath = args.getString("-indexPath") + "/index";
    final Directory dir;
    OpenDirectory od = OpenDirectory.get(dirImpl);
    dir = od.open(Paths.get(dirPath));

    final String analyzer = args.getString("-analyzer");
    final Analyzer a;
    if (analyzer.equals("EnglishAnalyzer")) {
        a = new EnglishAnalyzer();
    } else if (analyzer.equals("StandardAnalyzer")) {
        a = new StandardAnalyzer();
    } else if (analyzer.equals("StandardAnalyzerNoStopWords")) {
        a = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    } else if (analyzer.equals("ShingleStandardAnalyzer")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(), 2, 2);
    } else if (analyzer.equals("ShingleStandardAnalyzerNoStopWords")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2);
    } else {
        throw new RuntimeException("unknown analyzer " + analyzer);
    }

    final String lineFile = args.getString("-lineDocsFile");

    // -1 means all docs in the line file:
    final int docCountLimit = args.getInt("-docCountLimit");
    final int numThreads = args.getInt("-threadCount");

    final boolean doForceMerge = args.getFlag("-forceMerge");
    final boolean verbose = args.getFlag("-verbose");

    String indexSortField = null;
    SortField.Type indexSortType = null;
    if (args.hasArg("-indexSort")) {
        indexSortField = args.getString("-indexSort");
        int i = indexSortField.indexOf(':');
        if (i == -1) {
            throw new IllegalArgumentException("-indexSort should have form field:type; got: " + indexSortField);
        }
        String typeString = indexSortField.substring(i + 1, indexSortField.length());
        if (typeString.equals("long")) {
            indexSortType = SortField.Type.LONG;
        } else if (typeString.equals("string")) {
            indexSortType = SortField.Type.STRING;
        } else {
            throw new IllegalArgumentException("-indexSort can only handle 'long' sort; got: " + typeString);
        }
        indexSortField = indexSortField.substring(0, i);
    } else {
        indexSortType = null;
    }

    final double ramBufferSizeMB = args.getDouble("-ramBufferMB");
    final int maxBufferedDocs = args.getInt("-maxBufferedDocs");

    final String defaultPostingsFormat = args.getString("-postingsFormat");
    final boolean doDeletions = args.getFlag("-deletions");
    final boolean printDPS = args.getFlag("-printDPS");
    final boolean waitForMerges = args.getFlag("-waitForMerges");
    final boolean waitForCommit = args.getFlag("-waitForCommit");
    final String mergePolicy = args.getString("-mergePolicy");
    final Mode mode;
    final boolean doUpdate = args.getFlag("-update");
    if (doUpdate) {
        mode = Mode.UPDATE;
    } else {
        mode = Mode.valueOf(args.getString("-mode", "add").toUpperCase(Locale.ROOT));
    }
    int randomDocIDMax;
    if (mode == Mode.UPDATE) {
        randomDocIDMax = args.getInt("-randomDocIDMax");
    } else {
        randomDocIDMax = -1;
    }
    final String idFieldPostingsFormat = args.getString("-idFieldPostingsFormat");
    final boolean addGroupingFields = args.getFlag("-grouping");
    final boolean useCFS = args.getFlag("-cfs");
    final boolean storeBody = args.getFlag("-store");
    final boolean tvsBody = args.getFlag("-tvs");
    final boolean bodyPostingsOffsets = args.getFlag("-bodyPostingsOffsets");
    final int maxConcurrentMerges = args.getInt("-maxConcurrentMerges");
    final boolean addDVFields = args.getFlag("-dvfields");
    final boolean doRandomCommit = args.getFlag("-randomCommit");
    final boolean useCMS = args.getFlag("-useCMS");
    final boolean disableIOThrottle = args.getFlag("-disableIOThrottle");

    if (waitForCommit == false && waitForMerges) {
        throw new RuntimeException("pass -waitForCommit if you pass -waitForMerges");
    }
    if (waitForCommit == false && doForceMerge) {
        throw new RuntimeException("pass -waitForCommit if you pass -forceMerge");
    }
    if (waitForCommit == false && doDeletions) {
        throw new RuntimeException("pass -waitForCommit if you pass -deletions");
    }
    if (useCMS == false && disableIOThrottle) {
        throw new RuntimeException("-disableIOThrottle only makes sense with -useCMS");
    }

    final double nrtEverySec;
    if (args.hasArg("-nrtEverySec")) {
        nrtEverySec = args.getDouble("-nrtEverySec");
    } else {
        nrtEverySec = -1.0;
    }

    // True to start back at the beginning if we run out of
    // docs from the line file source:
    final boolean repeatDocs = args.getFlag("-repeatDocs");

    final String facetDVFormatName;
    if (facetFields.isEmpty()) {
        facetDVFormatName = "Lucene54";
    } else {
        facetDVFormatName = args.getString("-facetDVFormat");
    }

    if (addGroupingFields && docCountLimit == -1) {
        a.close();
        throw new RuntimeException("cannot add grouping fields unless docCount is set");
    }

    args.check();

    System.out.println("Dir: " + dirImpl);
    System.out.println("Index path: " + dirPath);
    System.out.println("Analyzer: " + analyzer);
    System.out.println("Line file: " + lineFile);
    System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    System.out.println("Threads: " + numThreads);
    System.out.println("Force merge: " + (doForceMerge ? "yes" : "no"));
    System.out.println("Verbose: " + (verbose ? "yes" : "no"));
    System.out.println("RAM Buffer MB: " + ramBufferSizeMB);
    System.out.println("Max buffered docs: " + maxBufferedDocs);
    System.out.println("Default postings format: " + defaultPostingsFormat);
    System.out.println("Do deletions: " + (doDeletions ? "yes" : "no"));
    System.out.println("Wait for merges: " + (waitForMerges ? "yes" : "no"));
    System.out.println("Wait for commit: " + (waitForCommit ? "yes" : "no"));
    System.out.println("IO throttle: " + (disableIOThrottle ? "no" : "yes"));
    System.out.println("Merge policy: " + mergePolicy);
    System.out.println("Mode: " + mode);
    if (mode == Mode.UPDATE) {
        System.out.println("DocIDMax: " + randomDocIDMax);
    }
    System.out.println("ID field postings format: " + idFieldPostingsFormat);
    System.out.println("Add grouping fields: " + (addGroupingFields ? "yes" : "no"));
    System.out.println("Compound file format: " + (useCFS ? "yes" : "no"));
    System.out.println("Store body field: " + (storeBody ? "yes" : "no"));
    System.out.println("Term vectors for body field: " + (tvsBody ? "yes" : "no"));
    System.out.println("Facet DV Format: " + facetDVFormatName);
    System.out.println("Facet fields: " + facetFields);
    System.out.println("Body postings offsets: " + (bodyPostingsOffsets ? "yes" : "no"));
    System.out.println("Max concurrent merges: " + maxConcurrentMerges);
    System.out.println("Add DocValues fields: " + addDVFields);
    System.out.println("Use ConcurrentMergeScheduler: " + useCMS);
    if (nrtEverySec > 0.0) {
        System.out.println("Open & close NRT reader every: " + nrtEverySec + " sec");
    } else {
        System.out.println("Open & close NRT reader every: never");
    }
    System.out.println("Repeat docs: " + repeatDocs);

    if (verbose) {
        InfoStream.setDefault(new PrintStreamInfoStream(System.out));
    }

    final IndexWriterConfig iwc = new IndexWriterConfig(a);

    if (indexSortField != null) {
        iwc.setIndexSort(new Sort(new SortField(indexSortField, indexSortType)));
    }

    if (mode == Mode.UPDATE) {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    } else {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }

    iwc.setMaxBufferedDocs(maxBufferedDocs);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);

    // So flushed segments do/don't use CFS:
    iwc.setUseCompoundFile(useCFS);

    final AtomicBoolean indexingFailed = new AtomicBoolean();

    iwc.setMergeScheduler(getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle));
    iwc.setMergePolicy(getMergePolicy(mergePolicy, useCFS));

    // Keep all commit points:
    if (doDeletions || doForceMerge) {
        iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    }

    final Codec codec = new Lucene62Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return PostingsFormat.forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat);
        }

        private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName(facetDVFormatName);
        //private final DocValuesFormat lucene42DVFormat = DocValuesFormat.forName("Lucene42");
        //private final DocValuesFormat diskDVFormat = DocValuesFormat.forName("Disk");
        //private final DocValuesFormat lucene45DVFormat = DocValuesFormat.forName("Lucene45");
        private final DocValuesFormat directDVFormat = DocValuesFormat.forName("Direct");

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            if (facetFields.contains(field) || field.equals("$facets")) {
                return facetsDVFormat;
                //} else if (field.equals("$facets_sorted_doc_values")) {
                //return diskDVFormat;
            } else {
                // Use default DVFormat for all else:
                //System.out.println("DV: field=" + field + " format=" + super.getDocValuesFormatForField(field));
                return super.getDocValuesFormatForField(field);
            }
        }
    };

    iwc.setCodec(codec);

    System.out.println("IW config=" + iwc);

    IndexWriter w = new IndexWriter(dir, iwc);

    System.out.println("Index has " + w.maxDoc() + " docs");

    final TaxonomyWriter taxoWriter;
    if (facetFields.isEmpty() == false) {
        taxoWriter = new DirectoryTaxonomyWriter(od.open(Paths.get(args.getString("-indexPath"), "facets")),
                IndexWriterConfig.OpenMode.CREATE);
    } else {
        taxoWriter = null;
    }

    // Fixed seed so group field values are always consistent:
    final Random random = new Random(17);

    LineFileDocs lineFileDocs = new LineFileDocs(lineFile, repeatDocs, storeBody, tvsBody, bodyPostingsOffsets,
            false, taxoWriter, facetFields, facetsConfig, addDVFields);

    float docsPerSecPerThread = -1f;
    //float docsPerSecPerThread = 100f;

    IndexThreads threads = new IndexThreads(random, w, indexingFailed, lineFileDocs, numThreads, docCountLimit,
            addGroupingFields, printDPS, mode, docsPerSecPerThread, null, nrtEverySec, randomDocIDMax);

    System.out.println("\nIndexer: start");
    final long t0 = System.currentTimeMillis();

    threads.start();

    while (!threads.done() && indexingFailed.get() == false) {
        Thread.sleep(100);
        // Commits once per minute on average:
        if (doRandomCommit && random.nextInt(600) == 17) {
            System.out.println("Indexer: now commit");
            long commitStartNS = System.nanoTime();
            w.commit();
            System.out.println(String.format(Locale.ROOT, "Indexer: commit took %.1f msec",
                    (System.nanoTime() - commitStartNS) / 1000000.));
        }
    }

    threads.stop();

    final long t1 = System.currentTimeMillis();
    System.out.println("\nIndexer: indexing done (" + (t1 - t0) + " msec); total " + w.maxDoc() + " docs");
    // if we update we can not tell how many docs
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }
    if (mode != Mode.UPDATE && docCountLimit != -1 && w.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit);
    }

    final Map<String, String> commitData = new HashMap<String, String>();

    if (waitForMerges) {
        w.close();
        IndexWriterConfig iwc2 = new IndexWriterConfig(a);
        iwc2.setMergeScheduler(getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle));
        iwc2.setMergePolicy(getMergePolicy(mergePolicy, useCFS));
        iwc2.setCodec(codec);
        iwc2.setUseCompoundFile(useCFS);
        iwc2.setMaxBufferedDocs(maxBufferedDocs);
        iwc2.setRAMBufferSizeMB(ramBufferSizeMB);
        if (indexSortField != null) {
            iwc2.setIndexSort(new Sort(new SortField(indexSortField, indexSortType)));
        }
        w = new IndexWriter(dir, iwc2);
        long t2 = System.currentTimeMillis();
        System.out.println("\nIndexer: waitForMerges done (" + (t2 - t1) + " msec)");
    }

    if (waitForCommit) {
        commitData.put("userData", "multi");
        w.setLiveCommitData(commitData.entrySet());
        long t2 = System.currentTimeMillis();
        w.commit();
        long t3 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit multi (took " + (t3 - t2) + " msec)");
    } else {
        w.rollback();
        w = null;
    }

    if (doForceMerge) {
        long forceMergeStartMSec = System.currentTimeMillis();
        w.forceMerge(1);
        long forceMergeEndMSec = System.currentTimeMillis();
        System.out.println("\nIndexer: force merge done (took " + (forceMergeEndMSec - forceMergeStartMSec) + " msec)");

        commitData.put("userData", "single");
        w.setLiveCommitData(commitData.entrySet());
        w.commit();
        final long t5 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit single done (took " + (t5 - forceMergeEndMSec) + " msec)");
    }

    if (doDeletions) {
        final long t5 = System.currentTimeMillis();
        // Randomly delete 5% of the docs
        final Set<Integer> deleted = new HashSet<Integer>();
        final int maxDoc = w.maxDoc();
        final int toDeleteCount = (int) (maxDoc * 0.05);
        System.out.println("\nIndexer: delete " + toDeleteCount + " docs");
        while (deleted.size() < toDeleteCount) {
            final int id = random.nextInt(maxDoc);
            if (!deleted.contains(id)) {
                deleted.add(id);
                w.deleteDocuments(new Term("id", LineFileDocs.intToID(id)));
            }
        }
        final long t6 = System.currentTimeMillis();
        System.out.println("\nIndexer: deletes done (took " + (t6 - t5) + " msec)");
        commitData.put("userData", doForceMerge ? "delsingle" : "delmulti");
        w.setLiveCommitData(commitData.entrySet());
        w.commit();
        final long t7 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit delmulti done (took " + (t7 - t6) + " msec)");
        if (doUpdate || w.numDocs() != maxDoc - toDeleteCount) {
            throw new RuntimeException("count mismatch: w.numDocs()=" + w.numDocs() + " but expected "
                    + (maxDoc - toDeleteCount));
        }
    }

    if (taxoWriter != null) {
        System.out.println("Taxonomy has " + taxoWriter.getSize() + " ords");
        taxoWriter.commit();
        taxoWriter.close();
    }

    final long tCloseStart = System.currentTimeMillis();
    if (w != null) {
        w.close();
        w = null;
    }
    if (waitForCommit) {
        System.out.println("\nIndexer: at close: " + SegmentInfos.readLatestCommit(dir));
        System.out.println("\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart) + " msec");
    }
    dir.close();
    final long tFinal = System.currentTimeMillis();

    System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed());

    final long indexingTime;
    if (waitForCommit) {
        indexingTime = tFinal - t0;
        System.out.println("\nIndexer: finished (" + indexingTime + " msec)");
    } else {
        indexingTime = t1 - t0;
        System.out.println("\nIndexer: finished (" + indexingTime + " msec), excluding commit");
    }
    System.out.println("\nIndexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / (indexingTime / 3600000.))
            + " GB/hour plain text");
}
From source file:perf.PrintPerFieldHeapUsage.java
License:Apache License
public static void main(String[] args) throws IOException {
    Directory dir = FSDirectory.open(Paths.get("fields"));

    int fieldUpto;
    IndexWriterConfig iwc;
    IndexWriter w;
    long t0;
    IndexReader r;

    // Stored field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new StoredField("f" + fieldUpto, "text" + i));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique StoredField: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Indexed StringField:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new StringField("f" + fieldUpto, "text" + i, Field.Store.NO));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique StringField: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Numeric DV field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField("f" + fieldUpto, i));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique NumericDocValuesField, latent: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));

    // Now force lazy loading of all the DV fields:
    for (int i = 0; i < FIELD_COUNT; i++) {
        MultiDocValues.getNumericValues(r, "f" + i);
    }
    System.out.println(String.format(Locale.ROOT, "Bytes per unique NumericDocValuesField, loaded: %.1f",
            (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Sorted DV field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new SortedDocValuesField("f" + fieldUpto, new BytesRef("text" + i)));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique SortedDocValuesField, latent: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));

    // Now force lazy loading of all the DV fields:
    for (int i = 0; i < FIELD_COUNT; i++) {
        MultiDocValues.getSortedValues(r, "f" + i);
    }
    System.out.println(String.format(Locale.ROOT, "Bytes per unique SortedDocValuesField, loaded: %.1f",
            (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    dir.close();
}
From source file:semanticRelatedness.MakeLuceneIndex.java
License:Apache License
/**
 * Index all text files under a directory.
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";
    System.currentTimeMillis();
    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }
    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {
        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));
        //Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new DefaultSimilarity());
        //iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        long iStartTime = java.lang.System.nanoTime();
        long iTime = iStartTime;

        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }
            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }
            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }
            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    //artikelTextWriter.println(link);
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }
            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }
            artikelTextWriter.println(wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

            ++iArticleCount;
            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();
        iArticleCount = 0;

        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();

        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);
            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);
                if (inlinks > 4) {
                    //System.out.println("inlinks > 0 ");
                    Document doc = new Document();
                    ++iArticleCount;
                    //wikidumpExtractor.setTitleSeparator( "_" );
                    //doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES));
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));
                    //System.out.println(title + " " + title + " " + title + " " + title + " " + line.substring(endOfTitle + 1));
                    writer.addDocument(doc);

                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
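To confirm that a forceMerge(1) like the one above actually took effect, one can reopen the index and count the reader's leaves (one leaf per segment). A small sketch, with a hypothetical index path:

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class VerifyMerge {
    public static void main(String[] args) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(
                FSDirectory.open(Paths.get("/tmp/example-index")))) { // hypothetical path
            // After forceMerge(1) + commit, the reader should see one leaf.
            System.out.println("segments: " + reader.leaves().size());
        }
    }
}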
From source file:tool.export.ExportToLucene.java
License:Open Source License
public static void main(String[] args) throws IOException {
    String usage = "tool.export.ExportToLucene" + " [-index INDEX_PATH] [-frames FRAMES_PATH]\n\n"
            + "This indexes the frames in FRAMES_PATH, creating a Lucene index " + "in INDEX_PATH";
    String indexPath = "index";
    String framesPath = null;
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-frames".equals(args[i])) {
            framesPath = args[i + 1];
            i++;
        }
    }

    if (framesPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    Directory dir = FSDirectory.open(new File(indexPath));
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_42, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(3072.0);
    IndexWriter writer = new IndexWriter(dir, iwc);

    List<PairOfWritables<Text, DoubleWritable>> bigrams = SequenceFileUtils.readDirectory(new Path(framesPath));
    for (PairOfWritables<Text, DoubleWritable> bigram : bigrams) {
        String[] slots = bigram.getLeftElement().toString().trim().split("\t");
        double probability = bigram.getRightElement().get();

        Document doc = new Document();
        for (int i = 0; i < slots.length; i++) {
            String slot = slots[i];
            if (i == 0) {
                slot = slot.substring(2, slot.length() - 1);
            } else {
                slot = slot.substring(1, slot.length() - 1);
            }
            String[] slotParts = slot.split(",");
            if (slotParts.length >= 3) {
                String slotRelation = slotParts[0];
                StringBuilder slotValue = new StringBuilder();
                String separator = "";
                for (int j = 1; j < slotParts.length - 1; j++) {
                    slotValue.append(separator);
                    slotValue.append(slotParts[j]);
                    separator = ",";
                }
                Field slotField = new TextField(slotRelation,
                        slotValue.toString().substring(1, slotValue.toString().length() - 1), Field.Store.YES);
                doc.add(slotField);
            }
        }
        Field probabilityField = new DoubleField("probability", probability, Field.Store.YES);
        doc.add(probabilityField);
        writer.addDocument(doc);
    }

    writer.forceMerge(1);
    writer.close();
}