Example usage for org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB.

Prototype

@Override
    public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB)

Source Link

Usage

From source file:MakeLuceneIndex.java

License:Apache License

/** Index all text files under a directory. 
 * @throws UnsupportedEncodingException 
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";

    System.currentTimeMillis();/* ww  w .  ja  va2s  . c  o m*/
    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];

        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }
    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        Analyzer analyzer = new WikipediaAnalyzer();
        //         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmxm or -Xmx1g):
        //
        iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);

        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        long iStartTime = java.lang.System.nanoTime();
        long iTime = iStartTime;

        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }

            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }

            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }
            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    //                    artikelTextWriter.println(link);
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }
            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }
            artikelTextWriter.println(
                    wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

            ++iArticleCount;

            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();
        iArticleCount = 0;

        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();

        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);
            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);
                if (inlinks > 4) {
                    //System.out.println("inlinks > 0 ");
                    Document doc = new Document();
                    ++iArticleCount;

                    //                    wikidumpExtractor.setTitleSeparator( "_" );
                    //                    doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );

                    // doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES));
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));
                    //                  System.out.println(title + " " + 
                    //                        title + " " + 
                    //                        title + " " + 
                    //                        title + " " +
                    //                        line.substring(endOfTitle+1));

                    writer.addDocument(doc);

                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:MakeLuceneIndexPreprocessed.java

License:Apache License

/** Index all text files under a directory. 
 * @throws UnsupportedEncodingException 
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";

    String inputLuceneIndexName = "2013-06-18-lucene-gab";
    String luceneIndexName = "2013-06-18-lucene-gab-standard";

    System.currentTimeMillis();/*from   w w  w. j  av  a2s  .  com*/

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-inputluceneindex"))
            inputLuceneIndexName = args[++i];

        if (args[i].equals("-outputluceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

    }
    String rawTextPath = baseDir + inputLuceneIndexName + "-raw-text.txt";
    String artikelInLinksPath = baseDir + inputLuceneIndexName + "-inlinks.txt";
    String logPath = baseDir + inputLuceneIndexName + ".log";

    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();
    logger.println(start.toString() + " iArticleCount: 0 iSkippedPageCount: 0");

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        //         Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmxm or -Xmx1g):
        //
        iwc.setRAMBufferSizeMB(2000.0);
        //         iwc.setSimilarity(new ESASimilarity());

        IndexWriter writer = new IndexWriter(dir, iwc);

        int iArticleCount = 0;
        int iSkippedPageCount = 0;

        BufferedReader rawTextReader = new BufferedReader(new FileReader(rawTextPath));
        BufferedReader artikelInLinksReader = new BufferedReader(new FileReader(artikelInLinksPath));
        String lineText = rawTextReader.readLine();
        String lineLinks = artikelInLinksReader.readLine();

        while (lineText != null) {
            //            String title = lineText.substring(0, lineText.indexOf("\t")); 
            //            while(!title.equals(lineLinks.substring(0, lineLinks.indexOf("\t")))){
            //               lineLinks = artikelInLinksReader.readLine();
            //            }
            int endOfTitle = lineText.indexOf("\t");
            String title = lineText.substring(0, endOfTitle);

            if (Integer.valueOf(lineLinks.substring(lineLinks.indexOf("\t") + 1)) > 0) {
                ++iArticleCount;
                Document doc = new Document();
                doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                        + lineText.substring(endOfTitle + 1), Field.Store.NO));
                //               System.out.println(title + " " + 
                //               title + " " + 
                //               title + " " + 
                //               title + " " +
                //               lineText.substring(endOfTitle+1));
                writer.addDocument(doc);

                if (iArticleCount % 1000 == 0) {
                    writer.commit();
                    logger.println(new Date().toString() + "phase 2 -- iArticleCount: " + iArticleCount
                            + " iSkippedPageCount: " + iSkippedPageCount);
                    logger.flush();
                }
            }
            lineText = rawTextReader.readLine();
            lineLinks = artikelInLinksReader.readLine();
        }
        rawTextReader.close();
        artikelInLinksReader.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:IndexAndSearchOpenStreetMaps1D.java

License:Apache License

private static void createIndex() throws IOException {

    long t0 = System.nanoTime();

    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    int BUFFER_SIZE = 1 << 16; // 64K
    InputStream is = Files
            .newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt"));
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);

    Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : "")));

    IndexWriterConfig iwc = new IndexWriterConfig(null);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    //iwc.setMaxBufferedDocs(109630);
    //iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setMergePolicy(new LogDocMergePolicy());
    iwc.setMergeScheduler(new SerialMergeScheduler());
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    IndexWriter w = new IndexWriter(dir, iwc);

    int count = 0;
    byte[] scratch = new byte[4];
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }//from   ww w . j av a2 s.  c  om

        String[] parts = line.split(",");
        //long id = Long.parseLong(parts[0]);
        int lat = (int) (1000000. * Double.parseDouble(parts[1]));
        //int lon = (int) (1000000. * Double.parseDouble(parts[2]));
        Document doc = new Document();
        if (USE_NF) {
            doc.add(new LegacyIntField("latnum", lat, Field.Store.NO));
            //doc.add(new LongField("lonnum", lon, Field.Store.NO));
        } else {
            doc.add(new IntPoint("lat", lat));
            //doc.add(new SortedNumericDocValuesField("lon", lon));
        }
        w.addDocument(doc);
        count++;
        if (count % 1000000 == 0) {
            System.out.println(count + "...");
        }
    }
    //w.forceMerge(1);
    w.commit();
    System.out.println(w.maxDoc() + " total docs");

    w.close();
    long t1 = System.nanoTime();
    System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index");
}

From source file:IndexTaxis.java

License:Apache License

public static void main(String[] args) throws Exception {
    Path indexPath = Paths.get(args[0]);
    Directory dir = FSDirectory.open(indexPath);
    int threadCount = Integer.parseInt(args[1]);
    Path docsPath = Paths.get(args[2]);

    IndexWriterConfig iwc = new IndexWriterConfig();
    //System.out.println("NOW SET INFO STREAM");
    iwc.setRAMBufferSizeMB(1024.);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).disableAutoIOThrottle();

    final IndexWriter w = new IndexWriter(dir, iwc);

    BufferedInputStream docs = new BufferedInputStream(Files.newInputStream(docsPath, StandardOpenOption.READ));

    // parse the header fields
    List<String> fieldsList = new ArrayList<>();
    StringBuilder builder = new StringBuilder();
    while (true) {
        int x = docs.read();
        if (x == -1) {
            throw new IllegalArgumentException(
                    "hit EOF while trying to read CSV header; are you sure you have the right CSV file!");
        }//from   w  w  w . j a  v  a2  s . c  o m
        byte b = (byte) x;
        if (b == NEWLINE) {
            fieldsList.add(builder.toString());
            break;
        } else if (b == COMMA) {
            fieldsList.add(builder.toString());
            builder.setLength(0);
        } else {
            // this is OK because headers are all ascii:
            builder.append((char) b);
        }
    }

    final String[] fields = fieldsList.toArray(new String[fieldsList.size()]);

    Thread[] threads = new Thread[threadCount];

    final AtomicInteger docCounter = new AtomicInteger();
    final AtomicLong bytesCounter = new AtomicLong();

    startNS = System.nanoTime();

    for (int i = 0; i < threadCount; i++) {
        final int threadID = i;
        threads[i] = new Thread() {
            @Override
            public void run() {
                try {
                    _run();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }

            private void _run() throws IOException {
                while (true) {
                    Chunk chunk = readChunk(docs);
                    if (chunk == null) {
                        break;
                    }
                    indexOneChunk(fields, chunk, w, docCounter, bytesCounter);
                }
            }
        };
        threads[i].start();
    }

    for (int i = 0; i < threadCount; i++) {
        threads[i].join();
    }
    System.out.println("Indexing done; now close");

    w.close();
    docs.close();
}

From source file:apps.LuceneIndexer.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();

    options.addOption("i", null, true, "input file");
    options.addOption("o", null, true, "output directory");
    options.addOption("r", null, true, "optional output TREC-format QREL file");

    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "document source type: " + commaJoin.join(SourceFactory.getDocSourceList()));

    // If you increase this value, you may need to modify the following line in *.sh file
    // export MAVEN_OPTS="-Xms8192m -server"
    double ramBufferSizeMB = 1024 * 8; // 8 GB

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    IndexWriter indexWriter = null;//from  w  ww .j  a v a 2s  . c  o  m
    BufferedWriter qrelWriter = null;

    int docNum = 0;

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFileName = null, outputDirName = null, qrelFileName = null;

        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        if (cmd.hasOption("o")) {
            outputDirName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'index directory'", options);
        }

        if (cmd.hasOption("r")) {
            qrelFileName = cmd.getOptionValue("r");
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        if (qrelFileName != null)
            qrelWriter = new BufferedWriter(new FileWriter(qrelFileName));

        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName));
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer);

        /*
            OpenMode.CREATE creates a new index or overwrites an existing one.
            https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE
        */
        indexConf.setOpenMode(OpenMode.CREATE);
        indexConf.setRAMBufferSizeMB(ramBufferSizeMB);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b));
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));
        }

        indexWriter = new IndexWriter(indexDir, indexConf);

        DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName);
        DocumentEntry inpDoc = null;
        TextCleaner textCleaner = new TextCleaner(null);

        while ((inpDoc = inpDocSource.next()) != null) {
            ++docNum;

            Document luceneDoc = new Document();
            ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText);
            String cleanText = spaceJoin.join(cleanedToks);

            //        System.out.println(inpDoc.mDocId);
            //        System.out.println(cleanText);
            //        System.out.println("==============================");

            luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES));
            luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES));
            indexWriter.addDocument(luceneDoc);

            if (inpDoc.mIsRel != null && qrelWriter != null) {
                saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0);
            }
            if (docNum % 1000 == 0)
                System.out.println(String.format("Indexed %d documents", docNum));

        }

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments" + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    } finally {
        System.out.println(String.format("Indexed %d documents", docNum));

        try {
            if (null != indexWriter)
                indexWriter.close();
            if (null != qrelWriter)
                qrelWriter.close();
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
    }
}

From source file:biospectra.index.Indexer.java

License:Apache License

private void initialize(File indexPath, int kmerSize, boolean minStrandKmer, Similarity similarity,
        int workerThreads, int ramBufferSize) throws Exception {
    if (!indexPath.exists()) {
        indexPath.mkdirs();//w  ww  . ja  v a  2 s .  c o m
    }

    if (indexPath.exists()) {
        cleanUpDirectory(indexPath);
    }

    this.indexPath = indexPath;
    this.minStrandKmer = minStrandKmer;
    this.analyzer = new KmerIndexAnalyzer(kmerSize, minStrandKmer);
    Directory dir = new MMapDirectory(this.indexPath.toPath());
    IndexWriterConfig config = new IndexWriterConfig(this.analyzer);
    if (similarity != null) {
        config.setSimilarity(similarity);
    }

    this.workerThreads = workerThreads;

    if (ramBufferSize > 0) {
        config.setRAMBufferSizeMB(ramBufferSize);
    }

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    this.indexWriter = new IndexWriter(dir, config);

    this.executor = new BlockingExecutor(this.workerThreads, this.workerThreads * 2);

    for (int i = 0; i < this.workerThreads; i++) {
        Document doc = new Document();
        Field filenameField = new StringField(IndexConstants.FIELD_FILENAME, "", Field.Store.YES);
        Field headerField = new StringField(IndexConstants.FIELD_HEADER, "", Field.Store.YES);
        Field sequenceDirectionField = new StringField(IndexConstants.FIELD_SEQUENCE_DIRECTION, "",
                Field.Store.YES);
        Field taxonTreeField = new StringField(IndexConstants.FIELD_TAXONOMY_TREE, "", Field.Store.YES);
        Field sequenceField = new TextField(IndexConstants.FIELD_SEQUENCE, "", Field.Store.NO);

        doc.add(filenameField);
        doc.add(headerField);
        doc.add(sequenceDirectionField);
        doc.add(taxonTreeField);
        doc.add(sequenceField);

        this.freeQueue.offer(doc);
    }
}

From source file:cn.hbu.cs.esearch.index.DiskSearchIndex.java

License:Apache License

/**
 * Opens an index modifier.//from  w  w  w.  j  av a  2 s.  com
 * @param analyzer Analyzer
 * @return IndexModifer instance
 */
@Override
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {
    if (_indexWriter != null) {
        return _indexWriter;
    }

    Directory directory = _dirMgr.getDirectory(true);
    log.info("opening index writer at: " + _dirMgr.getPath());

    EsearchMergePolicy mergePolicy = new EsearchMergePolicy();
    mergePolicy.setMergePolicyParams(_mergePolicyParams);

    // hao: autocommit is set to false with this constructor
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    _deletionPolicy = new ZoieIndexDeletionPolicy();
    config.setIndexDeletionPolicy(_deletionPolicy);
    config.setMergeScheduler(_mergeScheduler);
    config.setMergePolicy(mergePolicy);
    config.setReaderPooling(false);
    if (similarity != null) {
        config.setSimilarity(similarity);
    }
    config.setRAMBufferSizeMB(5);
    IndexWriter idxWriter = new IndexWriter(directory, config);

    // we need retrieve deletionPolicy from IndexWriter since deletionPolicy is deep cloned
    _deletionPolicy = (ZoieIndexDeletionPolicy) (idxWriter.getConfig().getIndexDeletionPolicy());
    _indexWriter = idxWriter;
    return idxWriter;
}

From source file:cn.hbu.cs.esearch.index.RAMSearchIndex.java

License:Apache License

@Override
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {

    if (_indexWriter != null) {
        return _indexWriter;
    }/*w  ww. ja v a  2 s. c o m*/

    EsearchMergePolicy mergePolicy = new EsearchMergePolicy();
    mergePolicy.setMergePolicyParams(_mergePolicyParams);
    mergePolicy.setUseCompoundFile(false);

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    config.setMergeScheduler(_mergeScheduler);
    config.setMergePolicy(mergePolicy);
    config.setReaderPooling(false);
    if (similarity != null) {
        config.setSimilarity(similarity);
    }
    config.setRAMBufferSizeMB(3);

    IndexWriter idxWriter = new IndexWriter(_directory, config);
    _indexWriter = idxWriter;
    return idxWriter;
}

From source file:com.aliasi.lingmed.medline.IndexMedline.java

License:Lingpipe license

/**
 * Run the command.  See class documentation above for details on
 * arguments and behavior.//from w w  w. j  a  va2  s.co  m
 */
public void run() {
    System.out.println("start run");
    try {
        File[] files = getLaterFiles(mDistDir);
        System.out.println("Total files to process: " + files.length);
        System.out.println("File names: " + java.util.Arrays.asList(files));
        //            if (mLogger.isDebugEnabled())
        //                mLogger.debug("File names: " + java.util.Arrays.asList(files));
        if (files.length > 0) {
            MedlineParser parser = new MedlineParser(true); // true = save raw XML

            Directory fsDir = FSDirectory.open(mIndex);
            IndexWriterConfig iwConf = new IndexWriterConfig(Version.LUCENE_36, mCodec.getAnalyzer());
            iwConf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            iwConf.setRAMBufferSizeMB(RAM_BUF_SIZE);
            if (sIsBaseline) {
                LogDocMergePolicy ldmp = new LogDocMergePolicy();
                ldmp.setMergeFactor(MERGE_FACTOR_HI);
                iwConf.setMergePolicy(ldmp);
            }
            IndexWriter indexWriter = new IndexWriter(fsDir, iwConf);

            for (File file : files) {
                System.out.println("processing file: " + file);
                MedlineIndexer indexer = new MedlineIndexer(indexWriter, mCodec);
                parser.setHandler(indexer);
                parseFile(parser, file);
                indexer.close();
                recordFile(indexWriter, file.getName());
                System.out.println("completed processing file: " + file);
            }
            System.out.println("All files parsed, now optimize index");
            indexWriter.forceMerge(1);
            indexWriter.commit();
            indexWriter.close();
        }
        System.out.println("Processing complete.");
    } catch (Exception e) {
        //            mLogger.warn("Unexpected Exception: "+e.getMessage());
        //            mLogger.warn("stack trace: "+Logging.logStackTrace(e));
        //            mLogger.warn("Aborting this run");
        IllegalStateException e2 = new IllegalStateException(e.getMessage());
        e2.setStackTrace(e.getStackTrace());
        throw e2;
    }
}

From source file:com.baidu.rigel.biplatform.tesseract.isservice.index.service.IndexWriterFactory.java

License:Open Source License

/**
 * /*  ww  w.  ja  v  a  2s. c om*/
 * getIndexWriter
 * 
 * @param idxPath
 *            
 * @return IndexWriter
 * @throws IOException
 *             IO
 */
public static synchronized IndexWriter getIndexWriter(String idxPath) throws IOException {
    LOGGER.info(String.format(LogInfoConstants.INFO_PATTERN_FUNCTION_BEGIN, "getIndexWriter",
            "[idxPath:" + idxPath + "]"));
    IndexWriter indexWriter = null;
    if (INSTANCE.idxWriterMaps.containsKey(idxPath)) {
        indexWriter = INSTANCE.idxWriterMaps.get(idxPath);
        LOGGER.info(String.format(LogInfoConstants.INFO_PATTERN_FUNCTION_PROCESS_NO_PARAM, "getIndexWriter",
                "return exist IndexWriter "));
    } else {
        File indexFile = new File(idxPath);
        Directory directory = FSDirectory.open(indexFile);
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_1,
                new StandardAnalyzer());
        indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        indexWriterConfig.setRAMBufferSizeMB(64.0);
        indexWriterConfig.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
        indexWriter = new IndexWriter(directory, indexWriterConfig);

        INSTANCE.idxWriterMaps.put(idxPath, indexWriter);
        LOGGER.info(String.format(LogInfoConstants.INFO_PATTERN_FUNCTION_PROCESS_NO_PARAM, "getIndexWriter",
                "create new IndexWriter "));
    }
    LOGGER.info(String.format(LogInfoConstants.INFO_PATTERN_FUNCTION_END, "getIndexWriter",
            "[idxPath:" + idxPath + "]"));
    return indexWriter;
}