Usage examples for org.apache.lucene.index.IndexWriter.commit()
@Override public final long commit() throws IOException
Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
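Before the project examples below, here is a minimal, self-contained sketch of the typical commit() workflow. It assumes Lucene 6.2 or later (matching the signature above, where commit() returns a sequence number); the index path and field name are illustrative and not taken from any of the examples:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CommitSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical on-disk index location
        Directory dir = FSDirectory.open(Paths.get("/tmp/commit-sketch-index"));
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        Document doc = new Document();
        doc.add(new TextField("contents", "hello commit", Field.Store.NO));
        writer.addDocument(doc);

        // Buffered changes become durable and visible to newly opened
        // readers only after commit() (or close()) returns.
        long seqNo = writer.commit();
        System.out.println("commit sequence number: " + seqNo);

        writer.close();
        dir.close();
    }
}

The examples that follow show the same call in context: committing once at the end of a batch, committing periodically every N documents, and committing before handing the directory to a searcher.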
From source file: WriteIndex.java
License: Apache License
/**
 * Parses each file in the local "documents" directory with Tika, indexes the
 * extracted text and metadata, and commits the result.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) throws IOException {
    File docs = new File("documents");
    File indexDir = new File(INDEX_DIRECTORY);
    Directory directory = FSDirectory.open(indexDir);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    writer.deleteAll();

    for (File file : docs.listFiles()) {
        // Extract body text and metadata with Tika's auto-detecting parser
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        Parser parser = new AutoDetectParser();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, handler, metadata, context);
        } catch (TikaException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            stream.close();
        }

        String text = handler.toString();
        String fileName = file.getName();

        Document doc = new Document();
        doc.add(new Field("file", fileName, Store.YES, Index.NO));
        for (String key : metadata.names()) {
            String name = key.toLowerCase();
            String value = metadata.get(key);
            if (StringUtils.isBlank(value)) {
                continue;
            }
            if ("keywords".equalsIgnoreCase(key)) {
                for (String keyword : value.split(",?(\\s+)")) {
                    doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED));
                }
            } else if ("title".equalsIgnoreCase(key)) {
                doc.add(new Field(name, value, Store.YES, Index.ANALYZED));
            } else {
                // Note: this branch stores the file name rather than the metadata value
                doc.add(new Field(name, fileName, Store.YES, Index.NOT_ANALYZED));
            }
        }
        doc.add(new Field("text", text, Store.NO, Index.ANALYZED));
        writer.addDocument(doc);
    }

    // Make the added documents durable and visible to new readers
    writer.commit();
    writer.deleteUnusedFiles();
    System.out.println(writer.maxDoc() + " documents written");
    writer.close(); // release the write lock
}
From source file: MakeLuceneIndex.java
License: Apache License
/**
 * Builds a Lucene index from a Wikipedia dump file.
 *
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";
    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true; // note: despite the flag name, this enables stub skipping
    }

    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    Date start = new Date();

    try {
        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));
        Analyzer analyzer = new WikipediaAnalyzer();
        // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (e.g. add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);

        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;

        // Phase 1: dump article text to a raw-text file and count in-links
        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }
            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }
            // Skip pages with fewer than 5 out-links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }
            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    if (_inLinks.containsKey(link)) {
                        _inLinks.put(link, _inLinks.get(link) + 1);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }
            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }
            artikelTextWriter.println(
                    wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));
            ++iArticleCount;
            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();

        // Phase 2: index every article that has more than 4 in-links,
        // repeating the title to weight its terms in the contents field
        iArticleCount = 0;
        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();
        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);
            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);
                if (inlinks > 4) {
                    Document doc = new Document();
                    ++iArticleCount;
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));
                    writer.addDocument(doc);
                    if (iArticleCount % 1000 == 0) {
                        // Commit periodically so the work done so far is durable
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file: MakeLuceneIndexPreprocessed.java
License: Apache License
/**
 * Rebuilds a Lucene index from the preprocessed raw-text and in-links files
 * produced by MakeLuceneIndex.
 *
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    String inputLuceneIndexName = "2013-06-18-lucene-gab";
    String luceneIndexName = "2013-06-18-lucene-gab-standard";

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-inputluceneindex"))
            inputLuceneIndexName = args[++i];
        if (args[i].equals("-outputluceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
    }

    String rawTextPath = baseDir + inputLuceneIndexName + "-raw-text.txt";
    String artikelInLinksPath = baseDir + inputLuceneIndexName + "-inlinks.txt";
    String logPath = baseDir + inputLuceneIndexName + ".log";
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    Date start = new Date();
    logger.println(start.toString() + " iArticleCount: 0 iSkippedPageCount: 0");

    try {
        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));
        // Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (e.g. add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(2000.0);
        // iwc.setSimilarity(new ESASimilarity());

        IndexWriter writer = new IndexWriter(dir, iwc);

        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        BufferedReader rawTextReader = new BufferedReader(new FileReader(rawTextPath));
        BufferedReader artikelInLinksReader = new BufferedReader(new FileReader(artikelInLinksPath));
        String lineText = rawTextReader.readLine();
        String lineLinks = artikelInLinksReader.readLine();
        while (lineText != null) {
            int endOfTitle = lineText.indexOf("\t");
            String title = lineText.substring(0, endOfTitle);
            // Index only articles with at least one recorded in-link
            if (Integer.valueOf(lineLinks.substring(lineLinks.indexOf("\t") + 1)) > 0) {
                ++iArticleCount;
                Document doc = new Document();
                doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                        + lineText.substring(endOfTitle + 1), Field.Store.NO));
                writer.addDocument(doc);
                if (iArticleCount % 1000 == 0) {
                    // Commit periodically so the work done so far is durable
                    writer.commit();
                    logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                            + " iSkippedPageCount: " + iSkippedPageCount);
                    logger.flush();
                }
            }
            lineText = rawTextReader.readLine();
            lineLinks = artikelInLinksReader.readLine();
        }
        rawTextReader.close();
        artikelInLinksReader.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file: IndexAndSearchOpenStreetMaps1D.java
License: Apache License
private static void createIndex() throws IOException {
    long t0 = System.nanoTime();

    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
    int BUFFER_SIZE = 1 << 16; // 64K
    InputStream is = Files
            .newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt"));
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);

    Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : "")));
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    //iwc.setMaxBufferedDocs(109630);
    //iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setMergePolicy(new LogDocMergePolicy());
    iwc.setMergeScheduler(new SerialMergeScheduler());
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    IndexWriter w = new IndexWriter(dir, iwc);

    int count = 0;
    byte[] scratch = new byte[4];
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }
        String[] parts = line.split(",");
        //long id = Long.parseLong(parts[0]);
        int lat = (int) (1000000. * Double.parseDouble(parts[1]));
        //int lon = (int) (1000000. * Double.parseDouble(parts[2]));
        Document doc = new Document();
        if (USE_NF) {
            // Legacy trie-encoded numeric field, kept for comparison
            doc.add(new LegacyIntField("latnum", lat, Field.Store.NO));
            //doc.add(new LongField("lonnum", lon, Field.Store.NO));
        } else {
            // Dimensional point field, indexed in the BKD tree
            doc.add(new IntPoint("lat", lat));
            //doc.add(new SortedNumericDocValuesField("lon", lon));
        }
        w.addDocument(doc);
        count++;
        if (count % 1000000 == 0) {
            System.out.println(count + "...");
        }
    }

    //w.forceMerge(1);
    w.commit();
    System.out.println(w.maxDoc() + " total docs");
    w.close();

    long t1 = System.nanoTime();
    System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index");
}
From source file: action.indexing.IndexingTest.java
License: Apache License
public void testDeleteBeforeOptimize() throws IOException {
    IndexWriter writer = getWriter();
    assertEquals(2, writer.numDocs());           //A
    writer.deleteDocuments(new Term("id", "1")); //B
    writer.commit();
    assertTrue(writer.hasDeletions());           //1
    assertEquals(2, writer.maxDoc());            //2
    assertEquals(1, writer.numDocs());           //2
    writer.close();
}
From source file: action.indexing.IndexingTest.java
License: Apache License
public void testDeleteAfterOptimize() throws IOException {
    IndexWriter writer = getWriter();
    assertEquals(2, writer.numDocs());
    writer.deleteDocuments(new Term("id", "1"));
    writer.optimize();                           //3
    writer.commit();
    assertFalse(writer.hasDeletions());
    assertEquals(1, writer.maxDoc());            //C
    assertEquals(1, writer.numDocs());           //C
    writer.close();
}
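The two tests above are written against the older 3.x API (optimize() was replaced by forceMerge() in Lucene 4.0) and check commit effects through the writer itself. To observe the same semantics from the reader side on a current release, a sketch along the following lines should work, assuming Lucene 6.2 or later; the temporary index location and the "id" field are illustrative:

import java.nio.file.Files;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CommitVisibilitySketch {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Files.createTempDirectory("commit-visibility"));
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        Document doc = new Document();
        doc.add(new StringField("id", "1", Field.Store.YES));
        writer.addDocument(doc);
        writer.commit(); // first commit: one live document

        DirectoryReader before = DirectoryReader.open(dir);

        writer.deleteDocuments(new Term("id", "1"));
        writer.commit(); // second commit: the deletion is now durable

        // A reader keeps the point-in-time view of the commit it was opened on...
        System.out.println("old reader numDocs: " + before.numDocs()); // 1

        // ...while openIfChanged returns a fresh reader on the new commit
        // (non-null here because the index did change).
        DirectoryReader after = DirectoryReader.openIfChanged(before);
        System.out.println("new reader numDocs: " + after.numDocs()); // 0

        after.close();
        before.close();
        writer.close();
        dir.close();
    }
}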
From source file: application.ReviewDocumentIndexer.java
License: Open Source License
/**
 * Indexes reviews loaded from the database, committing the index and saving
 * indexer state after every segment of reviews.
 *
 * @param args command-line arguments passed through to ReviewDocumentIndexer
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) {
    // Parse command line arguments. Exit the program if the provided arguments are insufficient.
    ReviewDocumentIndexer indexer = new ReviewDocumentIndexer(args);
    if (indexer == null)
        return;

    // Open a new index
    IndexWriter index = null;
    try {
        index = new IndexWriter(new SimpleFSDirectory(new File(Paths.luceneIndex)),
                new ReviewTextAnalyzer(indexer), indexer.new_index ? true : false, MaxFieldLength.UNLIMITED);
        if (indexer.pause_every > 2) {
            index.setMaxBufferedDocs(indexer.pause_every);
        }
        index.setMaxMergeDocs(Config.maxMergeDocs);
        index.setMergeFactor(Config.mergeFactor);
    } catch (CorruptIndexException e) {
        AppLogger.error.log(Level.SEVERE,
                "Lucene detected an inconsistency upon opening the index located at " + Paths.luceneIndex);
        throw new RuntimeException("Exiting application", e);
    } catch (LockObtainFailedException e) {
        AppLogger.error.log(Level.SEVERE,
                "Index located at " + Paths.luceneIndex + " is already open by another Lucene process");
        throw new RuntimeException("Exiting application", e);
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE, "Could not access location " + Paths.luceneIndex);
        throw new RuntimeException("Exiting application", e);
    }

    // Load reviews from the database, one segment at a time
    NumberFormat docIdFormat = TokenListsCollector.defaultDocIdFormat();
    try {
        DatabaseReviewCollection reviews = new DatabaseReviewCollection(indexer.pause_every);
        reviews.setLimits(indexer.min_reviewid, indexer.stop_after);
        int indexed_counter = 0;
        while (reviews.hasNextSegment()) {
            System.out.print(Calendar.getInstance().getTime().toGMTString());
            System.out.print(" Loading from DB... ");
            reviews.loadNextSegment();
            Iterator<Review> reviewsIterator = reviews.getIterator();
            System.out.print(" Indexing... ");
            while (reviewsIterator.hasNext()) {
                DatabaseReview dbr = (DatabaseReview) reviewsIterator.next();
                int dbr_id = dbr.getReviewid();
                int dbr_rating = dbr.getRating();
                try {
                    indexer.theReviewId.set(dbr_id);
                    indexer.theStats.setCurrent(dbr_id, dbr_rating);
                    index.addDocument(dbr.getDocumentForIndexing());
                    indexed_counter++;
                    // Also keep track of the rating and length of this review
                    indexer.theStats.storeCurrent();
                } catch (CorruptIndexException e) {
                    AppLogger.error.log(Level.SEVERE,
                            "Lucene detected an inconsistency upon saving review #"
                                    + Integer.toString(dbr.getReviewid()) + " to the index located at "
                                    + Paths.luceneIndex);
                    return;
                } catch (IOException e) {
                    AppLogger.error.log(Level.WARNING,
                            "Review #" + Integer.toString(dbr.getReviewid()) + " could not be indexed");
                }
            }

            // Back everything up
            System.out.print("Indexed " + indexed_counter + " reviews total. ");
            if (indexer.pause_every > 0) {
                System.out.print("Saving tokenlists... ");
                indexer.theTokenLists.writeNextFile(docIdFormat);
                System.out.print("Saving state... ");
                try {
                    // Persist this segment's changes before moving on
                    index.commit();
                    indexer.saveState();
                } catch (CorruptIndexException e) {
                    AppLogger.error.log(Level.SEVERE, "Committing index changes failed on review #"
                            + indexer.theReviewId.get() + " due to CorruptIndexException");
                    return;
                } catch (IOException e) {
                    AppLogger.error.log(Level.WARNING, "Committing index changes failed on review #"
                            + indexer.theReviewId.get() + " due to IOException");
                }
            }
            System.out.print("DONE\n");
            reviews.reset();
        }
    } catch (SQLException e) {
        AppLogger.error.log(Level.SEVERE,
                "An exception occurred while trying to access the database.\n" + e.getMessage());
        return;
    }

    try {
        index.close();
        indexer.backupIndex();
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.err.println("Indexing successfully completed!");
}
From source file: au.org.ala.names.search.ALANameIndexer.java
License: Open Source License
/**
 * Creates the temporary index that provides a lookup from checklist bank id
 * to GUID.
 */
private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception {
    System.out.println("Starting to create the tmp guid index...");
    IndexWriter iw = createIndexWriter(new File("/data/tmp/guid"), new KeywordAnalyzer(), true);
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(
            new FileReader(cbExportFile), '\t', '"', '/', 1);
    for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
        Document doc = new Document();
        String id = values[POS_ID];
        String guid = values[POS_LSID];
        doc.add(new StringField("id", id, Store.YES));
        // Fall back to the id when no GUID is supplied
        if (StringUtils.isEmpty(guid))
            guid = id;
        doc.add(new StoredField("guid", guid));
        iw.addDocument(doc);
    }
    System.out.println("Finished writing the tmp guid index...");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    // As of Lucene 4.0 all IndexReaders are read-only
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("/data/tmp/guid"))));
}
From source file: au.org.ala.names.search.ALANameIndexer.java
License: Open Source License
private void indexALA(IndexWriter iw, String file, String synonymFile) throws Exception {
    int records = 0;
    long time = System.currentTimeMillis();
    au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file),
            '\t', '"', '\\', 1);
    for (String[] values = reader.readNext(); values != null; values = reader.readNext()) {
        String lsid = values[POS_LSID];
        String id = values[POS_ID];
        String rank = values[POS_RANK];
        int rankId = -1;
        try {
            rankId = Integer.parseInt(values[POS_RANK_ID]);
        } catch (Exception e) {
            // rank id is optional; leave it at -1 when unparsable
        }
        String acceptedValues = values[POS_ACC_LSID];

        float boost = 1.0f;
        // Give the major ranks a larger boost
        if (rankId % 1000 == 0) {
            boost = 5.0f;
        }
        // Give non-CoL concepts a higher boost
        String source = values[POS_SRC];
        if (!source.trim().equals("") && !source.equalsIgnoreCase("CoL")) {
            boost = boost * 2;
        }

        Document doc = createALAIndexDocument(values[POS_SCI_NAME], id, lsid, values[POS_RANK_ID],
                values[POS_RANK], values[POS_K], values[POS_KID], values[POS_P], values[POS_PID],
                values[POS_C], values[POS_CID], values[POS_O], values[POS_OID], values[POS_F],
                values[POS_FID], values[POS_G], values[POS_GID], values[POS_S], values[POS_SID],
                values[POS_LFT], values[POS_RGT], acceptedValues, values[POS_SP_EPITHET],
                values[POS_INFRA_EPITHET], values[POS_AUTHOR], boost);

        if (doc != null) {
            // Add the excluded information if applicable
            if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) {
                doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(),
                        SynonymType.EXCLUDES.getId().toString(), Store.YES));
            }
            iw.addDocument(doc);
            records++;
            if (records % 100000 == 0) {
                log.info("Processed " + records + " in " + (System.currentTimeMillis() - time) + " msecs");
            }
        }
    }
    addExtraALAConcept(iw, extraALAConcepts);
    // Add the synonyms
    addALASyonyms(iw, synonymFile);
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    log.info("Lucene index created - processed a total of " + records + " records in "
            + (System.currentTimeMillis() - time) + " msecs");
}
From source file: au.org.ala.names.search.ALANameIndexer.java
License: Open Source License
/**
 * Indexes an IRMNG export for use in homonym resolution.
 *
 * @param iw          the writer to add the IRMNG documents to
 * @param irmngExport the path to the IRMNG export file
 * @param rank        the rank level this export contains
 * @throws Exception
 */
void indexIRMNG(IndexWriter iw, String irmngExport, RankType rank) throws Exception {
    log.info("Creating IRMNG index ...");
    File file = new File(irmngExport);
    if (file.exists()) {
        CSVReader reader = new CSVReader(new FileReader(file), '\t', '"', '~');
        int count = 0;
        String[] values = null;
        while ((values = reader.readNext()) != null) {
            Document doc = new Document();
            if (values != null && values.length >= 7) {
                doc.add(new TextField(RankType.KINGDOM.getRank(), values[0], Store.YES));
                doc.add(new TextField(RankType.PHYLUM.getRank(), values[1], Store.YES));
                doc.add(new TextField(RankType.CLASS.getRank(), values[2], Store.YES));
                doc.add(new TextField(RankType.ORDER.getRank(), values[3], Store.YES));
                doc.add(new TextField(RankType.FAMILY.getRank(), values[4], Store.YES));
                doc.add(new TextField(RankType.GENUS.getRank(), values[5], Store.YES));
                if (rank == RankType.GENUS) {
                    // Genus rows carry extra columns: id, accepted name and homonym flag
                    doc.add(new TextField(IndexField.ID.toString(), values[6], Store.YES));
                    doc.add(new TextField(IndexField.ACCEPTED.toString(), values[8], Store.YES));
                    doc.add(new TextField(IndexField.HOMONYM.toString(), values[10], Store.YES));
                } else if (rank == RankType.SPECIES) {
                    doc.add(new TextField(RankType.SPECIES.getRank(), values[6], Store.YES));
                }
                doc.add(new TextField(IndexField.RANK.toString(), rank.getRank(), Store.YES));
                iw.addDocument(doc);
                count++;
            }
        }
        iw.commit();
        log.info("Finished indexing " + count + " IRMNG " + rank + " taxa.");
    } else {
        log.warn("Unable to create IRMNG index. Can't locate " + irmngExport);
    }
}