Example usage for org.apache.lucene.index IndexWriter forceMerge

List of usage examples for org.apache.lucene.index IndexWriter forceMerge

Introduction

On this page you can find example usage for org.apache.lucene.index IndexWriter forceMerge.

Prototype

public void forceMerge(int maxNumSegments) throws IOException 

Document

Forces merge policy to merge segments until there are <= maxNumSegments.
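
For orientation, here is a minimal, self-contained sketch of the call against the Lucene 4.3 API used throughout the examples below. The index path, analyzer, and field name are illustrative assumptions, not taken from the examples themselves.

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class ForceMergeSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative path and analyzer; swap in your own.
        Directory dir = FSDirectory.open(new File("/tmp/example-index"));
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43,
                new StandardAnalyzer(Version.LUCENE_43));
        IndexWriter writer = new IndexWriter(dir, iwc);

        Document doc = new Document();
        doc.add(new TextField("contents", "hello world", Field.Store.NO));
        writer.addDocument(doc);

        writer.commit();
        writer.forceMerge(1); // blocks until the index has at most one segment
        writer.close();
    }
}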

Usage

From source file:MakeLuceneIndex.java

License:Apache License

/** Index all text files under a directory. 
 * @throws UnsupportedEncodingException 
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";

    System.currentTimeMillis();
    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];

        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }
    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        Analyzer analyzer = new WikipediaAnalyzer();
        //         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);

        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        long iStartTime = java.lang.System.nanoTime();
        long iTime = iStartTime;

        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }

            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }

            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }
            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    //                    artikelTextWriter.println(link);
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }
            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }
            artikelTextWriter.println(
                    wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

            ++iArticleCount;

            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();
        iArticleCount = 0;

        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();

        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);
            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);
                if (inlinks > 4) {
                    //System.out.println("inlinks > 0 ");
                    Document doc = new Document();
                    ++iArticleCount;

                    //                    wikidumpExtractor.setTitleSeparator( "_" );
                    //                    doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );

                    // doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES));
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));
                    //                  System.out.println(title + " " + 
                    //                        title + " " + 
                    //                        title + " " + 
                    //                        title + " " +
                    //                        line.substring(endOfTitle+1));

                    writer.addDocument(doc);

                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
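
Note the closing sequence in the example above: commit() flushes pending documents, forceMerge(1) rewrites the index into a single segment, and close() releases the writer. Because the catch block swallows any exception, a failure mid-run leaves the writer unclosed; a hedged sketch of the same sequence with try-with-resources (assuming Java 7+ and a dir and iwc configured as in the example) avoids that:

try (IndexWriter writer = new IndexWriter(dir, iwc)) {
    // ... addDocument() calls ...
    writer.commit();
    writer.forceMerge(1); // costly; only worth it once the index is static
} // writer.close() runs even if indexing throws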

From source file:MakeLuceneIndexPreprocessed.java

License:Apache License

/** Index all text files under a directory. 
 * @throws UnsupportedEncodingException 
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";

    String inputLuceneIndexName = "2013-06-18-lucene-gab";
    String luceneIndexName = "2013-06-18-lucene-gab-standard";

    System.currentTimeMillis();

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-inputluceneindex"))
            inputLuceneIndexName = args[++i];

        if (args[i].equals("-outputluceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

    }
    String rawTextPath = baseDir + inputLuceneIndexName + "-raw-text.txt";
    String artikelInLinksPath = baseDir + inputLuceneIndexName + "-inlinks.txt";
    String logPath = baseDir + inputLuceneIndexName + ".log";

    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();
    logger.println(start.toString() + " iArticleCount: 0 iSkippedPageCount: 0");

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        //         Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        iwc.setRAMBufferSizeMB(2000.0);
        //         iwc.setSimilarity(new ESASimilarity());

        IndexWriter writer = new IndexWriter(dir, iwc);

        int iArticleCount = 0;
        int iSkippedPageCount = 0;

        BufferedReader rawTextReader = new BufferedReader(new FileReader(rawTextPath));
        BufferedReader artikelInLinksReader = new BufferedReader(new FileReader(artikelInLinksPath));
        String lineText = rawTextReader.readLine();
        String lineLinks = artikelInLinksReader.readLine();

        while (lineText != null) {
            //            String title = lineText.substring(0, lineText.indexOf("\t")); 
            //            while(!title.equals(lineLinks.substring(0, lineLinks.indexOf("\t")))){
            //               lineLinks = artikelInLinksReader.readLine();
            //            }
            int endOfTitle = lineText.indexOf("\t");
            String title = lineText.substring(0, endOfTitle);

            if (Integer.valueOf(lineLinks.substring(lineLinks.indexOf("\t") + 1)) > 0) {
                ++iArticleCount;
                Document doc = new Document();
                doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                        + lineText.substring(endOfTitle + 1), Field.Store.NO));
                //               System.out.println(title + " " + 
                //               title + " " + 
                //               title + " " + 
                //               title + " " +
                //               lineText.substring(endOfTitle+1));
                writer.addDocument(doc);

                if (iArticleCount % 1000 == 0) {
                    writer.commit();
                    logger.println(new Date().toString() + "phase 2 -- iArticleCount: " + iArticleCount
                            + " iSkippedPageCount: " + iSkippedPageCount);
                    logger.flush();
                }
            }
            lineText = rawTextReader.readLine();
            lineLinks = artikelInLinksReader.readLine();
        }
        rawTextReader.close();
        artikelInLinksReader.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
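
The same closing sequence appears here. By default forceMerge(1) blocks until the merge finishes; if the caller would rather keep working while merges run in the background, Lucene also offers a two-argument overload. A hedged sketch, assuming a configured IndexWriter that stays open until merging completes:

writer.forceMerge(1, false); // doWait = false: request the merge, return immediately
// ... other work while background merge threads run ...
writer.waitForMerges();      // block until outstanding merges finish
writer.close();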

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates the IRMNG homonym index based on the DWCA and species homonyms supplied from the NSL
 * @param exportsDir
 * @param indexDir
 * @throws Exception
 */
public void createIrmngIndex(String exportsDir, String indexDir) throws Exception {
    Analyzer analyzer = new LowerCaseKeywordAnalyzer();
    IndexWriter irmngWriter = createIndexWriter(new File(indexDir + File.separator + "irmng"), analyzer, true);
    indexIrmngDwcA(irmngWriter, irmngDwcaDirectory);
    indexIRMNG(irmngWriter, exportsDir + File.separator + "ala-species-homonyms.txt", RankType.SPECIES);
    irmngWriter.forceMerge(1);
    irmngWriter.close();
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

public void createIndex(String exportsDir, String indexDir, String acceptedFile, String synonymFile,
        String irmngDwca, boolean generateSciNames, boolean generateCommonNames) throws Exception {

    Analyzer analyzer = new LowerCaseKeywordAnalyzer();
    //generate the extra id index
    createExtraIdIndex(indexDir + File.separator + "id",
            new File(exportsDir + File.separator + "identifiers.txt"));
    if (generateSciNames) {
        indexALA(createIndexWriter(new File(indexDir + File.separator + "cb"), analyzer, true), acceptedFile,
                synonymFile);//exportsDir + File.separator + "ala_accepted_concepts_dump.txt");//, exportsDir + File.separator + lexFile);
        //IRMNG index to aid in the resolving of homonyms
        IndexWriter irmngWriter = createIndexWriter(new File(indexDir + File.separator + "irmng"), analyzer,
                true);
        indexIrmngDwcA(irmngWriter, irmngDwca);

        indexIRMNG(irmngWriter, exportsDir + File.separator + "ala-species-homonyms.txt", RankType.SPECIES);
        irmngWriter.forceMerge(1);
        irmngWriter.close();
    }
    if (generateCommonNames) {
        //vernacular index to search for common names
        indexCommonNames(createIndexWriter(new File(indexDir + File.separator + "vernacular"),
                new KeywordAnalyzer(), true), exportsDir, indexDir);
    }
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates the temporary index that provides a lookup of checklist bank id to
 * GUID
 */
private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception {
    System.out.println("Starting to create the tmp guid index...");
    IndexWriter iw = createIndexWriter(new File("/data/tmp/guid"), new KeywordAnalyzer(), true);
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(
            new FileReader(cbExportFile), '\t', '"', '/', 1);
    for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
        Document doc = new Document();
        String id = values[POS_ID];
        String guid = values[POS_LSID];
        doc.add(new StringField("id", id, Store.YES));
        if (StringUtils.isEmpty(guid))
            guid = id; // fall back to the id when no LSID was exported

        doc.add(new StoredField("guid", guid));
        iw.addDocument(doc);
    }
    System.out.println("Finished writing the tmp guid index...");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    //As of lucene 4.0 all IndexReaders are read only
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("/data/tmp/guid"))));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

private void indexALA(IndexWriter iw, String file, String synonymFile) throws Exception {
    int records = 0;
    long time = System.currentTimeMillis();
    au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file), '\t',
            '"', '\\', 1);
    for (String[] values = reader.readNext(); values != null; values = reader.readNext()) {

        String lsid = values[POS_LSID];
        String id = values[POS_ID];
        String rank = values[POS_RANK];
        int rankId = -1;
        try {
            rankId = Integer.parseInt(values[POS_RANK_ID]);
        } catch (Exception e) {
            // rank id is optional; fall through with rankId = -1
        }

        String acceptedValues = values[POS_ACC_LSID];
        float boost = 1.0f;
        //give the major ranks a larger boost
        if (rankId % 1000 == 0) {
            boost = 5.0f;
        }
        //give non-col concepts a higher boost
        String source = values[POS_SRC];
        if (!source.trim().equals("") && !source.equalsIgnoreCase("CoL")) {
            boost = boost * 2;
        }

        Document doc = createALAIndexDocument(values[POS_SCI_NAME], id, lsid, values[POS_RANK_ID],
                values[POS_RANK], values[POS_K], values[POS_KID], values[POS_P], values[POS_PID], values[POS_C],
                values[POS_CID], values[POS_O], values[POS_OID], values[POS_F], values[POS_FID], values[POS_G],
                values[POS_GID], values[POS_S], values[POS_SID], values[POS_LFT], values[POS_RGT],
                acceptedValues, values[POS_SP_EPITHET], values[POS_INFRA_EPITHET], values[POS_AUTHOR], boost);

        //add the excluded information if applicable
        if ("T".equals(values[POS_EXCLUDED]) || "Y".equals(values[POS_EXCLUDED])) {
            doc.add(new TextField(NameIndexField.SYNONYM_TYPE.toString(),
                    SynonymType.EXCLUDES.getId().toString(), Store.YES));
        }
        if (doc != null) {
            iw.addDocument(doc);
            records++;
            if (records % 100000 == 0) {
                log.info("Processed " + records + " in " + (System.currentTimeMillis() - time) + " msecs");
            }
        }
    }
    addExtraALAConcept(iw, extraALAConcepts);
    //add the synonyms
    addALASyonyms(iw, synonymFile);
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    log.info("Lucene index created - processed a total of " + records + " records in "
            + (System.currentTimeMillis() - time) + " msecs ");
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Indexes common names from CoL and ANBG for use in the Common name search.
 *
 * @param iw  The index writer to write the common documents to
 * @param exportDir  The directory that contains the common name export files.
 * @param indexDir The directory in which to create the index.
 * @throws Exception
 */
private void indexCommonNames(IndexWriter iw, String exportDir, String indexDir) throws Exception {
    log.info("Creating Common Names Index ...");

    //TODO think about adding additional sources for common names

    IndexSearcher currentNameSearcher = new IndexSearcher(
            DirectoryReader.open(FSDirectory.open(new File(indexDir + File.separator + "cb"))));
    IndexSearcher extraSearcher = new IndexSearcher(
            DirectoryReader.open(FSDirectory.open(new File(indexDir + File.separator + "id"))));

    addCoLCommonNames(iw, currentNameSearcher);
    addAnbgCommonNames(afdFile, iw, currentNameSearcher, extraSearcher, '\t');
    addAnbgCommonNames(apniFile, iw, currentNameSearcher, extraSearcher, ',');

    iw.commit();
    iw.forceMerge(1);
    iw.close();
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates a temporary index that will provide a lookup up of lsid to "real lsid".
 * <p/>
 * This deals with the following situations:
 * - common names that are sourced from CoL (LSIDs will be mapped to corresponding ANBG LSID)
 * - Multiple ANBG LSIDs exist for the same scientific name and more than 1 are mapped to the same common name.
 *
 * @param idFile
 * @throws Exception
 */
private void createExtraIdIndex(String idxLocation, File idFile) throws Exception {
    CSVReader reader = new CSVReader(new FileReader(idFile), '\t', '"', '~');//CSVReader.build(idFile, "UTF-8", "\t", '"', 0);
    File indexDir = new File(idxLocation);
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);//new IndexWriter(FSDirectory.open(indexDir), new KeywordAnalyzer(), true, MaxFieldLength.UNLIMITED);
    String[] values = null;
    while ((values = reader.readNext()) != null) {

        if (values != null && values.length >= 3) {
            Document doc = new Document();
            //doc.add(new Field("lsid", values[2], Store.NO, Index.NOT_ANALYZED));
            doc.add(new StringField("lsid", values[2], Store.NO));
            //doc.add(new Field("reallsid", values[1], Store.YES, Index.NO));
            doc.add(new StoredField("reallsid", values[1]));
            iw.addDocument(doc);
        }
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    idSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates a temporary index that stores the taxon concept LSIDs that were
 * included in the last ANBG exports.
 *
 * @param tcFileName
 * @return
 * @throws Exception
 */
private IndexSearcher createTmpIndex(String tcFileName) throws Exception {
    //creating the tmp index in the /tmp/taxonConcept directory
    CSVReader reader = new CSVReader(new FileReader(new File(tcFileName)), '\t', '"', '~');
    File indexDir = new File("/tmp/taxonConcept");
    IndexWriter iw = createIndexWriter(indexDir, new KeywordAnalyzer(), true);
    String[] values = null;
    while ((values = reader.readNext()) != null) {
        if (values != null && values.length > 1) {
            //just add the LSID to the index
            Document doc = new Document();

            doc.add(new StringField("lsid", values[0], Store.NO));
            iw.addDocument(doc);

        }
    }
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(indexDir)));
}

From source file:au.org.ala.names.search.DwcaNameIndexer.java

License:Open Source License

/**
 * Creates the name matching index based on a complete list of names supplied in a single DwCA
 *
 * @param loadingIndex True when the loading index should be created. This is necessary to generate the index, but you may wish to skip this step if it has been generated earlier
 * @param sciIndex True when the name matching index should be generated
 * @param indexDirectory The directory in which to create the name matching index
 * @param tmpLoadIndex The directory in which to create the temporary loading index
 * @param namesDwc The absolute path to the directory that contains the unzipped DWC archive to index
 * @param irmngDwc The absolute path to the directory that contains the unzipped IRMNG DWCA
 * @param commonNameFile
 * @throws Exception
 */
public void create(boolean loadingIndex, boolean sciIndex, String indexDirectory, String tmpLoadIndex,
        String namesDwc, String irmngDwc, String commonNameFile) throws Exception {
    dirTmpIndex = tmpLoadIndex;
    LowerCaseKeywordAnalyzer analyzer = new LowerCaseKeywordAnalyzer();
    if (loadingIndex) {
        createLoadingIndex(tmpLoadIndex, namesDwc);
    }
    if (sciIndex) {
        writer = createIndexWriter(new File(indexDirectory + File.separator + "cb"), analyzer, true);
        generateIndex();
        addSynonymsToIndex(namesDwc);
        writer.commit();
        writer.forceMerge(1);
        writer.close();
    }
    if (irmngDwc != null && new File(irmngDwc).exists()) {
        IndexWriter irmngWriter = createIndexWriter(new File(indexDirectory + File.separator + "irmng"),
                analyzer, true);
        this.indexIrmngDwcA(irmngWriter, irmngDwc);
        irmngWriter.forceMerge(1);
        irmngWriter.close();
    }
    if (commonNameFile != null && new File(commonNameFile).exists()) {
        //index the common names
        indexCommonNames(createIndexWriter(new File(indexDirectory + File.separator + "vernacular"),
                new KeywordAnalyzer(), true), commonNameFile);
    }
}
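
Every example on this page merges down to a single segment, which gives the best search performance at the highest merge cost. Since the method accepts any positive maxNumSegments, a cheaper middle ground is possible; the target of 5 below is an arbitrary illustration, not taken from the sources above:

writer.commit();
writer.forceMerge(5); // merge until there are <= 5 segments; cheaper than forceMerge(1)
writer.close();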