Example usage for org.apache.lucene.index CheckIndex CheckIndex

List of usage examples for org.apache.lucene.index CheckIndex CheckIndex

Introduction

In this page you can find the example usage for org.apache.lucene.index CheckIndex CheckIndex.

Prototype

public CheckIndex(Directory dir) throws IOException 

Source Link

Document

Create a new CheckIndex on the directory.

Usage

From source file:com.vmware.dcp.services.common.LuceneDocumentIndexService.java

License:Open Source License

/**
 * Checks the segments of the given directory and, if any segment was written
 * by a Lucene version older than {@link Version#LATEST}, upgrades the whole
 * index in place with {@link IndexUpgrader}.
 */
private void upgradeIndex(Directory dir) throws IOException {
    IndexWriterConfig writerConfig = new IndexWriterConfig(null);
    boolean needsUpgrade = false;

    CheckIndex indexChecker = new CheckIndex(dir);
    try {
        CheckIndex.Status status = indexChecker.checkIndex();
        for (CheckIndex.Status.SegmentInfoStatus segment : status.segmentInfos) {
            if (segment.version.equals(Version.LATEST)) {
                continue;
            }
            logInfo("Found Index version %s", segment.version.toString());
            needsUpgrade = true;
            break;
        }
    } finally {
        // always release the checker, even when checkIndex() throws
        indexChecker.close();
    }

    if (!needsUpgrade) {
        return;
    }
    logInfo("Upgrading index to %s", Version.LATEST.toString());
    new IndexUpgrader(dir, writerConfig, false).upgrade();
    this.indexUpdateTimeMicros = Utils.getNowMicrosUtc();
}

From source file:com.zimbra.cs.index.LuceneIndex.java

License:Open Source License

/**
 * Run a sanity check for the index. Callers are responsible to make sure the index is not opened by any writer.
 *
 * @param out info stream where messages should go. If null, no messages are printed.
 * @return true if no problems were found, otherwise false
 * @throws IOException failed to verify, but it doesn't necessarily mean the index is corrupted.
 */
@Override
public boolean verify(PrintStream out) throws IOException {
    if (!IndexReader.indexExists(luceneDirectory)) {
        // BUGFIX: the original dereferenced 'out' here unconditionally, throwing
        // NPE when out == null — contradicting the documented contract above.
        if (out != null) {
            out.println("index does not exist or no segments file found: " + luceneDirectory.getDirectory());
        }
        return true;
    }
    CheckIndex check = new CheckIndex(luceneDirectory);
    if (out != null) {
        check.setInfoStream(out);
    }
    CheckIndex.Status status = check.checkIndex();
    return status.clean;
}

From source file:com.zimbra.cs.index.LuceneViewer.java

License:Open Source License

/**
 * CLI handler: runs Lucene's CheckIndex over the directory named by the input
 * option and reports whether the index is clean.
 *
 * @param cl parsed command line (provides the input path and verbosity flag)
 * @throws Exception if checking the index fails
 */
private static void doCheck(CommandLine cl) throws Exception {
    Console console = new Console(cl.hasOption(CLI.O_VERBOSE));

    String indexDir = cl.getOptionValue(CLI.O_INPUT);
    console.info("Checking index " + indexDir);

    Directory dir = null;
    try {
        dir = LuceneDirectory.open(new File(indexDir));
    } catch (Throwable t) {
        console.info("ERROR: could not open directory \"" + indexDir + "\"; exiting");
        t.printStackTrace(System.out);
        System.exit(1);
    }

    // FIX: the original never closed the directory; release it even if the
    // check itself throws.
    try {
        CheckIndex checker = new CheckIndex(dir);
        checker.setInfoStream(System.out);

        Status result = checker.checkIndex();
        console.info("Result:" + (result.clean ? "clean" : "not clean"));
    } finally {
        if (dir != null) {
            dir.close();
        }
    }
}

From source file:io.anserini.integration.EndToEndTest.java

License:Apache License

/**
 * Verifies the index built by the test with Lucene's CheckIndex and asserts
 * the expected per-segment statistics.
 *
 * <p>CheckIndex output is buffered and only dumped when the check fails
 * (or streamed directly to stdout when VERBOSE is set).
 *
 * @throws IOException if the index directory cannot be opened or checked
 */
protected void checkIndex() throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
    // FIX: use try-with-resources so the directory and the checker are closed
    // even when an assertion throws; the original leaked both on failure and
    // never closed the directory at all.
    try (Directory dir = FSDirectory.open(Paths.get(this.indexOutputPrefix + this.collectionClass));
            CheckIndex checker = new CheckIndex(dir)) {
        checker.setInfoStream(new PrintStream(bos, false, IOUtils.UTF_8));
        if (VERBOSE) {
            // verbose mode streams directly instead of buffering
            checker.setInfoStream(System.out);
        }
        CheckIndex.Status indexStatus = checker.checkIndex();
        if (!indexStatus.clean) {
            System.out.println("CheckIndex failed");
            System.out.println(bos.toString(IOUtils.UTF_8));
            fail();
        }

        // The test collection is expected to produce a single segment.
        final CheckIndex.Status.SegmentInfoStatus seg = indexStatus.segmentInfos.get(0);
        assertTrue(seg.openReaderPassed);

        assertNotNull(seg.diagnostics);

        assertNotNull(seg.fieldNormStatus);
        assertNull(seg.fieldNormStatus.error);
        assertEquals(this.fieldNormStatusTotalFields, seg.fieldNormStatus.totFields);

        assertNotNull(seg.termIndexStatus);
        assertNull(seg.termIndexStatus.error);
        assertEquals(this.termIndexStatusTermCount, seg.termIndexStatus.termCount);
        assertEquals(this.termIndexStatusTotFreq, seg.termIndexStatus.totFreq);
        assertEquals(this.termIndexStatusTotPos, seg.termIndexStatus.totPos);

        assertNotNull(seg.storedFieldStatus);
        assertNull(seg.storedFieldStatus.error);
        assertEquals(this.storedFieldStatusTotalDocCounts, seg.storedFieldStatus.docCount);
        assertEquals(this.storedFieldStatusTotFields, seg.storedFieldStatus.totFields);

        assertTrue(seg.diagnostics.size() > 0);

        // A second, targeted run restricted to the first segment must also be clean.
        final List<String> onlySegments = new ArrayList<>();
        onlySegments.add("_0");
        assertTrue(checker.checkIndex(onlySegments).clean);
    }
}

From source file:kbp2013.index.IndexSourceCorpus.java

License:Open Source License

/**
 * Indexes the KBP source corpus into a Lucene index.
 *
 * <p>Reads a list of file names from {@code homelist}, extracts each
 * {@code <DOC ...>...</DOC>} element, derives the document id from either a
 * {@code <DOCID>} tag or a {@code <DOC id="...">} attribute, and indexes the
 * full document text. Commits periodically so that a crash loses little work.
 *
 * @param args unused
 * @throws IOException if the index directory or a corpus file cannot be read or written
 */
public static void main(String[] args) throws IOException {

    initializeFromDefault();

    int managed = 0; // documents indexed since the last periodic commit
    int counted = 0; // total documents indexed so far (progress display)
    int tocount = 10; // commit/display threshold

    System.out.println("Indexing to directory '" + luceneIndex + "'...");

    INDEX_DIR = new File(luceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }

    Directory dir = FSDirectory.open(new File(luceneIndex));

    // Open lucene stuff 
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    // iwc.setRAMBufferSizeMB(1024); // http://wiki.apache.org/lucene-java/ImproveIndexingSpeed
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, sanity-check the existing index first
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }

    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    final File docDir = new File(home);
    System.out.println("Indexing directory '" + home + "'...");
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    // read the list of corpus files to process
    BufferedReader reader = new BufferedReader(new FileReader(homelist));

    // read line by line each file name
    String text = "";
    boolean verbose = true;

    while ((text = reader.readLine()) != null) {

        String filename = home + text;
        final File testFile = new File(filename);

        // verbose - remove for one line files
        if (verbose) {
            System.out.println("---V-->" + "Indexing content of " + filename);
        }

        // BUGFIX: the original tested filename.contains("\\.gz"). String.contains
        // takes a literal CharSequence, not a regex, so the backslash made the
        // test never match and gzipped files were never skipped. endsWith(".gz")
        // expresses the intent directly.
        if (testFile.isFile() && !filename.endsWith(".gz")) {

            // open file and read
            FileReader fread = new FileReader(filename);
            BufferedReader readerDoc = new BufferedReader(fread);

            // initialize variables for the loop
            String fileRef = ""; // the line containing the document id
            String fromfile = ""; // the reader cursor for the whole file
            String textdoc = ""; // accumulator for the current document

            while ((fromfile = readerDoc.readLine()) != null) {
                if (fromfile.toUpperCase().contains("<DOC ID=") || fromfile.toUpperCase().contains("<DOC>")) {

                    String fromdoc = fromfile; // begin to index at the DOC ID line (keeps correct offsets for mention collection)
                    textdoc = fromfile; // initialize accumulator with the first line

                    // accumulate all the content up to </DOC>
                    while (fromdoc != null && !fromdoc.toUpperCase().contains("</DOC>")) {

                        // collect the doc id; the id line can come
                        //     - from the last fromfile (first iteration)
                        //     - from a current iteration of fromdoc (any iteration)
                        if (fromdoc.toUpperCase().contains("<DOC ID=")
                                || fromdoc.toUpperCase().contains("<DOCID>")) {
                            fileRef = fromdoc;
                        }

                        // accumulate the complete document for later offset reading of mentions
                        fromdoc = readerDoc.readLine();
                        if (fromdoc == null) {
                            // ROBUSTNESS: truncated document — EOF before </DOC>.
                            // The original threw NPE here (toUpperCase on null).
                            break;
                        }
                        textdoc = textdoc + "\n" + fromdoc;

                    }

                    // locate the id; 2 forms:
                    // <DOCID> ALHURRA_NEWS13_ARB_20050412_130100-2.LDC2006E92 </DOCID>
                    // <doc id="bolt-eng-DF-183-195681-7948494">
                    String idStr = fileRef;

                    // form 1: strip the tags and spaces
                    if (idStr.contains("<DOCID>")) {
                        idStr = idStr.replace("<DOCID>", "");
                        idStr = idStr.replace("</DOCID>", "");
                        idStr = idStr.replace(" ", ""); // strip spaces
                    }
                    // form 2: strip the attribute syntax
                    if (idStr.contains("<DOC id=")) {
                        idStr = idStr.replace("<DOC id=\"", "");
                        idStr = idStr.replaceAll("\".+>$", "");
                    }

                    indexDocs(writer, idStr, textdoc);

                    // display info
                    managed++;
                    counted++;

                    // verbose - remove for 1 doc files
                    if (verbose) {
                        System.out.println(
                                "---V-->" + counted + ":" + filename + ":" + idStr + ":" + textdoc.length());
                    }

                    if (managed > tocount) {
                        managed = 0;
                        System.out.println(counted + ":" + filename + ":------>" + idStr);

                        // flush pending documents to disk
                        writer.commit();
                    }
                } // end of if

            } // end of while
            readerDoc.close();
            fread.close();

        } else {

            System.out.println(counted + ":Non lisible ou non requis:" + filename);

        }

    }

    // close properly the index writer 
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    writer.close();
    reader.close();

}

From source file:kbp2013.index.IndexWikipediaCorpus.java

License:Open Source License

/**
 * Indexes a Wikipedia XML dump into a Lucene index.
 *
 * <p>First reads the {@code <siteinfo>} header to collect non-article
 * namespace prefixes (e.g. "Talk:"), then scans {@code <page>} elements,
 * skipping namespace pages and redirects, and indexes the rest by title.
 * Commits periodically so a crash loses little work.
 *
 * @param args unused
 * @throws IOException if the dump or the index directory cannot be read or written
 */
public static void main(String[] args) throws IOException {

    initializeFromDefault();

    int managed = 0; // pages seen since the last periodic commit
    int counted = 0; // total pages seen so far (progress display)
    int tocount = 1000; // commit/display threshold
    int saved = 0; // pages actually indexed

    System.out.println("Indexing Wikipedia Dump to directory '" + wikiluceneIndex + "'...");

    INDEX_DIR = new File(wikiluceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }

    Directory dir = FSDirectory.open(new File(wikiluceneIndex));

    // Open lucene stuff 
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    // configure Lucene Stuff
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, check index
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }

    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    // --------------------------
    //
    // Open the Wikipedia Dump
    //
    //---------------------------
    BufferedReader reader = new BufferedReader(new FileReader(wikidump));

    // read the namespace domains from the <siteinfo> header
    String text = "";
    // FIX: typed the raw ArrayList — it holds namespace prefixes as Strings
    ArrayList<String> domain = new ArrayList<>();

    while (text != null && !text.contains("</siteinfo>")) {
        text = reader.readLine();
        if (text == null) {
            // ROBUSTNESS: malformed dump — EOF before </siteinfo>.
            // The original threw NPE here on the next contains() call.
            break;
        }
        if (text.contains("<namespace key=") && !text.contains("<namespace key=\"0")) {

            String thisnamespace = text.replaceAll("<namespace key=[^>]+>", "");
            thisnamespace = thisnamespace.replaceAll("</namespace>", "");
            thisnamespace = thisnamespace.replaceAll("^[ ]+", "");
            thisnamespace = thisnamespace + ":";
            if (!thisnamespace.contentEquals("")) {
                domain.add(thisnamespace);
                System.out.println("Registered domain:" + thisnamespace + ";");
            }
        }
    }

    System.out.println("--------------------------------");

    // read the pages
    while ((text = reader.readLine()) != null) {

        String textdoc = ""; // accumulator for the current page
        String pagename = "";
        boolean tosave = true;

        // beginning of a page: accumulate until </page>
        if (text.contains("<page>")) {

            textdoc = text;

            while (!text.contains("</page>")) {
                text = reader.readLine();
                if (text == null) {
                    // ROBUSTNESS: truncated page at EOF; the original threw
                    // NPE on the next contains() call.
                    break;
                }
                textdoc = textdoc + text;

                if (text.contains("<title>")) {

                    pagename = text.replaceAll("<title>", "");
                    pagename = pagename.replaceAll("</title>", "");
                    pagename = pagename.replaceAll("[ ]{2,10}", "");

                }

            }

            // after page reading, verify that the document
            //         A) is not from a non-article namespace
            //         B) is not a redirect
            for (int a = 0; a < domain.size(); a++) {
                String domaintosearch = domain.get(a);
                if (pagename.toLowerCase().contains(domaintosearch.toLowerCase())) {
                    System.out.println("Specific page:" + pagename);
                    tosave = false;
                }
            }
            if (textdoc.contains("#REDIRECT")) {
                tosave = false;
            }

            if (tosave) {
                saved++;
                indexDocs(writer, pagename, textdoc);
            }

            // display info
            managed++;
            counted++;

            if (managed > tocount) {
                managed = 0;
                System.out.println(counted + ":" + saved + ":" + pagename + ":------>" + textdoc.length());
                writer.commit();
            }
        }

        if (text == null) {
            break; // EOF reached while accumulating a page
        }

    } // end while

    // close properly the index writer 
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    writer.close();
    reader.close();

}

From source file:kbp2013.index.IndexWikipediaCorpus_v2.java

License:Open Source License

/**
 * Indexes a bzip2-compressed Wikipedia XML dump into a Lucene index.
 *
 * <p>Streams the dump line by line, buffering each {@code <page>} element;
 * pages in non-zero namespaces and redirect pages are skipped, the rest are
 * indexed by lower-cased title.
 *
 * @param args unused
 * @throws IOException if the dump or the index directory cannot be read or written
 * @throws Exception propagated from the indexing helpers
 */
public static void main(String[] args) throws IOException, Exception {

    initializeFromDefault();

    System.out.println("Indexing Wikipedia Dump to directory '" + wikiluceneIndex + "'...");

    INDEX_DIR = new File(wikiluceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }

    if (wikidump.endsWith(".bzip2") == false) {
        System.out.println("NOTICE: The Wikipedia dump must be in bzip2 format.");
        System.exit(0);
    }

    Directory dir = FSDirectory.open(new File(wikiluceneIndex));

    // Open lucene stuff
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    // configure Lucene Stuff
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, check index
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }

    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    // --------------------------
    //
    // Open the Wikipedia Dump
    //
    //---------------------------
    // Process the large xml file in bzip2 format as a stream
    InputStream fileInputStream = new BufferedInputStream(new FileInputStream(wikidump));
    BZip2InputStream inputStream = new BZip2InputStream(fileInputStream, false);

    InputStreamReader isr = new InputStreamReader(inputStream);
    BufferedReader reader = new BufferedReader(isr);

    String line;
    // temporarily stores the content of each page
    StringBuilder pageBuffer = new StringBuilder();
    // title of the current page
    String docTitle = "";
    // content of the current page
    String content = "";

    int docCount = 0; // number of documents that have been stored
    Date start = new Date(); // log the time when the indexing process starts

    while ((line = reader.readLine()) != null) {
        if (StringUtils.contains(line, "</page>") == true) {
            if (pageBuffer.length() > 0) {
                // get the title of the page
                // NOTE(review): assumes every buffered page contains a
                // <title>...</title> pair; a missing tag would make the
                // indexOf arithmetic below throw — confirm against the dump format.
                int startIndex = pageBuffer.toString().indexOf("<title>") + 7;
                int endIndex = pageBuffer.toString().indexOf("</title>");
                docTitle = pageBuffer.toString().substring(startIndex, endIndex);
                // get the content of the page
                int startPageIndex = pageBuffer.toString().indexOf("<page>");
                content = pageBuffer.toString().substring(startPageIndex) + "</page>";
                // verify the namespace of the page, it should be 0
                int namespaceValue = Integer
                        .parseInt(content.substring(content.indexOf("<ns>") + 4, content.indexOf("</ns>")));
                if (namespaceValue != 0) {
                    // reset buffer
                    pageBuffer = new StringBuilder();
                    continue;
                }
                // verify that it is not a redirect page
                if (content.indexOf("<text xml:space=\"preserve\">#REDIRECT") != -1) {
                    // reset buffer
                    pageBuffer = new StringBuilder();
                    continue;
                } else {
                    indexDocument(writer, content, docTitle.toLowerCase());
                    // BUGFIX: increment before logging; the original printed the
                    // count before the increment, so it was always one behind.
                    docCount++;
                    System.err.println("Processed " + docCount + " documents");
                }

            }
            // reset buffer
            pageBuffer = new StringBuilder();
        }
        pageBuffer.append(line);

    }

    // close properly the index writer
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    // (closing the reader closes the whole bzip2/stream chain; the original
    // closed the underlying FileInputStream separately and then the reader again)
    reader.close();
    writer.close();

    Date end = new Date();

    System.err.println(end.getTime() - start.getTime() + " total milliseconds");

}

From source file:net.dataforte.infinispan.amanuensis.ExecutorContext.java

License:Open Source License

/**
 * Runs Lucene's CheckIndex against the directory after force-unlocking it.
 *
 * @param fix if true, attempt to repair the index when problems are found
 *            (repair drops any segments CheckIndex could not read)
 * @return the CheckIndex status, or null if the check failed with an I/O error
 */
public synchronized Status check(boolean fix) {
    try {
        forceUnlock();
        CheckIndex checker = new CheckIndex(directory);
        Status check = checker.checkIndex();
        if (check.clean) {
            log.info("Index " + AmanuensisManager.getUniqueDirectoryIdentifier(directory) + " is clean");
        } else {
            log.warn("Index " + AmanuensisManager.getUniqueDirectoryIdentifier(directory)
                    + " is NOT clean" + (fix ? ", fixing..." : ""));
            // BUGFIX: the original ignored the 'fix' parameter and always ran the
            // (destructive) repair on an unclean index.
            if (fix) {
                checker.fixIndex(check);
            }
        }
        return check;
    } catch (IOException e) {
        log.error("", e);
        return null;
    }
}

From source file:org.apache.solr.index.hdfs.CheckHdfsIndex.java

License:Apache License

@SuppressForbidden(reason = "System.out required: command line tool")
protected static int doMain(String[] args) throws IOException, InterruptedException {
    // Parse the command line; surface the parser's own message on bad input.
    final CheckIndex.Options opts;
    try {
        opts = CheckIndex.parseOptions(args);
    } catch (IllegalArgumentException e) {
        System.out.println(e.getMessage());
        return 1;
    }

    // Advise enabling assertions for a more thorough check.
    if (!CheckIndex.assertsOn()) {
        System.out.println(
                "\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled");
    }

    // This tool always checks an HDFS-backed directory, regardless of -dir-impl.
    if (opts.getDirImpl() != null) {
        System.out.println(
                "\nIgnoring specified -dir-impl, instead using " + HdfsDirectory.class.getSimpleName());
    }

    System.out.println("\nOpening index @ " + opts.getIndexPath() + "\n");

    final Directory hdfsDirectory;
    try {
        hdfsDirectory = new HdfsDirectory(new Path(opts.getIndexPath()), getConf());
    } catch (IOException e) {
        System.out.println("ERROR: could not open hdfs directory \"" + opts.getIndexPath() + "\"; exiting");
        e.printStackTrace(System.out);
        return 1;
    }

    // try-with-resources closes both the directory and the checker.
    try (Directory dir = hdfsDirectory; CheckIndex checker = new CheckIndex(dir)) {
        opts.setOut(System.out);
        return checker.doCheck(opts);
    }
}

From source file:org.elasticsearch.common.util.MultiDataPathUpgrader.java

License:Apache License

/**
 * Runs Lucene's CheckIndex on the shard at {@code targetPath} and throws if
 * the index is not clean. CheckIndex output is buffered and only logged on
 * failure.
 *
 * @param targetPath the shard whose index directory should be verified
 * @throws IOException if the directory cannot be opened or checked
 * @throws IllegalStateException if CheckIndex reports the index as corrupt
 */
public void checkIndex(ShardPath targetPath) throws IOException {
    BytesStreamOutput messages = new BytesStreamOutput();
    PrintStream infoStream = new PrintStream(messages, false, Charsets.UTF_8.name());
    try (Directory shardDir = new SimpleFSDirectory(targetPath.resolveIndex());
            final CheckIndex indexChecker = new CheckIndex(shardDir)) {
        indexChecker.setInfoStream(infoStream);
        CheckIndex.Status result = indexChecker.checkIndex();
        infoStream.flush();
        if (result.clean) {
            return;
        }
        logger.warn("check index [failure]\n{}", new String(messages.bytes().toBytes(), Charsets.UTF_8));
        throw new IllegalStateException("index check failure");
    }
}