Example usage for org.apache.lucene.index IndexWriter commit

List of usage examples for org.apache.lucene.index IndexWriter commit

Introduction

On this page you can find example usage for org.apache.lucene.index IndexWriter#commit.

Prototype

@Override
public final long commit() throws IOException 

Source Link

Document

Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.

Usage

From source file:kbp2013.index.IndexSourceCorpus.java

License:Open Source License

/**
 * Indexes the KBP source corpus into a Lucene index.
 *
 * Reads the list of corpus files named in {@code homelist} (one file name per
 * line, relative to {@code home}), extracts each &lt;DOC&gt; element, derives its
 * document id and indexes it via {@code indexDocs}, committing every
 * {@code tocount} documents so progress survives a crash.
 *
 * @param args unused; configuration comes from initializeFromDefault()
 * @throws IOException if the file list, a corpus file or the index cannot be read or written
 */
public static void main(String[] args) throws IOException {

    initializeFromDefault();

    int managed = 0; // documents indexed since the last commit
    int counted = 0; // total documents indexed (progress display)
    int tocount = 10; // commit every `tocount` documents

    System.out.println("Indexing to directory '" + luceneIndex + "'...");

    INDEX_DIR = new File(luceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }

    Directory dir = FSDirectory.open(new File(luceneIndex));

    // Open lucene stuff
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    // iwc.setRAMBufferSizeMB(1024); // http://wiki.apache.org/lucene-java/ImproveIndexingSpeed
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, verify the existing index first
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }

    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    final File docDir = new File(home);
    System.out.println("Indexing directory '" + home + "'...");
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    // the file that lists, one name per line, every corpus file to index
    BufferedReader reader = new BufferedReader(new FileReader(homelist));

    String text = "";
    boolean verbose = true;

    while ((text = reader.readLine()) != null) {

        String filename = home + text;
        final File testFile = new File(filename);

        // verbose - remove for one line files
        if (verbose) {
            System.out.println("---V-->" + "Indexing content of " + filename);
        }

        // BUGFIX: the original tested filename.contains("\\.gz"), which looks for
        // the literal characters backslash-dot-g-z (String.contains takes a plain
        // CharSequence, not a regex) and therefore never skipped gzipped files.
        if (testFile.isFile() && !filename.endsWith(".gz")) {

            // open file and read
            FileReader fread = new FileReader(filename);
            BufferedReader readerDoc = new BufferedReader(fread);

            // initialize variables for the loop
            String fileRef = ""; // the line containing the document id
            String fromfile = ""; // reader cursor over the whole file
            String textdoc = ""; // accumulated text of the current document

            while ((fromfile = readerDoc.readLine()) != null) {
                if (fromfile.toUpperCase().contains("<DOC ID=") || fromfile.toUpperCase().contains("<DOC>")) {

                    // begin to index at the DOC line (to keep good offsets for mention collection)
                    String fromdoc = fromfile;

                    // accumulate all the content; StringBuilder avoids the
                    // O(n^2) cost of repeated String concatenation
                    StringBuilder docText = new StringBuilder(fromfile);

                    // BUGFIX: guard against a truncated file with no closing </DOC>;
                    // the original called toUpperCase() on the null returned by
                    // readLine() and threw a NullPointerException.
                    while (fromdoc != null && !fromdoc.toUpperCase().contains("</DOC>")) {

                        // collect the doc id; it can come:
                        //     - from the last fromfile (first iteration)
                        //     - from a current iteration of fromdoc (any iteration)
                        if (fromdoc.toUpperCase().contains("<DOC ID=")
                                || fromdoc.toUpperCase().contains("<DOCID>")) {
                            fileRef = fromdoc;
                        }

                        // accumulate the complete document for later offset reading of mentions
                        fromdoc = readerDoc.readLine();
                        if (fromdoc != null) {
                            docText.append("\n").append(fromdoc);
                        }
                    }
                    textdoc = docText.toString();

                    // locate the id; 2 forms:
                    // <DOCID> ALHURRA_NEWS13_ARB_20050412_130100-2.LDC2006E92 </DOCID>
                    // <doc id="bolt-eng-DF-183-195681-7948494">
                    String idStr = fileRef;

                    if (idStr.contains("<DOCID>")) {
                        idStr = idStr.replace("<DOCID>", "");
                        idStr = idStr.replace("</DOCID>", "");
                        idStr = idStr.replace(" ", ""); // strip the spaces
                    }
                    if (idStr.contains("<DOC id=")) {
                        idStr = idStr.replace("<DOC id=\"", "");
                        idStr = idStr.replaceAll("\".+>$", "");
                    }

                    indexDocs(writer, idStr, textdoc);

                    // display info
                    managed++;
                    counted++;

                    // verbose - remove for 1 doc files
                    if (verbose) {
                        System.out.println(
                                "---V-->" + counted + ":" + filename + ":" + idStr + ":" + textdoc.length());
                    }

                    // periodic commit so the index survives a crash mid-run
                    if (managed > tocount) {
                        managed = 0;
                        System.out.println(counted + ":" + filename + ":------>" + idStr);
                        writer.commit();
                    }
                } // end of if

            } // end of while
            readerDoc.close();
            fread.close();

        } else {

            System.out.println(counted + ":Non lisible ou non requis:" + filename);

        }

    }

    // close properly the index writer
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    writer.close();
    reader.close();

}

From source file:kbp2013.index.IndexWikipediaCorpus.java

License:Open Source License

/**
 * Indexes a Wikipedia XML dump into a Lucene index.
 *
 * First reads the &lt;siteinfo&gt; header to register every non-article namespace
 * prefix, then streams the dump page by page, skipping namespace pages and
 * redirects, and commits every {@code tocount} pages.
 *
 * @param args unused; configuration comes from initializeFromDefault()
 * @throws IOException if the dump or the index cannot be read or written
 */
public static void main(String[] args) throws IOException {

    initializeFromDefault();

    int managed = 0; // pages seen since the last commit
    int counted = 0; // total pages seen (progress display)
    int tocount = 1000; // commit every `tocount` pages
    int saved = 0; // pages actually indexed

    System.out.println("Indexing Wikipedia Dump to directory '" + wikiluceneIndex + "'...");

    INDEX_DIR = new File(wikiluceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }

    Directory dir = FSDirectory.open(new File(wikiluceneIndex));

    // Open lucene stuff
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    // configure Lucene Stuff
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, verify the existing index first
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }

    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    // --------------------------
    //
    // Open the Wikipedia Dump
    //
    //---------------------------
    BufferedReader reader = new BufferedReader(new FileReader(wikidump));

    // read the namespaces from the <siteinfo> header; every namespace except
    // key 0 (the main article namespace) is recorded so its pages can be skipped
    String text = "";
    ArrayList<String> domain = new ArrayList<String>();

    // BUGFIX: stop at end of stream; the original called text.contains(...) on
    // the null returned by readLine() when </siteinfo> was missing or truncated.
    while (text != null && !text.contains("</siteinfo>")) {
        text = reader.readLine();
        if (text == null) {
            break;
        }
        if (text.contains("<namespace key=") && !text.contains("<namespace key=\"0")) {

            String thisnamespace = text.replaceAll("<namespace key=[^>]+>", "");
            thisnamespace = thisnamespace.replaceAll("</namespace>", "");
            thisnamespace = thisnamespace.replaceAll("^[ ]+", "");
            thisnamespace = thisnamespace + ":";
            if (!thisnamespace.contentEquals("")) {
                domain.add(thisnamespace);
                System.out.println("Registered domain:" + thisnamespace + ";");
            }
        }
    }

    System.out.println("--------------------------------");

    // read the pages
    while ((text = reader.readLine()) != null) {

        String pagename = "";
        boolean tosave = true;

        // beginning of a page: accumulate everything up to </page>
        if (text.contains("<page>")) {

            // StringBuilder avoids O(n^2) string concatenation over a large page
            StringBuilder pageText = new StringBuilder(text);

            // BUGFIX: same end-of-stream guard for a truncated final page;
            // the original NPE'd on the null returned by readLine().
            while (text != null && !text.contains("</page>")) {
                text = reader.readLine();
                if (text == null) {
                    break;
                }
                pageText.append(text);

                if (text.contains("<title>")) {

                    pagename = text.replaceAll("<title>", "");
                    pagename = pagename.replaceAll("</title>", "");
                    pagename = pagename.replaceAll("[ ]{2,10}", "");

                }

            }

            String textdoc = pageText.toString();

            // after page reading, decide whether to index it:
            //         A) skip pages from a registered non-article namespace
            //         B) skip redirects
            for (int a = 0; a < domain.size(); a++) {
                String domaintosearch = domain.get(a);
                if (pagename.toLowerCase().contains(domaintosearch.toLowerCase())) {
                    System.out.println("Specific page:" + pagename);
                    tosave = false;
                }
            }
            if (textdoc.contains("#REDIRECT")) {
                tosave = false;
            }

            if (tosave) {
                saved++;
                indexDocs(writer, pagename, textdoc);
            }

            // display info
            managed++;
            counted++;

            // periodic commit so the index survives a crash mid-run
            if (managed > tocount) {
                managed = 0;
                System.out.println(counted + ":" + saved + ":" + pagename + ":------>" + textdoc.length());
                writer.commit();
            }
        }

    } // end while

    // close properly the index writer
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    writer.close();
    reader.close();

}

From source file:lia.chapter2.IndexingTest.java

License:Apache License

public void testDeleteBeforeOptimize() throws IOException {
    // Writer over the two-document fixture index supplied by getWriter().
    IndexWriter indexWriter = getWriter();
    assertEquals(2, indexWriter.numDocs());

    // Delete the document whose id field is "1" and make the deletion durable.
    indexWriter.deleteDocuments(new Term("id", "1"));
    indexWriter.commit();

    // Before any merge, the deleted doc is only masked: hasDeletions() reports
    // pending deletions, maxDoc() still counts it, numDocs() does not.
    assertTrue(indexWriter.hasDeletions());
    assertEquals(2, indexWriter.maxDoc());
    assertEquals(1, indexWriter.numDocs());

    indexWriter.close();
}

From source file:lia.chapter2.IndexingTest.java

License:Apache License

/**
 * After deleting a document AND merging ("optimizing") the index, the
 * deletion is physically purged: no pending deletions remain and
 * maxDoc() equals numDocs().
 */
public void testDeleteAfterOptimize() throws IOException {
    IndexWriter writer = getWriter();
    assertEquals(2, writer.numDocs());
    writer.deleteDocuments(new Term("id", "1"));
    // BUGFIX: the merge step the test name refers to was missing; a bare
    // commit() only records the deletion, so hasDeletions() stayed true and
    // maxDoc() remained 2, failing the assertions below. forceMerge(1) is the
    // modern replacement for the old optimize() call.
    writer.forceMerge(1);
    writer.commit();
    assertFalse(writer.hasDeletions());
    assertEquals(1, writer.maxDoc()); //C
    assertEquals(1, writer.numDocs()); //C
    writer.close();
}

From source file:lia.chapter2.VerboseIndexing.java

License:Apache License

/**
 * Indexes 100 identical single-field documents into an in-memory index,
 * using a writer configured with an info stream for verbose output.
 *
 * @throws IOException if indexing or the final commit fails
 */
private void index() throws IOException {

    Directory dir = new RAMDirectory();

    // try-with-resources guarantees the writer is closed even if
    // addDocument/commit throws; the original leaked the writer on error.
    try (IndexWriter writer = Utils.getIndexWriterWithInfoStream(dir)) {
        for (int i = 0; i < 100; i++) {
            Document doc = new Document();
            doc.add(new Field("keyword", "goober", StringField.TYPE_STORED));
            writer.addDocument(doc);
        }
        writer.commit();
    }
}

From source file:liredemo.flickr.FlickrIndexingThread.java

License:Open Source License

/**
 * Fetches recent photos from Flickr and indexes them into a Lucene index,
 * updating the parent UI's progress bar as it goes.
 *
 * Phase 1 collects {@code numberOfPhotosToIndex} unique photos (deduplicated
 * by URL); phase 2 opens a writer (create vs. append per UI checkbox); phase 3
 * consumes documents produced by a background FlickrDownloadThread, adding
 * each to the index, then commits and closes the writer.
 */
public void run() {
    DecimalFormat df = (DecimalFormat) DecimalFormat.getInstance();
    df.setMaximumFractionDigits(0);
    df.setMinimumFractionDigits(0);
    try {
        // make sure the local cache directory exists
        File cacheDir = new File(cacheDirectory);
        if (!cacheDir.exists())
            cacheDir.mkdir();
        parent.progressBarIndexing.setValue(0);
        parent.progressBarIndexing.setString("Getting photos from Flickr");
        // phase 1: collect unique recent photos, keyed by URL
        List<FlickrPhoto> images = new LinkedList<FlickrPhoto>();
        HashSet<String> titles = new HashSet<String>(numberOfPhotosToIndex);
        try {
            while (images.size() < numberOfPhotosToIndex) {
                List<FlickrPhoto> photos = FlickrPhotoGrabber.getRecentPhotos();
                for (FlickrPhoto photo : photos) {
                    // check if it is already there:
                    if (!titles.contains(photo.url)) {
                        titles.add(photo.url);
                        if (images.size() < numberOfPhotosToIndex)
                            images.add(photo);
                    } else {
                        // duplicate: back off briefly before polling Flickr again
                        try {
                            Thread.sleep(150);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                    }
                }
                parent.progressBarIndexing
                        .setString("Getting photos from Flickr: " + images.size() + " found.");
            }
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        }

        // phase 2: open the index writer; `create` means start a fresh index,
        // otherwise append to the existing one
        boolean create = !parent.checkBoxAddToExisintgIndex.isSelected();
        IndexWriter iw;
        if (create) {
            iw = LuceneUtils.createIndexWriter(parent.textfieldIndexName.getText(), true);
        } else {
            iw = LuceneUtils.createIndexWriter(parent.textfieldIndexName.getText(), false);
        }
        // NOTE(review): builderIdx is read but never used; the builder below is
        // always a MetadataBuilder regardless of the selected index in the UI.
        int builderIdx = parent.selectboxDocumentBuilder.getSelectedIndex();
        DocumentBuilder builder = new MetadataBuilder();
        int count = 0;
        long time = System.currentTimeMillis();
        // phase 3: a background thread downloads photos and builds documents;
        // consume them here until getCurrentDoc() signals completion with null
        FlickrDownloadThread downloader = new FlickrDownloadThread(images, builder);
        new Thread(downloader).start();
        Document doc = null;
        while ((doc = downloader.getCurrentDoc()) != null) {
            try {
                iw.addDocument(doc);
            } catch (Exception e) {
                // best-effort: skip documents that fail to index
                System.err.println("Could not add document");
                // e.printStackTrace();
            }
            count++;
            // progress: fraction done plus a rough time-remaining estimate
            // extrapolated from elapsed time
            float percentage = (float) count / (float) images.size();
            parent.progressBarIndexing.setValue((int) Math.floor(100f * percentage));
            float msleft = (float) (System.currentTimeMillis() - time) / percentage;
            float secLeft = msleft * (1 - percentage) / 1000f;
            String toPaint;
            if (secLeft > 60)
                toPaint = "~ " + Math.ceil(secLeft / 60) + " min. left";
            else if (secLeft > 30)
                toPaint = "< 1 min. left";
            else
                toPaint = "< 30 sec. left";
            parent.progressBarIndexing.setString(toPaint);
        }
        long timeTaken = (System.currentTimeMillis() - time);
        float sec = ((float) timeTaken) / 1000f;
        parent.progressBarIndexing.setValue(100);
        parent.progressBarIndexing.setString(Math.round(sec) + " sec. for " + count + " files");
        parent.buttonStartIndexing.setEnabled(true);
        // make all additions durable, then release the writer
        iw.commit();
        iw.close();

    } catch (IOException ex) {
        Logger.getLogger("global").log(Level.SEVERE, null, ex);
    }
}

From source file:lsre.utils.LuceneUtils.java

License:Open Source License

/**
 * Commits all pending changes on the given writer to its index,
 * making them durable and visible to subsequently opened readers.
 *
 * @param iw the writer to commit; must be open
 * @throws IOException if the commit fails
 */
public static void commitWriter(IndexWriter iw) throws IOException {
    iw.commit();
}

From source file:lucene.demo.search.FileSearcher.java

License:Apache License

/**
 * Deletes every document matching the given query from the on-disk index
 * at {@code indexPath} and commits the change.
 *
 * @param query selects the documents to delete
 * @throws IOException if the index cannot be opened or written
 */
private void removeDocs(Query query) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, analyzer);
    // append mode: open the existing index, or create it if absent
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    // try-with-resources closes both writer and directory even if
    // deleteDocuments/commit throws; the original leaked both on error.
    try (Directory dir = FSDirectory.open(new File(indexPath));
            IndexWriter writer = new IndexWriter(dir, iwc)) {
        writer.deleteDocuments(query);
        writer.commit();
    }
}

From source file:lucene.security.IndexSearcherTest.java

License:Apache License

/**
 * Indexes a fixed set of documents carrying read/discover visibility labels
 * and verifies that a SecureIndexSearcher returns exactly {@code expected}
 * hits for the given authorizations, that every returned document passes the
 * corresponding visibility evaluator, and that collectors only ever see
 * SecureAtomicReader leaves.
 *
 * @param expected number of hits the secured search should return
 * @param readAuthorizations authorizations granting full read access
 * @param discoverAuthorizations authorizations granting discover-only access
 * @param discoverableFields fields that may be returned on discover-only documents
 * @throws IOException on index access failure
 * @throws ParseException if the test query cannot be parsed
 */
private void runTest(int expected, Collection<String> readAuthorizations,
        Collection<String> discoverAuthorizations, Collection<String> discoverableFields)
        throws IOException, ParseException {
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new StandardAnalyzer(Version.LUCENE_43));
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, conf);
    writer.addDocument(getEmpty());
    // commit here so the index ends up with exactly two segments (asserted below)
    writer.commit();
    writer.addDocument(getDoc("(a&b)|d", null, "f1", "f2"));
    writer.addDocument(getDoc("a&b&c", null, "f1", "f2"));
    writer.addDocument(getDoc("a&b&c&e", "a&b&c", "f1", "f2"));
    writer.addDocument(getDoc(null, null, "f1", "f2"));// can't find
    writer.close();

    DirectoryReader reader = DirectoryReader.open(dir);
    List<AtomicReaderContext> leaves = reader.leaves();
    assertEquals(2, leaves.size());
    SecureIndexSearcher searcher = new SecureIndexSearcher(reader, getAccessControlFactory(),
            readAuthorizations, discoverAuthorizations, toSet(discoverableFields));

    String queryStr = "text";
    Query query = new QueryParser(Version.LUCENE_43, "text", new StandardAnalyzer(Version.LUCENE_43))
            .parse(queryStr);
    TopDocs topDocs = searcher.search(query, 10);

    assertEquals(expected, topDocs.totalHits);
    DocumentAuthorizations readDocumentAuthorizations = new DocumentAuthorizations(readAuthorizations);
    DocumentAuthorizations discoverDocumentAuthorizations = new DocumentAuthorizations(discoverAuthorizations);
    DocumentVisibilityEvaluator readVisibilityEvaluator = new DocumentVisibilityEvaluator(
            readDocumentAuthorizations);
    DocumentVisibilityEvaluator discoverVisibilityEvaluator = new DocumentVisibilityEvaluator(
            discoverDocumentAuthorizations);
    // FIX (idiom): the original used bitwise & between the two boolean
    // conditions; short-circuit && expresses the bounds guard by intent.
    for (int i = 0; i < topDocs.totalHits && i < topDocs.scoreDocs.length; i++) {
        Document doc = searcher.doc(topDocs.scoreDocs[i].doc);
        String read = doc.get("_read_");
        String discover = doc.get("_discover_");
        if (read != null && discover != null) {
            // readable AND discoverable: at least one evaluator must accept it
            DocumentVisibility readVisibility = new DocumentVisibility(read);
            DocumentVisibility discoverVisibility = new DocumentVisibility(discover);
            assertTrue(readVisibilityEvaluator.evaluate(readVisibility)
                    || discoverVisibilityEvaluator.evaluate(discoverVisibility));
        } else if (read != null) {
            DocumentVisibility readVisibility = new DocumentVisibility(read);
            assertTrue(readVisibilityEvaluator.evaluate(readVisibility));
        } else if (discover != null) {
            DocumentVisibility discoverVisibility = new DocumentVisibility(discover);
            assertTrue(discoverVisibilityEvaluator.evaluate(discoverVisibility));
            // Since this document is only discoverable validate fields that are
            // being returned.
            validateDiscoverFields(doc, discoverableFields);
        } else {
            fail("Should not fetch empty document.");
        }
    }
    // every leaf reader handed to a collector must be wrapped as a SecureAtomicReader
    searcher.search(query, new Collector() {

        @Override
        public void setScorer(Scorer scorer) throws IOException {
        }

        @Override
        public void setNextReader(AtomicReaderContext context) throws IOException {
            assertTrue(context.reader() instanceof SecureAtomicReader);
        }

        @Override
        public void collect(int doc) throws IOException {

        }

        @Override
        public boolean acceptsDocsOutOfOrder() {
            return false;
        }
    });
}

From source file:luceneexamples.AddDocument.java

License:Apache License

@Test
public void index() throws Exception {
    // Build a fresh in-memory index with one analyzed, stored text field.
    RAMDirectory dir = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter writer = new IndexWriter(dir, config);

    Document first = new Document();
    first.add(new Field("str_field", "quick brown fox jumped over the lazy dog.", Field.Store.YES,
            Field.Index.ANALYZED));
    writer.addDocument(first);
    writer.commit();

    // A near-real-time reader opened from the writer sees the committed doc.
    IndexReader reader = IndexReader.open(writer, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser(Version.LUCENE_31, "str_field", analyzer);
    TopDocs hits = searcher.search(parser.parse("fox"), 1000);
    assertThat(hits.totalHits, is(1));

    Document second = new Document();
    second.add(new Field("str_field", "quick brown dog jumped over the lazy fox.", Field.Store.YES,
            Field.Index.ANALYZED));
    writer.addDocument(second);
    writer.commit();

    // The existing searcher is bound to the old reader snapshot: still one hit.
    hits = searcher.search(parser.parse("fox"), 1000);
    assertThat(hits.totalHits, is(1));

    // Reopen the reader to pick up the second commit; now both documents match.
    searcher.close();
    reader = reader.reopen();
    searcher = new IndexSearcher(reader);

    hits = searcher.search(parser.parse("fox"), 1000);
    assertThat(hits.totalHits, is(2));

    writer.close();
    searcher.close();
    dir.close();
}