List of usage examples for org.apache.lucene.index.IndexWriter#commit
@Override public final long commit() throws IOException
Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
From source file:kbp2013.index.IndexSourceCorpus.java
License:Open Source License
public static void main(String[] args) throws IOException { initializeFromDefault();//from w ww . j a va 2 s . c o m int managed = 0; // counter to count idents int counted = 0; // when to display int tocount = 10; System.out.println("Indexing to directory '" + luceneIndex + "'..."); INDEX_DIR = new File(luceneIndex); if (INDEX_DIR.exists() && create == 1) { System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first"); System.exit(1); } Directory dir = FSDirectory.open(new File(luceneIndex)); // Open lucene stuff Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer); // iwc.setRAMBufferSizeMB(1024); // http://wiki.apache.org/lucene-java/ImproveIndexingSpeed iwc.setMaxThreadStates(100); // manage append mode if (create == 0) { // add new document to an existing index iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); // if appending, checkindex if (checkindex == 1) { System.out.println("Checking index ..."); CheckIndex ci = new CheckIndex(dir); ci.checkIndex(); System.out.println("End of Checking index"); } } else { iwc.setOpenMode(OpenMode.CREATE); } // build writer IndexWriter writer = new IndexWriter(dir, iwc); final File docDir = new File(home); System.out.println("Indexing directory '" + home + "'..."); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } // read all the files BufferedReader reader = new BufferedReader(new FileReader(homelist)); // read line by line each file name String text = ""; boolean verbose = true; while ((text = reader.readLine()) != null) { String filename = home + text; final File testFile = new File(filename); // verbose - remove from one line files if (verbose) { System.out.println("---V-->" + "Indexing content of " + filename); } if (testFile.isFile() && 
!filename.contains("\\.gz")) { // open file and read FileReader fread = new FileReader(filename); BufferedReader readerDoc = new BufferedReader(fread); // initialize variable for loop String fileRef = ""; // the line containing the document id String fromfile = ""; // the first reader for all the file String textdoc = ""; // inside the file the reader for the document while ((fromfile = readerDoc.readLine()) != null) { if (fromfile.toUpperCase().contains("<DOC ID=") || fromfile.toUpperCase().contains("<DOC>")) { String fromdoc = fromfile; // begin to index the DOCID (to keep good offset for collection of mention) textdoc = fromfile; // initialize variable and keep the first line // accumulate all the content while (!fromdoc.toUpperCase().contains("</DOC>")) { // collect the doc id // store the current file ref // it can come : // - from the last fromfile (first iteration) // - from a current iteration of fromdoc (any iteration) if (fromdoc.toUpperCase().contains("<DOC ID=") || fromdoc.toUpperCase().contains("<DOCID>")) { fileRef = fromdoc; } // accumulate the complete document for later offset reading of mention fromdoc = readerDoc.readLine(); textdoc = textdoc + "\n" + fromdoc; } // locate id // 2 forms // <DOCID> ALHURRA_NEWS13_ARB_20050412_130100-2.LDC2006E92 </DOCID> // <doc id="bolt-eng-DF-183-195681-7948494"> // form 1 String idStr = fileRef; if (idStr.contains("<DOCID>")) { idStr = idStr.replace("<DOCID>", ""); idStr = idStr.replace("</DOCID>", ""); idStr = idStr.replace(" ", ""); // retire l'espace } if (idStr.contains("<DOC id=")) { idStr = idStr.replace("<DOC id=\"", ""); idStr = idStr.replaceAll("\".+>$", ""); //idStr = idStr.replaceAll("\">$", ""); } // lower case ->new corpus of LDC /* if (idStr.contains("<docid>")){ idStr = idStr.replace("<docid>", ""); idStr = idStr.replace("</docid>", ""); idStr = idStr.replace(" ", ""); // retire l'espace } if (idStr.contains("<doc id=")){ idStr = idStr.replace("<doc id=\"", ""); idStr = idStr.replaceAll("\".+>$", 
""); // idStr = idStr.replaceAll("\">$", ""); } */ indexDocs(writer, idStr, textdoc); // display info managed++; counted++; // verbose remove for 1 doc files if (verbose) { System.out.println( "---V-->" + counted + ":" + filename + ":" + idStr + ":" + textdoc.length()); } if (managed > tocount) { managed = 0; System.out.println(counted + ":" + filename + ":------>" + idStr); // clean the writer //writer.waitForMerges(); //writer.forceMergeDeletes(); writer.commit(); } } // end of if } // end of while readerDoc.close(); fread.close(); } else { System.out.println(counted + ":Non lisible ou non requis:" + filename); } } // close properly the index writer // !! Caution !! in case of error, if this is not closed, the index is corrupted // and has to be regenerated writer.close(); reader.close(); }
From source file:kbp2013.index.IndexWikipediaCorpus.java
License:Open Source License
public static void main(String[] args) throws IOException { initializeFromDefault();/*www . jav a2 s . c o m*/ int managed = 0; // counter to count idents int counted = 0; // when to display int tocount = 1000; int saved = 0; System.out.println("Indexing Wikipedia Dump to directory '" + wikiluceneIndex + "'..."); INDEX_DIR = new File(wikiluceneIndex); if (INDEX_DIR.exists() && create == 1) { System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first"); System.exit(1); } Directory dir = FSDirectory.open(new File(wikiluceneIndex)); // Open lucene stuff Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer); // configure Lucene Stuff iwc.setMaxThreadStates(100); // manage append mode if (create == 0) { // add new document to an existing index iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); // if appending, check index if (checkindex == 1) { System.out.println("Checking index ..."); CheckIndex ci = new CheckIndex(dir); ci.checkIndex(); System.out.println("End of Checking index"); } } else { iwc.setOpenMode(OpenMode.CREATE); } // build writer IndexWriter writer = new IndexWriter(dir, iwc); // -------------------------- // // Open the Wikipedia Dump // //--------------------------- BufferedReader reader = new BufferedReader(new FileReader(wikidump)); // read the domains String text = ""; ArrayList domain = new ArrayList(); // the content retrieved according to the page key while (!text.contains("</siteinfo>")) { text = reader.readLine(); if (text.contains("<namespace key=") && !text.contains("<namespace key=\"0")) { String thisnamespace = text.replaceAll("<namespace key=[^>]+>", ""); thisnamespace = thisnamespace.replaceAll("</namespace>", ""); thisnamespace = thisnamespace.replaceAll("^[ ]+", ""); thisnamespace = thisnamespace + ":"; if (!thisnamespace.contentEquals("")) { domain.add(thisnamespace); System.out.println("Registered domain:" + thisnamespace + 
";"); } } } System.out.println("--------------------------------"); // read the pages while ((text = reader.readLine()) != null) { String textdoc = ""; // inside the file, the reader for the document String pagename = ""; boolean tosave = true; // beginning of a page // accumulate if (text.contains("<page>")) { textdoc = text; while (!text.contains("</page>")) { text = reader.readLine(); textdoc = textdoc + text; if (text.contains("<title>")) { pagename = text.replaceAll("<title>", ""); pagename = pagename.replaceAll("</title>", ""); pagename = pagename.replaceAll("[ ]{2,10}", ""); //System.out.println("Page:" + pagename); } // safety } // after page reading index document // verify if document // A) is not a redirect // B) is not from a domain for (int a = 0; a < domain.size(); a++) { String domaintosearch = domain.get(a).toString(); if (pagename.toLowerCase().contains(domaintosearch.toLowerCase())) { System.out.println("Specific page:" + pagename); tosave = false; } } /* if (textdoc.contains("[A-Za-z ]+:")){ System.out.println("Specific page domain:" + pagename); tosave = false; }*/ if (textdoc.contains("#REDIRECT")) { // System.out.println("Redirect:" + pagename); tosave = false; } if (tosave) { saved++; indexDocs(writer, pagename, textdoc); } // display info managed++; counted++; if (managed > tocount) { managed = 0; System.out.println(counted + ":" + saved + ":" + pagename + ":------>" + textdoc.length()); // System.out.println(textdoc); writer.commit(); } } } // end while // close properly the index writer // !! Caution !! in case of error, if this is not closed, the index is corrupted // and has to be regenerated writer.close(); reader.close(); }
From source file:lia.chapter2.IndexingTest.java
License:Apache License
public void testDeleteBeforeOptimize() throws IOException { IndexWriter writer = getWriter(); assertEquals(2, writer.numDocs()); //A writer.deleteDocuments(new Term("id", "1")); //B writer.commit(); assertTrue(writer.hasDeletions()); //1 assertEquals(2, writer.maxDoc()); //2 assertEquals(1, writer.numDocs()); //2 writer.close();//from ww w .j av a2 s.c om }
From source file:lia.chapter2.IndexingTest.java
License:Apache License
public void testDeleteAfterOptimize() throws IOException { IndexWriter writer = getWriter(); assertEquals(2, writer.numDocs());/*w ww .j a v a2s.c o m*/ writer.deleteDocuments(new Term("id", "1")); writer.commit(); assertFalse(writer.hasDeletions()); assertEquals(1, writer.maxDoc()); //C assertEquals(1, writer.numDocs()); //C writer.close(); }
From source file:lia.chapter2.VerboseIndexing.java
License:Apache License
private void index() throws IOException {
    // in-memory index; the writer comes pre-wired with an info stream
    Directory ramDir = new RAMDirectory();
    IndexWriter indexWriter = Utils.getIndexWriterWithInfoStream(ramDir);
    // add 100 identical one-field documents so the info stream has work to report
    for (int docNum = 0; docNum < 100; docNum++) {
        Document document = new Document();
        document.add(new Field("keyword", "goober", StringField.TYPE_STORED));
        indexWriter.addDocument(document);
    }
    indexWriter.commit();
    indexWriter.close();
}
From source file:liredemo.flickr.FlickrIndexingThread.java
License:Open Source License
/**
 * Worker entry point: grabs recent photos from Flickr until
 * {@code numberOfPhotosToIndex} distinct URLs are collected, then indexes the
 * documents produced by a background {@code FlickrDownloadThread} into a Lucene
 * index, updating the parent UI's progress bar throughout. Commits and closes
 * the writer when the downloader yields no more documents.
 */
public void run() {
    DecimalFormat df = (DecimalFormat) DecimalFormat.getInstance();
    df.setMaximumFractionDigits(0);
    df.setMinimumFractionDigits(0);
    try {
        File cacheDir = new File(cacheDirectory);
        if (!cacheDir.exists())
            cacheDir.mkdir();
        parent.progressBarIndexing.setValue(0);
        parent.progressBarIndexing.setString("Getting photos from Flickr");
        // collected photos; 'titles' tracks URLs already seen to avoid duplicates
        List<FlickrPhoto> images = new LinkedList<FlickrPhoto>();
        HashSet<String> titles = new HashSet<String>(numberOfPhotosToIndex);
        try {
            // keep polling the recent-photos feed until enough distinct photos are found
            while (images.size() < numberOfPhotosToIndex) {
                List<FlickrPhoto> photos = FlickrPhotoGrabber.getRecentPhotos();
                for (FlickrPhoto photo : photos) {
                    // check if it is already there:
                    if (!titles.contains(photo.url)) {
                        titles.add(photo.url);
                        if (images.size() < numberOfPhotosToIndex)
                            images.add(photo);
                    } else {
                        // duplicate seen: back off briefly before the next poll
                        try {
                            Thread.sleep(150);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }
                    }
                }
                parent.progressBarIndexing
                        .setString("Getting photos from Flickr: " + images.size() + " found.");
            }
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        }
        // PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
        // wrapper.addAnalyzer("tags", new WhitespaceAnalyzer(Version.LUCENE_33));
        // iw = new IndexWriter(indexPath + "-new", wrapper, true, IndexWriter.MaxFieldLength.UNLIMITED);
        // create a fresh index unless the user asked to append to an existing one
        boolean create = !parent.checkBoxAddToExisintgIndex.isSelected();
        IndexWriter iw;
        if (create) {
            iw = LuceneUtils.createIndexWriter(parent.textfieldIndexName.getText(), true);
            // iw = new IndexWriter(FSDirectory.open(new File(parent.textfieldIndexName.getText())), new SimpleAnalyzer(), create, IndexWriter.MaxFieldLength.UNLIMITED);
        } else {
            iw = LuceneUtils.createIndexWriter(parent.textfieldIndexName.getText(), false);
            // iw = new IndexWriter(FSDirectory.open(new File(parent.textfieldIndexName.getText())), new SimpleAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
        }
        // NOTE(review): builderIdx is read but never used below — presumably the
        // selected builder was once honored; MetadataBuilder is hard-coded now.
        int builderIdx = parent.selectboxDocumentBuilder.getSelectedIndex();
        DocumentBuilder builder = new MetadataBuilder();
        int count = 0;
        long time = System.currentTimeMillis();
        // downloader runs in its own thread and hands over documents one at a time
        FlickrDownloadThread downloader = new FlickrDownloadThread(images, builder);
        new Thread(downloader).start();
        Document doc = null;
        // getCurrentDoc() returning null signals the downloader is done
        while ((doc = downloader.getCurrentDoc()) != null) {
            try {
                iw.addDocument(doc);
            } catch (Exception e) {
                // best-effort: a single bad document must not abort the whole run
                System.err.println("Could not add document");
                // e.printStackTrace();
            }
            count++;
            // progress + naive ETA from the average time per document so far
            float percentage = (float) count / (float) images.size();
            parent.progressBarIndexing.setValue((int) Math.floor(100f * percentage));
            float msleft = (float) (System.currentTimeMillis() - time) / percentage;
            float secLeft = msleft * (1 - percentage) / 1000f;
            String toPaint;
            if (secLeft > 60)
                toPaint = "~ " + Math.ceil(secLeft / 60) + " min. left";
            else if (secLeft > 30)
                toPaint = "< 1 min. left";
            else
                toPaint = "< 30 sec. left";
            parent.progressBarIndexing.setString(toPaint);
        }
        long timeTaken = (System.currentTimeMillis() - time);
        float sec = ((float) timeTaken) / 1000f;
        parent.progressBarIndexing.setValue(100);
        parent.progressBarIndexing.setString(Math.round(sec) + " sec. for " + count + " files");
        parent.buttonStartIndexing.setEnabled(true);
        // make everything durable and visible to readers before closing
        iw.commit();
        iw.close();
    } catch (IOException ex) {
        Logger.getLogger("global").log(Level.SEVERE, null, ex);
    }
}
From source file:lsre.utils.LuceneUtils.java
License:Open Source License
/**
 * Commits all pending changes buffered in the given writer to its index,
 * making them visible to subsequently opened readers.
 *
 * @param iw the open IndexWriter to commit; it remains open afterwards
 * @throws IOException if the underlying directory cannot be written or synced
 */
public static void commitWriter(IndexWriter iw) throws IOException {
    iw.commit();
}
From source file:lucene.demo.search.FileSearcher.java
License:Apache License
/**
 * Deletes all documents matching {@code query} from the index at
 * {@code indexPath} and commits the deletion.
 *
 * <p>Fix: the IndexWriter (and Directory) used to leak if
 * {@code deleteDocuments} or {@code commit} threw; both are now managed by
 * try-with-resources, which also guarantees the writer is closed so the index
 * cannot be left with a dangling write lock.
 *
 * @param query selects the documents to remove
 * @throws IOException if the index cannot be opened or written
 */
private void removeDocs(Query query) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    // resources close in reverse order: writer first, then directory
    try (Directory dir = FSDirectory.open(new File(indexPath));
            IndexWriter writer = new IndexWriter(dir, iwc)) {
        writer.deleteDocuments(query);
        writer.commit();
    }
}
From source file:lucene.security.IndexSearcherTest.java
License:Apache License
private void runTest(int expected, Collection<String> readAuthorizations, Collection<String> discoverAuthorizations, Collection<String> discoverableFields) throws IOException, ParseException { IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_43, new StandardAnalyzer(Version.LUCENE_43)); Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, conf); writer.addDocument(getEmpty());//from ww w . ja v a 2 s . c om writer.commit(); writer.addDocument(getDoc("(a&b)|d", null, "f1", "f2")); writer.addDocument(getDoc("a&b&c", null, "f1", "f2")); writer.addDocument(getDoc("a&b&c&e", "a&b&c", "f1", "f2")); writer.addDocument(getDoc(null, null, "f1", "f2"));// can't find writer.close(); DirectoryReader reader = DirectoryReader.open(dir); List<AtomicReaderContext> leaves = reader.leaves(); assertEquals(2, leaves.size()); SecureIndexSearcher searcher = new SecureIndexSearcher(reader, getAccessControlFactory(), readAuthorizations, discoverAuthorizations, toSet(discoverableFields)); String queryStr = "text"; Query query = new QueryParser(Version.LUCENE_43, "text", new StandardAnalyzer(Version.LUCENE_43)) .parse(queryStr); TopDocs topDocs = searcher.search(query, 10); assertEquals(expected, topDocs.totalHits); DocumentAuthorizations readDocumentAuthorizations = new DocumentAuthorizations(readAuthorizations); DocumentAuthorizations discoverDocumentAuthorizations = new DocumentAuthorizations(discoverAuthorizations); DocumentVisibilityEvaluator readVisibilityEvaluator = new DocumentVisibilityEvaluator( readDocumentAuthorizations); DocumentVisibilityEvaluator discoverVisibilityEvaluator = new DocumentVisibilityEvaluator( discoverDocumentAuthorizations); for (int i = 0; i < topDocs.totalHits & i < topDocs.scoreDocs.length; i++) { Document doc = searcher.doc(topDocs.scoreDocs[i].doc); String read = doc.get("_read_"); String discover = doc.get("_discover_"); if (read != null && discover != null) { DocumentVisibility readVisibility = new 
DocumentVisibility(read); DocumentVisibility discoverVisibility = new DocumentVisibility(discover); assertTrue(readVisibilityEvaluator.evaluate(readVisibility) || discoverVisibilityEvaluator.evaluate(discoverVisibility)); } else if (read != null) { DocumentVisibility readVisibility = new DocumentVisibility(read); assertTrue(readVisibilityEvaluator.evaluate(readVisibility)); } else if (discover != null) { DocumentVisibility discoverVisibility = new DocumentVisibility(discover); assertTrue(discoverVisibilityEvaluator.evaluate(discoverVisibility)); // Since this document is only discoverable validate fields that are // being returned. validateDiscoverFields(doc, discoverableFields); } else { fail("Should not fetch empty document."); } } searcher.search(query, new Collector() { @Override public void setScorer(Scorer scorer) throws IOException { } @Override public void setNextReader(AtomicReaderContext context) throws IOException { assertTrue(context.reader() instanceof SecureAtomicReader); } @Override public void collect(int doc) throws IOException { } @Override public boolean acceptsDocsOutOfOrder() { return false; } }); }
From source file:luceneexamples.AddDocument.java
License:Apache License
@Test
public void index() throws Exception {
    // in-memory index with a single document matching "fox"
    RAMDirectory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter indexWriter = new IndexWriter(directory, config);
    Document firstDocument = new Document();
    firstDocument.add(new Field("str_field", "quick brown fox jumped over the lazy dog.", Field.Store.YES,
            Field.Index.ANALYZED));
    indexWriter.addDocument(firstDocument);
    indexWriter.commit();
    // a near-real-time reader opened from the writer sees the first commit
    IndexReader indexReader = IndexReader.open(indexWriter, true);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    QueryParser queryParser = new QueryParser(Version.LUCENE_31, "str_field", analyzer);
    TopDocs hits = indexSearcher.search(queryParser.parse("fox"), 1000);
    assertThat(hits.totalHits, is(1));
    // add and commit a second matching document
    Document secondDocument = new Document();
    secondDocument.add(new Field("str_field", "quick brown dog jumped over the lazy fox.", Field.Store.YES,
            Field.Index.ANALYZED));
    indexWriter.addDocument(secondDocument);
    indexWriter.commit();
    // the existing searcher still works on its stale snapshot: one hit only
    hits = indexSearcher.search(queryParser.parse("fox"), 1000);
    assertThat(hits.totalHits, is(1));
    indexSearcher.close();
    // reopening the reader picks up the second commit: two hits
    indexReader = indexReader.reopen();
    indexSearcher = new IndexSearcher(indexReader);
    hits = indexSearcher.search(queryParser.parse("fox"), 1000);
    assertThat(hits.totalHits, is(2));
    indexWriter.close();
    indexSearcher.close();
    directory.close();
}