List of usage examples for org.apache.lucene.index.IndexWriter.commit()
@Override public final long commit() throws IOException
Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
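Before the examples below, here is a minimal, self-contained sketch of the typical addDocument/commit lifecycle. It assumes a modern Lucene API (5.x or later); the index path and field name are illustrative only:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CommitExample {
    public static void main(String[] args) throws Exception {
        // illustrative path; any Directory implementation works
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));

        Document doc = new Document();
        doc.add(new TextField("body", "hello commit", Field.Store.YES));
        writer.addDocument(doc);

        // Until commit(), the added document is neither visible to newly
        // opened readers nor durable across a crash or power loss.
        writer.commit();

        writer.close(); // by default, close() also commits any remaining pending changes
        dir.close();
    }
}

The counterpart, rollback(), discards all changes made since the last commit; the net.bobah.mail.Indexer example below uses it to close the writer without an implicit commit.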
From source file:luceneexamples.NumericFieldDocument.java
License:Apache License
@Test
public void index() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter writer = new IndexWriter(directory, iwc);
    for (int i = 8; i < 12; i++) {
        Document doc = new Document();
        doc.add(new NumericField("int_field", Field.Store.YES, true).setIntValue(i));
        System.out.println(doc);
        writer.addDocument(doc);
    }
    writer.commit();

    IndexReader reader = IndexReader.open(writer, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs td = searcher.search(new MatchAllDocsQuery(), 1000,
            new Sort(new SortField("int_field", SortField.INT)));
    assertThat(td.totalHits, is(4));
    assertThat(searcher.doc(td.scoreDocs[0].doc).get("int_field"), equalTo("8"));
    assertThat(searcher.doc(td.scoreDocs[1].doc).get("int_field"), equalTo("9"));
    assertThat(searcher.doc(td.scoreDocs[2].doc).get("int_field"), equalTo("10"));
    assertThat(searcher.doc(td.scoreDocs[3].doc).get("int_field"), equalTo("11"));

    reader.close();
    writer.close();
    searcher.close();
    directory.close();
}
From source file:luceneexamples.SortDocuments.java
License:Apache License
@Test
public void index() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter writer = new IndexWriter(directory, iwc);
    Document doc = new Document();
    doc.add(new Field("str_field", "abc", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
    Document doc2 = new Document();
    doc2.add(new Field("str_field", "def", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc2);
    Document doc3 = new Document();
    doc3.add(new Field("str_field", "hij", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc3);
    writer.commit();

    IndexReader reader = IndexReader.open(writer, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs td = searcher.search(new MatchAllDocsQuery(), 1000,
            new Sort(new SortField("str_field", SortField.STRING)));
    assertThat(td.totalHits, is(3));
    assertThat(searcher.doc(td.scoreDocs[0].doc).get("str_field"), equalTo("abc"));
    assertThat(searcher.doc(td.scoreDocs[1].doc).get("str_field"), equalTo("def"));
    assertThat(searcher.doc(td.scoreDocs[2].doc).get("str_field"), equalTo("hij"));

    td = searcher.search(new MatchAllDocsQuery(), 1000,
            new Sort(new SortField("str_field", SortField.STRING, true)));
    assertThat(td.totalHits, is(3));
    assertThat(searcher.doc(td.scoreDocs[0].doc).get("str_field"), equalTo("hij"));
    assertThat(searcher.doc(td.scoreDocs[1].doc).get("str_field"), equalTo("def"));
    assertThat(searcher.doc(td.scoreDocs[2].doc).get("str_field"), equalTo("abc"));

    reader.close();
    writer.close();
    searcher.close();
    directory.close();
}
From source file:luceneexamples.UpdateDocument.java
License:Apache License
@Test
public void index() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter writer = new IndexWriter(directory, iwc);
    Document doc = new Document();
    doc.add(new Field("id", "001", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("str_field", "quick brown fox jumped over the lazy dog.", Field.Store.YES,
            Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.commit();

    IndexReader reader = IndexReader.open(writer, true);
    IndexSearcher searcher = new IndexSearcher(reader);
    QueryParser parser = new QueryParser(Version.LUCENE_31, "str_field", analyzer);
    TopDocs td = searcher.search(parser.parse("fox"), 1000);
    assertThat(td.totalHits, is(1));

    // the replacement document must carry the id field itself
    Document doc2 = new Document();
    doc2.add(new Field("id", "001", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc2.add(new Field("str_field", "quick brown fox jumped over the lazy whale.", Field.Store.YES,
            Field.Index.ANALYZED));
    writer.updateDocument(new Term("id", "001"), doc2);
    writer.commit();

    searcher.close();
    reader = reader.reopen();
    searcher = new IndexSearcher(reader);
    td = searcher.search(parser.parse("dog"), 1000);
    assertThat(td.totalHits, is(0));
    td = searcher.search(parser.parse("whale"), 1000);
    assertThat(td.totalHits, is(1));

    writer.close();
    searcher.close();
    reader.close();
    directory.close();
}
From source file:luceneingester.TrecIngester.java
License:Apache License
public static void main(String[] clArgs) throws Exception {
    Args args = new Args(clArgs);
    final String dirPath = args.getString("-indexPath") + "/index";
    final String dataDir = args.getString("-dataDir");
    final int docCountLimit = args.getInt("-docCountLimit"); // -1 means all docs from the source
    final int numThreads = args.getInt("-threadCount");
    final boolean verbose = args.getFlag("-verbose");
    final boolean printDPS = args.getFlag("-printDPS");
    final boolean doUpdate = args.getFlag("-update");
    final boolean positions = args.getFlag("-positions");
    args.check();

    final Analyzer a = new EnglishAnalyzer();
    final TrecContentSource trecSource = createTrecSource(dataDir);
    final Directory dir = FSDirectory.open(Paths.get(dirPath));

    System.out.println("Index path: " + dirPath);
    System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    System.out.println("Threads: " + numThreads);
    System.out.println("Verbose: " + (verbose ? "yes" : "no"));
    System.out.println("Positions: " + (positions ? "yes" : "no"));

    if (verbose) {
        InfoStream.setDefault(new PrintStreamInfoStream(System.out));
    }

    final IndexWriterConfig iwc = new IndexWriterConfig(a);
    if (doUpdate) {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    } else {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }
    System.out.println("IW config=" + iwc);

    final IndexWriter w = new IndexWriter(dir, iwc);
    IndexThreads threads = new IndexThreads(w, positions, trecSource, numThreads, docCountLimit, printDPS);
    System.out.println("\nIndexer: start");
    final long t0 = System.currentTimeMillis();
    threads.start();
    while (!threads.done()) {
        Thread.sleep(100);
    }
    threads.stop();

    final long t1 = System.currentTimeMillis();
    System.out.println("\nIndexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + w.maxDoc() + " docs");
    if (!doUpdate && docCountLimit != -1 && w.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit);
    }
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }

    final long t2 = System.currentTimeMillis();
    final Map<String, String> commitData = new HashMap<String, String>();
    commitData.put("userData", "multi");
    w.setCommitData(commitData);
    w.commit();
    final long t3 = System.currentTimeMillis();
    System.out.println("\nIndexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)");

    System.out.println("\nIndexer: at close: " + w.segString());
    final long tCloseStart = System.currentTimeMillis();
    w.close();
    System.out.println("\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec");
    dir.close();

    final long tFinal = System.currentTimeMillis();
    System.out.println("\nIndexer: finished (" + (tFinal - t0) / 1000.0 + " sec)");
    System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed());
    System.out.println("\nIndexer: "
            + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.))
            + " GB/hour plain text");
}
From source file:model.Index.java
public static void main(String[] args) throws FileNotFoundException, IOException {
    // set the analyzer used to tokenize text
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    // IndexWriter configuration
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    // open the index; if there is no index, build a new one
    indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

    Directory directory = null;
    IndexWriter indexWrite = null;
    try {
        // set the path of the index directory
        directory = FSDirectory.open(new File(Path.IndexDir));
        // if the directory is locked, unlock it
        if (IndexWriter.isLocked(directory)) {
            IndexWriter.unlock(directory);
        }
        // create the IndexWriter
        indexWrite = new IndexWriter(directory, indexWriterConfig);
    } catch (Exception e) {
        e.printStackTrace();
    }

    PreProcessDoc getDoc = new PreProcessDoc();
    WebDocument tempDoc = null;
    while ((tempDoc = getDoc.nextDocument()) != null) {
        Document doc = new Document();
        doc.add(new TextField("link", tempDoc.getDocLink(), Store.YES));
        doc.add(new TextField("content", tempDoc.getDocContent(), Store.YES));
        try {
            // write the document into the index
            indexWrite.addDocument(doc);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // commit the data; without a commit the changes would not be saved
    try {
        indexWrite.commit();
        // close the resources
        indexWrite.close();
        directory.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:mw.wikidump.MakeLuceneIndex.java
License:Open Source License
/**
 * @param args
 * @throws IOException
 * @throws ParseException
 */
public static void main(String[] args) throws IOException, ParseException {
    String baseDir = "";
    String wikiDumpFile = "enwiki-20110405-pages-articles.xml";
    String luceneIndexName = "enwiki-20110405-lucene";
    String logFile = luceneIndexName + ".log";
    boolean bIgnoreStubs = false;
    String writeToTextFilesDir = "";

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-logfile"))
            logFile = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-ignorestubs"))
            bIgnoreStubs = true;
        if (args[i].equals("-writetotextfilesdir")) {
            writeToTextFilesDir = args[++i];
        }
    }

    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put("tokenized_title", new StandardAnalyzer());
    analyzerPerField.put("contents", new StandardAnalyzer());
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerPerField);

    File basePath = new File(baseDir);
    File luceneIndex = new File(basePath.getCanonicalPath() + File.separator + luceneIndexName);
    logFile = basePath.getCanonicalPath() + File.separator + logFile;

    // log to file and console:
    // PlainLogger logger = new PlainLogger( logFile );
    // log only to console:
    PlainLogger logger = new PlainLogger();

    logger.log("Work directory: " + basePath.getCanonicalPath());
    logger.log("Lucene index: " + luceneIndexName);
    logger.log("Wikipedia dumpfile: " + wikiDumpFile);
    logger.log("");
    if (bIgnoreStubs)
        logger.log("Ignoring stubs");
    else
        logger.log("Including stubs");
    logger.log("");

    // create the index
    Directory indexDirectory = FSDirectory.open(FileSystems.getDefault().getPath(baseDir));
    IndexWriter indexWriter = new IndexWriter(indexDirectory, new IndexWriterConfig(analyzer));

    Extractor wikidumpExtractor = new Extractor(basePath.getCanonicalPath() + File.separator + wikiDumpFile);
    wikidumpExtractor.setLinkSeparator("_");
    wikidumpExtractor.setCategorySeparator("_");

    int iStubs = 0;
    int iArticleCount = 0;
    int iSkippedPageCount = 0;
    long iStartTime = java.lang.System.nanoTime();
    long iTime = iStartTime;

    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);

    while (wikidumpExtractor.nextPage()) {
        if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
            ++iSkippedPageCount;
            continue;
        }
        if (bIgnoreStubs && wikidumpExtractor.getStub()) {
            ++iStubs;
            continue;
        }

        Document doc = new Document();
        ++iArticleCount;

        doc.add(new StoredField("path", String.format("%d", iArticleCount)));

        wikidumpExtractor.setTitleSeparator("_");
        String title = wikidumpExtractor.getPageTitle(false).toLowerCase();
        doc.add(new Field("title", title, fieldType));

        wikidumpExtractor.setTitleSeparator(" ");
        doc.add(new Field("tokenized_title", wikidumpExtractor.getPageTitle(false).toLowerCase(), fieldType));
        doc.add(new Field("categories", wikidumpExtractor.getPageCategories().toLowerCase(), fieldType));
        doc.add(new Field("links", wikidumpExtractor.getPageLinks().toLowerCase(), fieldType));
        doc.add(new Field("contents", wikidumpExtractor.getPageAbstract().toLowerCase(), fieldType));

        indexWriter.addDocument(doc);

        if (!writeToTextFilesDir.isEmpty()) {
            String fileName = doc.get("title");
            fileName = fileName.replace('/', '_');
            writeToTextFile(writeToTextFilesDir, fileName, doc.get("contents"));
        }

        if (iArticleCount % 50000 == 0) {
            logger.add(iArticleCount + " (" + NanoTimeFormatter.getS(System.nanoTime() - iTime) + "s) ");
            iTime = System.nanoTime();
            if (iArticleCount % 250000 == 0) {
                try {
                    indexWriter.commit();
                    logger.add("-- commit. Skipped page count " + iSkippedPageCount + " (+ " + iStubs + " stubs)");
                    logger.log(String.format(", time %sm", NanoTimeFormatter.getM(System.nanoTime() - iStartTime)));
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    logger.log("");
    logger.log(String.format("Overall time %s minutes, ", NanoTimeFormatter.getM(System.nanoTime() - iStartTime)));
    logger.add("collected " + iArticleCount + " articles, ");
    logger.add("skipped " + iSkippedPageCount + " nonarticle pages,");
    logger.log("skipped " + iStubs + " stubs.");
    logger.log("");

    iTime = System.nanoTime();
    logger.add(" closing...");
    indexWriter.close();
    logger.log(" done in " + NanoTimeFormatter.getS(System.nanoTime() - iTime) + "s.");
    logger.close();
    System.exit(0);
}
From source file:net.bobah.mail.Indexer.java
License:Apache License
private void runEx() throws Exception {
    final File dir = new File(config.getProperty("net.bobah.mail.local.folder"));
    if (!dir.exists() || !dir.isDirectory()) {
        throw new IllegalArgumentException(String.format("\"%s\" does not exist or is not a directory", dir));
    }

    Collection<File> files = findFiles(dir, new FileFilter() {
        @Override
        public boolean accept(File file) {
            return file.getName().endsWith(".eml");
        }
    }, new Comparator<File>() {
        @Override
        public int compare(File l, File r) {
            return Long.compare(l.lastModified(), r.lastModified());
        }
    });

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

    final File indexDir = new File(dir, "index");
    final boolean indexExisted = indexDir.exists();
    if (!indexExisted)
        indexDir.mkdirs();
    final Directory idx = FSDirectory.open(indexDir);

    final IndexWriter writer = new IndexWriter(idx, iwc);
    final IndexReader reader = indexExisted ? DirectoryReader.open(idx) : null;
    final IndexSearcher searcher = indexExisted ? new IndexSearcher(reader) : null;
    //final AtomicLong counter = new AtomicLong(0l);

    try {
        for (final File file : files) {
            executor.submit(new Runnable() {
                @Override
                public void run() {
                    try {
                        index(file, writer, searcher);
                        //if (counter.incrementAndGet() % 100 == 0) writer.commit(); // TODO: VL: make batch size configurable
                    } catch (Exception e) {
                        throw new RuntimeException(e);
                    }
                }
            });
        }
        shutdownExecutor(executor, log);

        // TODO: VL: delete stale documents from the index
        writer.commit();
        log.info("committed index updates");

        searcher.search(new MatchAllDocsQuery(), new Collector() {
            @Override
            public void setScorer(Scorer scorer) throws IOException {
            }

            @Override
            public void setNextReader(AtomicReaderContext unused) throws IOException {
            }

            @Override
            public void collect(int docID) throws IOException {
                Document doc = reader.document(docID);
                final String path = doc.get("path");
                if (path != null) {
                    try {
                        final File file = new File(path);
                        if (!file.exists()) {
                            log.info("deleting index for {}", doc.get("id"));
                            writer.deleteDocuments(new Term("id", doc.get("id")));
                        }
                    } catch (SecurityException e) {
                        log.error("exception", e);
                    }
                }
            }

            @Override
            public boolean acceptsDocsOutOfOrder() {
                return true;
            }
        });
        writer.commit();
        log.info("committed index deletions");
    } finally {
        try {
            // close writer without commit (see explicit commits above)
            writer.rollback();
        } catch (IOException e) {
            log.error("exception while closing writer", e);
        }
    }
}
From source file:net.ion.craken.node.problem.distribute.DemoActions.java
License:Open Source License
/**
 * Creates a new document having just one field containing a string
 *
 * @param line The text snippet to add
 * @throws IOException
 */
public void addNewDocument(String line) throws IOException {
    IndexWriterConfig iwconfig = new IndexWriterConfig(SearchConstant.LuceneVersion, analyzer);
    IndexWriter iw = new IndexWriter(idir, iwconfig);
    try {
        Document doc = new Document();
        Field field = new Field(MAIN_FIELD, line, Store.YES, Index.ANALYZED);
        doc.add(field);
        iw.addDocument(doc);
        iw.commit();
    } finally {
        iw.close();
    }
}
From source file:net.mad.ads.services.geo.lucene.GeoIpIndex.java
License:Open Source License
public void importIPs(String path) {
    try {
        if (!path.endsWith("/")) {
            path += "/";
        }
        Directory directory = FSDirectory.open(new File(db, "geo"));
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31,
                new StandardAnalyzer(Version.LUCENE_31));
        IndexWriter writer = new IndexWriter(directory, config);

        List<String> cnames = new ArrayList<String>();
        BufferedReader br = new BufferedReader(new FileReader(path + "GeoLiteCity-Blocks.csv"));
        CSVReader reader = new CSVReader(br, ',', '\"', 2);

        int count = 0;
        String[] values;
        Map<String, Map<String, String>> locations = getLocations(path);
        while ((values = reader.readNext()) != null) {
            String ipfrom = values[0];
            String ipto = values[1];
            String locid = values[2];
            Map<String, String> location = locations.get(locid);

            Document doc = new Document();
            doc.add(new Field("city", location.get("city"), Store.YES, Index.ANALYZED));
            doc.add(new Field("postalcode", location.get("postalcode"), Store.YES, Index.ANALYZED));
            doc.add(new Field("country", location.get("country"), Store.YES, Index.ANALYZED));
            doc.add(new Field("region", location.get("region"), Store.YES, Index.ANALYZED));
            doc.add(new Field("latitude", location.get("latitude"), Store.YES, Index.ANALYZED));
            doc.add(new Field("longitude", location.get("longitude"), Store.YES, Index.ANALYZED));

            NumericField ipfromField = new NumericField("ipfrom", 8, Store.YES, true);
            ipfromField.setLongValue(Long.parseLong(ipfrom.trim()));
            doc.add(ipfromField);

            NumericField iptoField = new NumericField("ipto", 8, Store.YES, true);
            iptoField.setLongValue(Long.parseLong(ipto.trim()));
            doc.add(iptoField);

            writer.addDocument(doc);
            count++;
            // commit in batches of 100 documents
            if (count % 100 == 0) {
                writer.commit();
            }
        }
        System.out.println(count + " entries imported");
        writer.optimize();
        writer.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:net.semanticmetadata.lire.benchmarking.CombinationTest.java
License:Open Source License
public void testIndexing() throws IOException {
    ChainedDocumentBuilder cb = new ChainedDocumentBuilder();
    cb.addBuilder(new SurfDocumentBuilder());
    cb.addBuilder(DocumentBuilderFactory.getColorLayoutBuilder());

    System.out.println("-< Getting files to index >--------------");
    ArrayList<String> images = FileUtils.getAllImages(new File(dataPath), true);
    System.out.println("-< Indexing " + images.size() + " files >--------------");

    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath, true);
    int count = 0;
    long time = System.currentTimeMillis();
    for (String identifier : images) {
        Document doc = cb.createDocument(new FileInputStream(identifier), identifier);
        iw.addDocument(doc);
        count++;
        if (count % 100 == 0)
            System.out.println(count + " files indexed.");
        // if (count == 200) break;
    }
    long timeTaken = (System.currentTimeMillis() - time);
    float sec = ((float) timeTaken) / 1000f;
    System.out.println(sec + " seconds taken, " + (timeTaken / count) + " ms per image.");

    iw.commit();
    iw.close();

    System.out.println("-< Local features are getting clustered >--------------");
    BOVWBuilder sh = new BOVWBuilder(IndexReader.open(FSDirectory.open(new File(indexPath))),
            new SurfFeature(), 200, 8000);
    sh.index();
    System.out.println("-< Indexing finished >--------------");
}