List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
From source file:aos.lucene.tools.BerkeleyDbJEIndexer.java
License:Apache License
public static void main(String[] args) throws IOException, DatabaseException { if (args.length != 1) { System.err.println("Usage: BerkeleyDbIndexer <index dir>"); System.exit(-1);/*from w w w . j a v a 2s .c om*/ } File indexFile = new File(args[0]); if (indexFile.exists()) { File[] files = indexFile.listFiles(); for (int i = 0; i < files.length; i++) if (files[i].getName().startsWith("__")) files[i].delete(); indexFile.delete(); } indexFile.mkdir(); EnvironmentConfig envConfig = new EnvironmentConfig(); DatabaseConfig dbConfig = new DatabaseConfig(); envConfig.setTransactional(true); envConfig.setAllowCreate(true); dbConfig.setTransactional(true); dbConfig.setAllowCreate(true); Environment env = new Environment(indexFile, envConfig); Transaction txn = env.beginTransaction(null, null); Database index = env.openDatabase(txn, "__index__", dbConfig); Database blocks = env.openDatabase(txn, "__blocks__", dbConfig); txn.commit(); txn = env.beginTransaction(null, null); JEDirectory directory = new JEDirectory(txn, index, blocks); IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_46), true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); writer.merge(writer.getNextMerge()); writer.close(); directory.close(); txn.commit(); index.close(); blocks.close(); env.close(); LOGGER.info("Indexing Complete"); }
From source file:aos.lucene.tools.ChainedFilterTest.java
License:Apache License
@Override public void setUp() throws Exception { directory = new RAMDirectory(); IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(Version.LUCENE_46), IndexWriter.MaxFieldLength.UNLIMITED); Calendar cal = Calendar.getInstance(); cal.set(2009, 1, 1, 0, 0);//from w w w . java2 s .co m for (int i = 0; i < MAX; i++) { Document doc = new Document(); doc.add(new Field("key", "" + (i + 1), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("owner", (i < MAX / 2) ? "bob" : "sue", Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("date", DateTools.timeToString(cal.getTimeInMillis(), DateTools.Resolution.DAY), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); cal.add(Calendar.DATE, 1); } writer.close(); searcher = new IndexSearcher(directory); BooleanQuery bq = new BooleanQuery(); bq.add(new TermQuery(new Term("owner", "bob")), BooleanClause.Occur.SHOULD); bq.add(new TermQuery(new Term("owner", "sue")), BooleanClause.Occur.SHOULD); query = bq; cal.set(2099, 1, 1, 0, 0); dateFilter = TermRangeFilter.Less("date", DateTools.timeToString(cal.getTimeInMillis(), DateTools.Resolution.DAY));// C bobFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("owner", "bob")))); sueFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("owner", "sue")))); }
From source file:aos.lucene.tools.FastVectorHighlighterSample.java
License:Apache License
static void makeIndex() throws IOException { IndexWriter writer = new IndexWriter(dir, analyzer, true, MaxFieldLength.UNLIMITED); for (String d : DOCS) { Document doc = new Document(); doc.add(new Field(F, d, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); writer.addDocument(doc); }//w ww .j av a 2s . c o m writer.close(); }
From source file:aos.lucene.tools.SpatialLuceneExample.java
License:Apache License
private void addLocation(IndexWriter writer, String name, double lat, double lng) throws IOException { Document doc = new Document(); doc.add(new Field("name", name, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field(latField, NumericUtils.doubleToPrefixCoded(lat), // #A Field.Store.YES, Field.Index.NOT_ANALYZED)); // #A doc.add(new Field(lngField, NumericUtils.doubleToPrefixCoded(lng), // #A Field.Store.YES, Field.Index.NOT_ANALYZED)); // #A doc.add(new Field("metafile", "doc", Field.Store.YES, Field.Index.ANALYZED)); IProjector projector = new SinusoidalProjector(); // #B int startTier = 5; // #C int endTier = 15; // #C for (; startTier <= endTier; startTier++) { CartesianTierPlotter ctp;/*from w w w. ja va 2 s.c o m*/ ctp = new CartesianTierPlotter(startTier, // #D projector, tierPrefix); // #D double boxId = ctp.getTierBoxId(lat, lng); // #D LOGGER.info("Adding field " + ctp.getTierFieldName() + ":" + boxId); doc.add(new Field(ctp.getTierFieldName(), NumericUtils // #E .doubleToPrefixCoded(boxId), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); } writer.addDocument(doc); LOGGER.info("===== Added Doc to index ===="); }
From source file:api.startup.PDFIndexer.java
License:Open Source License
/** * Indexes a single document and writes it to the given index writer * @param writer - the index writer to writer * @param metadata - the document// w w w . j a v a 2s . c o m * @throws IOException */ static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException { Path file = Paths.get(metadata.getFilename()); try { Document doc = new Document(); Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES); doc.add(pathField); // Add Document metadata // doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES)); doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES)); // End of Document Metadata // Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(), Field.Store.YES); doc.add(modified); PDFTextExtractor extractor = new PDFTextExtractor(); // Get the string contents String textContents = extractor.extractText(file.toString()); // Store the string contents FieldType contentsType = new FieldType(); contentsType.setStored(true); contentsType.setTokenized(true); contentsType.setStoreTermVectors(true); contentsType.setStoreTermVectorPositions(true); contentsType.setStoreTermVectorPayloads(true); contentsType.setStoreTermVectorOffsets(true); contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType); doc.add(contents); if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): log.info("adding " + file + " to index"); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: log.info("updating " + file + " in index"); writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc); } } catch (IOException e) { log.error("Failed to read file " + metadata.getFilename()); } }
From source file:aplicacion.sistema.indexer.test.IndexFiles.java
License:Apache License
static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); }//from ww w . j a va 2 s . c o m } } else { System.out.println("adding " + file); try { writer.addDocument(FileDocument.Document(file)); } // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help catch (FileNotFoundException fnfe) { ; } } } }
From source file:Application.mediaIndexer.java
/** * Indexes a single document//from ww w. ja v a 2 s. c o m * * @throws TikaException * @throws SAXException */ public static void indexDoc(IndexWriter writer, Path file, TextArea results, long lastModified) throws IOException, SAXException, TikaException { AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); try (InputStream stream = Files.newInputStream(file)) { parser.parse(stream, handler, metadata); Document doc = new Document(); String[] metadataNames = metadata.names(); for (String name : metadataNames) doc.add(new TextField(name, metadata.get(name), Field.Store.YES)); doc.add(new StringField("path", file.toString(), Field.Store.YES)); doc.add(new LongPoint("modified", lastModified)); results.appendText("Title: " + metadata.get("title") + "\n"); results.appendText("Artists: " + metadata.get("xmpDM:artist") + "\n"); results.appendText("Genre: " + metadata.get("xmpDM:genre") + "\n"); results.appendText("Year: " + metadata.get("xmpDM:releaseDate") + "\n"); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can // be there): results.appendText("adding " + file + "\n"); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been // indexed): results.appendText("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:application.ReviewDocumentIndexer.java
License:Open Source License
/** * @param args/*ww w . jav a 2 s .c om*/ */ @SuppressWarnings("deprecation") public static void main(String[] args) { // Parse command line arguments. Exit program is provided arguments are insufficient ReviewDocumentIndexer indexer = new ReviewDocumentIndexer(args); if (indexer == null) return; // Open a new index IndexWriter index = null; try { index = new IndexWriter(new SimpleFSDirectory(new File(Paths.luceneIndex)), new ReviewTextAnalyzer(indexer), indexer.new_index ? true : false, MaxFieldLength.UNLIMITED); if (indexer.pause_every > 2) { index.setMaxBufferedDocs(indexer.pause_every); } index.setMaxMergeDocs(Config.maxMergeDocs); index.setMergeFactor(Config.mergeFactor); } catch (CorruptIndexException e) { AppLogger.error.log(Level.SEVERE, "Lucene detected an inconsistency upon opening the index located at " + Paths.luceneIndex); throw new RuntimeException("Exiting application", e); } catch (LockObtainFailedException e) { AppLogger.error.log(Level.SEVERE, "Index located at " + Paths.luceneIndex + " is already open by another Lucene process"); throw new RuntimeException("Exiting application", e); } catch (IOException e) { AppLogger.error.log(Level.SEVERE, "Could not access location " + Paths.luceneIndex); throw new RuntimeException("Exiting application", e); } // Load a number of reviews from database NumberFormat docIdFormat = TokenListsCollector.defaultDocIdFormat(); try { DatabaseReviewCollection reviews = new DatabaseReviewCollection(indexer.pause_every); reviews.setLimits(indexer.min_reviewid, indexer.stop_after); int indexed_counter = 0; while (reviews.hasNextSegment()) { System.out.print(Calendar.getInstance().getTime().toGMTString()); System.out.print(" Loading from DB... "); reviews.loadNextSegment(); Iterator<Review> reviewsIterator = reviews.getIterator(); System.out.print(" Indexing... "); while (reviewsIterator.hasNext()) { DatabaseReview dbr = (DatabaseReview) reviewsIterator.next(); int dbr_id = dbr.getReviewid(); int dbr_rating = dbr.getRating(); try { indexer.theReviewId.set(dbr_id); indexer.theStats.setCurrent(dbr_id, dbr_rating); index.addDocument(dbr.getDocumentForIndexing()); indexed_counter++; // Also, keep track of the rating and length of this review indexer.theStats.storeCurrent(); } catch (CorruptIndexException e) { AppLogger.error.log(Level.SEVERE, "Lucene detected an inconsistency upon saving review #" + Integer.toString(dbr.getReviewid()) + "to the index located at " + Paths.luceneIndex); return; } catch (IOException e) { AppLogger.error.log(Level.WARNING, "Review #" + Integer.toString(dbr.getReviewid()) + " could not be indexed"); } } // Backup everything System.out.print("Indexed " + indexed_counter + " reviews total. "); if (indexer.pause_every > 0) { System.out.print("Saving tokenlists... "); indexer.theTokenLists.writeNextFile(docIdFormat); System.out.print("Saving state... "); try { index.commit(); indexer.saveState(); } catch (CorruptIndexException e) { AppLogger.error.log(Level.SEVERE, "Committing index changes failed on review #" + indexer.theReviewId.get() + "due to CorruptIndexException"); return; } catch (IOException e) { AppLogger.error.log(Level.WARNING, "Committing index changes failed on review #" + indexer.theReviewId.get() + "due to IOException"); } } System.out.print("DONE\n"); reviews.reset(); } } catch (SQLException e) { AppLogger.error.log(Level.SEVERE, "An exception occured while trying to access the database.\n" + e.getMessage()); return; } try { index.close(); indexer.backupIndex(); } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.err.println("Indexing successfully completed!"); return; }
From source file:apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) { Options options = new Options(); options.addOption("i", null, true, "input file"); options.addOption("o", null, true, "output directory"); options.addOption("r", null, true, "optional output TREC-format QREL file"); options.addOption("bm25_b", null, true, "BM25 parameter: b"); options.addOption("bm25_k1", null, true, "BM25 parameter: k1"); options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity"); Joiner commaJoin = Joiner.on(','); Joiner spaceJoin = Joiner.on(' '); options.addOption("source_type", null, true, "document source type: " + commaJoin.join(SourceFactory.getDocSourceList())); // If you increase this value, you may need to modify the following line in *.sh file // export MAVEN_OPTS="-Xms8192m -server" double ramBufferSizeMB = 1024 * 8; // 8 GB CommandLineParser parser = new org.apache.commons.cli.GnuParser(); IndexWriter indexWriter = null; BufferedWriter qrelWriter = null; int docNum = 0; try {/* w w w . j a va 2 s . c o m*/ CommandLine cmd = parser.parse(options, args); String inputFileName = null, outputDirName = null, qrelFileName = null; if (cmd.hasOption("i")) { inputFileName = cmd.getOptionValue("i"); } else { Usage("Specify 'input file'", options); } if (cmd.hasOption("o")) { outputDirName = cmd.getOptionValue("o"); } else { Usage("Specify 'index directory'", options); } if (cmd.hasOption("r")) { qrelFileName = cmd.getOptionValue("r"); } String sourceName = cmd.getOptionValue("source_type"); if (sourceName == null) Usage("Specify document source type", options); if (qrelFileName != null) qrelWriter = new BufferedWriter(new FileWriter(qrelFileName)); File outputDir = new File(outputDirName); if (!outputDir.exists()) { if (!outputDir.mkdirs()) { System.out.println("couldn't create " + outputDir.getAbsolutePath()); System.exit(1); } } if (!outputDir.isDirectory()) { System.out.println(outputDir.getAbsolutePath() + " is not a directory!"); System.exit(1); } if (!outputDir.canWrite()) { System.out.println("Can't write to " + outputDir.getAbsolutePath()); System.exit(1); } boolean useFixedBM25 = cmd.hasOption("bm25fixed"); float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT; if (cmd.hasOption("bm25_k1")) { try { bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_k1'", options); } } if (cmd.hasOption("bm25_b")) { try { bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_b'", options); } } EnglishAnalyzer analyzer = new EnglishAnalyzer(); FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName)); IndexWriterConfig indexConf = new IndexWriterConfig(analyzer); /* OpenMode.CREATE creates a new index or overwrites an existing one. https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE */ indexConf.setOpenMode(OpenMode.CREATE); indexConf.setRAMBufferSizeMB(ramBufferSizeMB); System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b)); if (useFixedBM25) { System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b)); } else { System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b)); } indexWriter = new IndexWriter(indexDir, indexConf); DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName); DocumentEntry inpDoc = null; TextCleaner textCleaner = new TextCleaner(null); while ((inpDoc = inpDocSource.next()) != null) { ++docNum; Document luceneDoc = new Document(); ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText); String cleanText = spaceJoin.join(cleanedToks); // System.out.println(inpDoc.mDocId); // System.out.println(cleanText); // System.out.println("=============================="); luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES)); luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES)); indexWriter.addDocument(luceneDoc); if (inpDoc.mIsRel != null && qrelWriter != null) { saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0); } if (docNum % 1000 == 0) System.out.println(String.format("Indexed %d documents", docNum)); } } catch (ParseException e) { e.printStackTrace(); Usage("Cannot parse arguments" + e, options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } finally { System.out.println(String.format("Indexed %d documents", docNum)); try { if (null != indexWriter) indexWriter.close(); if (null != qrelWriter) qrelWriter.close(); } catch (IOException e) { System.err.println("IO exception: " + e); e.printStackTrace(); } } }
From source file:arena.lucene.LuceneIndexUpdater.java
License:Open Source License
public int updateIndex(boolean deleteAllFromIndexFirst, Iterable<T> valueobjects) { IndexWriter writer = null; try {/* w w w. ja va2 s. c om*/ writer = new IndexWriter(directoryBean.getDirectory(), analyzer, deleteAllFromIndexFirst, MaxFieldLength.LIMITED); int docCount = 0; for (T vo : valueobjects) { Term pkTerm = this.contentMarshall.getPKTerm(vo); writer.deleteDocuments(pkTerm); Document doc = this.contentMarshall.serialize(vo); if (doc != null) { writer.addDocument(doc); docCount++; } } if (this.searchersToReset != null) { for (LuceneIndexSearcher<?> searcher : this.searchersToReset) { searcher.reset(); } } return docCount; } catch (IOException err) { throw new RuntimeException("Error deleting documents from lucene index", err); } finally { if (writer != null) { try { writer.close(); } catch (IOException err) { } } } }