Usage examples for org.apache.lucene.index.IndexReader#document
public final Document document(int docID, Set<String> fieldsToLoad) throws IOException
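This overload loads only the stored fields named in fieldsToLoad instead of materializing every stored field of the document. A minimal, hypothetical sketch of a call site follows (the index path, the docID 0, and the "title" field name are placeholders and assume an existing index with at least one live document carrying a stored "title" field):

import java.io.File;
import java.util.Collections;
import java.util.Set;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class DocumentOverloadSketch {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index (path is a placeholder).
        DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            // Load only the "title" stored field of document 0; all other stored fields are skipped.
            Set<String> fieldsToLoad = Collections.singleton("title");
            Document doc = reader.document(0, fieldsToLoad);
            System.out.println(doc.get("title"));
        } finally {
            reader.close();
        }
    }
}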
From source file:com.o19s.solr.swan.highlight.TermVectorFun.java
License:Apache License
@Test
public void testBlah() throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    // Index some made up content
    IndexWriterConfig iwf = new IndexWriterConfig(Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47));
    IndexWriter writer = new IndexWriter(ramDir, iwf);
    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setTokenized(true);
    ft.setStored(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.freeze();
    for (int i = 0; i < DOCS.length; i++) {
        Document doc = new Document();
        StringField id = new StringField("id", "doc_" + i, StringField.Store.YES);
        doc.add(id);
        // Store both position and offset information
        Field text = new Field("content", DOCS[i], ft);
        // Field.Index.ANALYZED,
        // Field.TermVector.WITH_POSITIONS_OFFSETS);
        doc.add(text);
        writer.addDocument(doc);
    }
    //writer.close();

    // Get a searcher
    AtomicReader dr = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(writer, true));
    IndexSearcher searcher = new IndexSearcher(dr);

    // Do a search using SpanQuery
    SpanTermQuery fleeceQ = new SpanTermQuery(new Term("content", "fleece"));
    TopDocs results = searcher.search(fleeceQ, 10);
    for (int i = 0; i < results.scoreDocs.length; i++) {
        ScoreDoc scoreDoc = results.scoreDocs[i];
        System.out.println("Score Doc: " + scoreDoc);
    }
    IndexReader reader = searcher.getIndexReader();
    Bits acceptDocs = null;
    Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
    Spans spans = fleeceQ.getSpans(dr.getContext(), acceptDocs, termContexts);
    while (spans.next()) {
        System.out.println("Doc: " + spans.doc() + " Start: " + spans.start() + " End: " + spans.end());
        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor("content");
        reader.document(spans.doc(), visitor);
        Terms terms = reader.getTermVector(spans.doc(), "content");
        TermsEnum tenum = terms.iterator(null);
        // AttributeSource as = tenum.attributes();
        while (tenum.next() != null) {
            System.out.println(tenum.term().utf8ToString());
        }
        for (long pos = 0L; pos < spans.end(); pos++) {
            // tenum.next();
            // if (tenum.ord() < pos) continue;
            // System.out.println(tenum.term());
            reader.document(spans.doc(), visitor);
            // String[] values = visitor.getDocument().getValues("content");
            // List<String> a = new ArrayList<String>();
            // build up the window
            // tvm.start = spans.start() - window;
            // tvm.end = spans.end() + window;
            // reader.getTermFreqVector(spans.doc(), "content", tvm);
            // for (WindowEntry entry : tvm.entries.values()) {
            //     System.out.println("Entry: " + entry);
            // }
            // clear out the entries for the next round
            // tvm.entries.clear();
        }
    }
}
From source file:demo.jaxrs.search.server.Catalog.java
License:Apache License
@GET
@Produces(MediaType.APPLICATION_JSON)
public JsonArray getBooks() throws IOException {
    final IndexReader reader = DirectoryReader.open(directory);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final JsonArrayBuilder builder = Json.createArrayBuilder();

    try {
        final Query query = new MatchAllDocsQuery();

        for (final ScoreDoc scoreDoc : searcher.search(query, 1000).scoreDocs) {
            final DocumentStoredFieldVisitor fieldVisitor = new DocumentStoredFieldVisitor(
                    LuceneDocumentMetadata.SOURCE_FIELD);
            reader.document(scoreDoc.doc, fieldVisitor);
            builder.add(fieldVisitor.getDocument().getField(LuceneDocumentMetadata.SOURCE_FIELD).stringValue());
        }

        return builder.build();
    } finally {
        reader.close();
    }
}
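The Catalog method above loads a single stored field through a DocumentStoredFieldVisitor. The Set<String> overload documented at the top of this page is a possible alternative for the same single-field load; the sketch below shows it as a small self-contained helper (the class and method names are illustrative, not part of the Catalog code):

import java.io.IOException;
import java.util.Collections;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;

public final class StoredFieldUtil {

    private StoredFieldUtil() {
    }

    /** Loads one stored field value for the given docID, or null if the field is absent. */
    public static String loadSingleStoredField(IndexReader reader, int docID, String fieldName) throws IOException {
        // Only the named field is materialized; all other stored fields are skipped.
        Document doc = reader.document(docID, Collections.singleton(fieldName));
        return doc.get(fieldName);
    }
}

The Set<String> overload is a convenience that builds a stored-field visitor internally, so for a single-field load the two forms are equivalent.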
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
/**
 * Sets up the indexer just for reading... if needed for writing only, call
 * setupForWrite. If both read and write are needed, call both.
 */
synchronized void setupForRead() {
    log.info("setting up index for read only access");
    long startTime = System.currentTimeMillis();

    //closeHandles();
    try {
        setupDirectory();

        String[] defaultSearchFields, defaultSearchFieldsOriginal;
        String[] defaultSearchFieldSubject = new String[] { "title" }; // for subject only search
        String[] defaultSearchFieldCorrespondents;
        // body field should be there, as the content of the attachment lies in this field, should also include meta field?
        // why the search over en-names and en-names-original when body/body_original is included in the search fields?
        defaultSearchFields = new String[] { "body", "title", "to_names", "from_names", "cc_names", "bcc_names",
                "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        defaultSearchFieldsOriginal = new String[] { "body_original", "title" }; // we want to leave title there because we want to always hit the title -- discussed with Peter June 27 2015
        defaultSearchFieldCorrespondents = new String[] { "to_names", "from_names", "cc_names", "bcc_names",
                "to_emails", "from_emails", "cc_emails", "bcc_emails" };
        // names field added above after email discussion with Sit 6/11/2013. problem is that we're not using the Lucene EnglishPossessiveFilter, so
        // NER will extract the name Stanford University in a sentence like:
        // "This is Stanford University's website."
        // but when the user clicks on the name "Stanford University" in say monthly cards, we
        // will not match the message with this sentence because of the apostrophe.

        // for searching an attachment with fileName
        String[] metaSearchFields = new String[] { "fileName" };

        // Parse a simple query that searches for "text":
        if (parser == null) {
            //parser = new QueryParser(MUSE_LUCENE_VERSION, defaultSearchField, analyzer);
            parser = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFields, analyzer);
            parserOriginal = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldsOriginal, analyzer);
            parserSubject = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldSubject, analyzer);
            parserCorrespondents = new MultiFieldQueryParser(LUCENE_VERSION, defaultSearchFieldCorrespondents, analyzer);
            parserMeta = new MultiFieldQueryParser(LUCENE_VERSION, metaSearchFields, new KeywordAnalyzer());
        }

        /*
         * Bunch of gotchas here.
         * It's a bad idea to store Lucene internal docIds, as no assumptions about the internal docIds should be made;
         * not even that they are serial. When searching, Lucene may ignore logically deleted docs.
         * Lucene does not handle deleted docs, and having these docs in search may bring down the search performance by 50%.
         * Deleted docs are cleaned only during merging of indices.
         */
        int numContentDocs = 0, numContentDeletedDocs = 0, numAttachmentDocs = 0, numAttachmentDeletedDocs = 0;

        if (DirectoryReader.indexExists(directory)) {
            DirectoryReader ireader = DirectoryReader.open(directory);
            if (ireader.numDeletedDocs() > 0)
                log.warn("!!!!!!!\nIndex reader has " + ireader.numDocs() + " doc(s) of which "
                        + ireader.numDeletedDocs() + " are deleted\n!!!!!!!!!!");
            isearcher = new IndexSearcher(ireader);

            contentDocIds = new LinkedHashMap<>();
            numContentDocs = ireader.numDocs();
            numContentDeletedDocs = ireader.numDeletedDocs();

            Bits liveDocs = MultiFields.getLiveDocs(ireader);
            Set<String> fieldsToLoad = new HashSet<>();
            fieldsToLoad.add("docId");
            for (int i = 0; i < ireader.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                contentDocIds.put(i, doc.get("docId"));
            }
            log.info("Loaded: " + contentDocIds.size() + " content docs");
        }

        if (DirectoryReader.indexExists(directory_blob)) {
            IndexReader ireader_blob = DirectoryReader.open(directory_blob);
            isearcher_blob = new IndexSearcher(ireader_blob); // read-only=true
            blobDocIds = new LinkedHashMap<Integer, String>();

            numAttachmentDocs = ireader_blob.numDocs();
            numAttachmentDeletedDocs = ireader_blob.numDeletedDocs();

            Bits liveDocs = MultiFields.getLiveDocs(ireader_blob);
            Set<String> fieldsToLoad = new HashSet<String>();
            fieldsToLoad.add("docId");
            for (int i = 0; i < ireader_blob.maxDoc(); i++) {
                org.apache.lucene.document.Document doc = ireader_blob.document(i, fieldsToLoad);
                if (liveDocs != null && !liveDocs.get(i))
                    continue;
                if (doc == null || doc.get("docId") == null)
                    continue;
                blobDocIds.put(i, doc.get("docId"));
            }
            log.info("Loaded: " + blobDocIds.size() + " attachment docs");
        }

        log.warn("Number of content docs: " + numContentDocs + ", number deleted: " + numContentDeletedDocs);
        log.warn("Number of attachment docs: " + numAttachmentDocs + ", number deleted: " + numAttachmentDeletedDocs);

        if (dirNameToDocIdMap == null)
            dirNameToDocIdMap = new LinkedHashMap<String, Map<Integer, String>>();
    } catch (Exception e) {
        Util.print_exception(e, log);
    }
    log.info("Setting up index for read took " + (System.currentTimeMillis() - startTime) + " ms");
}
From source file:eu.eexcess.sourceselection.redde.Redde.java
License:Apache License
private LinkedList<ScoreDoc> filterDocumentsFromSource(DatabaseDetails databaseDetails,
        ScoreDoc[] rankedCentralizedSampleDocuments, IndexReader reader) throws IOException {

    LinkedList<ScoreDoc> filtered = new LinkedList<ScoreDoc>();
    Set<String> fields = new HashSet<String>();
    fields.add(Settings.IndexFields.IndexNameField);

    for (ScoreDoc scoreDocument : rankedCentralizedSampleDocuments) {
        Document document = reader.document(scoreDocument.doc, fields);
        if (document.getField(Settings.IndexFields.IndexNameField).stringValue()
                .compareTo(databaseDetails.indexName) == 0) {
            filtered.add(scoreDocument);
        }
    }
    return filtered;
}
From source file:fi.semantum.strategia.Lucene.java
License:Open Source License
public static synchronized List<String> search(String databaseId, String search) throws IOException {

    ArrayList<String> result = new ArrayList<String>();

    IndexReader reader = null;
    try {
        reader = DirectoryReader.open(getDirectory(databaseId));
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser(Version.LUCENE_4_9, "text", getAnalyzer());
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse(search);

        TopDocs docs = searcher.search(query, Integer.MAX_VALUE);
        for (ScoreDoc scoreDoc : docs.scoreDocs) {
            try {
                DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
                reader.document(scoreDoc.doc, visitor);
                Document doc = visitor.getDocument();
                result.add(doc.get("uuid"));
            } catch (CorruptIndexException e) {
                throw new IOException(e);
            }
        }
    } catch (ParseException e) {
        throw new IOException(e);
    } finally {
        if (reader != null)
            reader.close();
    }
    return result;
}
From source file:lucenetools.TermData.java
License:Apache License
/**
 * Main application.
 *
 * @param args the command line arguments
 */
public static void main(String[] args) {
    Options opts = new Options();
    CommandLine commandLine = new CommandLine();

    // if no command line options specified, user wants help
    if (0 == args.length) {
        commandLine.showHelp();
        System.exit(0);
    }

    // extract command line args and store in opts
    if (!commandLine.parse(args, opts))
        System.exit(1);

    if (opts.showHelp) {
        commandLine.showHelp();
        System.exit(0);
    }

    // validate all command line options
    if (!commandLine.isValid(opts))
        System.exit(1);

    // report all command line options to the user
    System.out.println("\nLuceneToMtx version " + VERSION + ".");
    commandLine.printOpts(opts);

    long maxMemory = Runtime.getRuntime().maxMemory() / 1024 / 1024;
    System.out.println("Java runtime max memory: " + maxMemory + " MB.");

    // Build a map and assign a dictionary index to each term.
    // Include only those terms that survive the min term freq cutoff.
    Map<String, Integer> dictMap = new TreeMap<>();

    File file = null;
    System.out.println("Processing index...");
    try {
        file = new File(opts.indexDir);
        IndexReader reader = DirectoryReader.open(FSDirectory.open(file));
        TermsEnum te = null;
        int nnz = 0, numCols = 0, maxDocs = reader.maxDoc();
        LinkedList<FeatureVector> matrixData = new LinkedList<>();

        // add other fields
        Collection<String> fields = new ArrayList<>();
        if (opts.fields > 0) {
            fields = MultiFields.getIndexedFields(reader);
            fields.remove(CONTENTSFIELD);
            fields.remove(PATHFIELD);
        }

        if (!extractTerms(reader, dictMap, opts.minTermFreq, maxDocs - 1, opts.maxTermPercentage))
            System.exit(1);

        // set of field names to extract
        Set<String> fieldSet = new HashSet<>();
        fieldSet.add(PATHFIELD);
        for (String s : fields) {
            fieldSet.add(s);
        }

        for (int i = 0; i < maxDocs; ++i) {
            // get term vector for next document
            Terms terms = reader.getTermVector(i, CONTENTSFIELD);
            if (terms == null)
                continue;

            te = terms.iterator(te);
            FeatureVector fv = new FeatureVector(numCols);
            int numEntries = buildFeatureVector(fv, te, dictMap);
            if (numEntries > 0) {
                // extract document path and save with FeatureVector
                Document doc = reader.document(i, fieldSet);
                fv.docPath = doc.get(PATHFIELD);
                // add any additional fields
                for (String s : fields) {
                    fv.fields.put(s, doc.get(s));
                }
                //System.out.println("processing document:" + fv.docPath);
                matrixData.add(fv);
                nnz += numEntries;
                ++numCols;
            }
        }

        // Sort the feature vectors by their document path field. Write
        // the matrix columns in this sorted order.
        Collections.sort(matrixData, new FeatureVectorComparator());

        File outdir = new File(opts.outDir);
        writeMatrixMarketFile(new File(outdir, MATRIXFILE), matrixData, dictMap.size(), numCols, nnz);
        System.out.println("Wrote " + MATRIXFILE + ".");
        writeDictionaryFile(new File(outdir, DICTFILE), dictMap);
        System.out.println("Wrote " + DICTFILE + ".");
        writeDocumentFile(new File(outdir, DOCFILE), matrixData);
        System.out.println("Wrote " + DOCFILE + ".");
        writeFieldFiles(outdir, fields, matrixData);
    } catch (IndexNotFoundException e) {
        if (null != file) {
            System.out.println("Lucene index not found in: " + file.getAbsolutePath());
        }
    } catch (IOException e) {
        System.out.println("LuceneToMtx exception: caught a " + e.getClass() + "\nMessage: " + e.getMessage());
    }
}
From source file:lux.CachingDocReader.java
License:Mozilla Public License
private XdmNode get(int docID, int luceneDocID, IndexReader reader) throws IOException {
    XdmNode node = cache.get(docID);
    if (node != null) {
        ++cacheHits;
        return node;
    }
    DocumentStoredFieldVisitor fieldSelector = new DocumentStoredFieldVisitor();
    reader.document(luceneDocID, fieldSelector);
    Document document = fieldSelector.getDocument();
    return getXdmNode(docID, document);
}
From source file:net.tourbook.search.FTSearchManager.java
License:Open Source License
/**
 * Creating the result is complicated because the highlights are listed by field and not by hit,
 * therefore the structure must be inverted.
 *
 * @param highlights
 * @param indexReader
 * @param searchResult
 * @param docStartIndex
 * @param docids
 * @throws IOException
 */
private static void search_CreateResult(final Map<String, String[]> highlights, final IndexReader indexReader,
        final SearchResult searchResult, final int[] docids, final int docStartIndex) throws IOException {

    if (highlights.size() == 0) {
        return;
    }

    final Set<Entry<String, String[]>> fields = highlights.entrySet();
    Entry<String, String[]> firstHit;
    try {
        firstHit = fields.iterator().next();
    } catch (final Exception e) {
        return;
    }

    final int numberOfHits = firstHit.getValue().length;

    // create result items
    final SearchResultItem[] resultItems = new SearchResultItem[numberOfHits];
    final ArrayList<SearchResultItem> searchResultItems = searchResult.items;

    for (int hitIndex = 0; hitIndex < numberOfHits; hitIndex++) {
        final SearchResultItem resultItem = new SearchResultItem();
        resultItems[hitIndex] = resultItem;
        searchResultItems.add(resultItem);
    }

    boolean isDocRead = false;
    final Set<String> fieldsToLoad = new HashSet<String>();
    fieldsToLoad.add(SEARCH_FIELD_DOC_SOURCE);
    fieldsToLoad.add(SEARCH_FIELD_TOUR_ID);
    fieldsToLoad.add(SEARCH_FIELD_MARKER_ID);
    fieldsToLoad.add(SEARCH_FIELD_TIME);

    for (final Entry<String, String[]> field : fields) {

        final String fieldName = field.getKey();
        final String[] snippets = field.getValue();

        for (int hitIndex = 0; hitIndex < snippets.length; hitIndex++) {

            final SearchResultItem resultItem = resultItems[hitIndex];
            final String snippet = snippets[hitIndex];

            switch (fieldName) {
            case SEARCH_FIELD_DESCRIPTION:
                resultItem.description = snippet;
                break;

            case SEARCH_FIELD_TITLE:
                resultItem.title = snippet;
                break;
            }

            if (isDocRead == false) {

                final int docId = docids[hitIndex];
                final Document doc = indexReader.document(docId, fieldsToLoad);

                resultItem.docId = docId;
                // resultItem.score = scoreDocs[docStartIndex + hitIndex].score;

                for (final IndexableField indexField : doc.getFields()) {

                    final String docFieldName = indexField.name();

                    switch (docFieldName) {
                    case SEARCH_FIELD_DOC_SOURCE:
                        resultItem.docSource = indexField.numericValue().intValue();
                        break;

                    case SEARCH_FIELD_TOUR_ID:
                        resultItem.tourId = indexField.stringValue();
                        break;

                    case SEARCH_FIELD_MARKER_ID:
                        resultItem.markerId = indexField.stringValue();
                        break;

                    case SEARCH_FIELD_TIME:
                        resultItem.tourStartTime = indexField.numericValue().longValue();
                        break;
                    }
                }
            }
        }

        // get doc fields only once
        isDocRead = true;
    }
}
From source file:nl.strohalm.cyclos.utils.lucene.LuceneQueryHandler.java
License:Open Source License
public <E extends Entity & Indexable> E toEntity(final IndexReader reader, final int docId,
        final Class<E> entityType, final Relationship... fetch) {
    try {
        Document doc = reader.document(docId, IdFieldSelector.getInstance());
        long id = Long.parseLong(doc.get("id"));
        E entity = EntityHelper.reference(entityType, id);
        entity = fetchDao.fetch(entity, fetch);
        return entity;
    } catch (EntityNotFoundException e) {
        return null;
    } catch (Exception e) {
        throw new DaoException(e);
    }
}
From source file:org.apache.blur.manager.IndexManager.java
License:Apache License
public static void fetchRow(IndexReader reader, String table, String shard, Selector selector,
        FetchResult fetchResult, Query highlightQuery, FieldManager fieldManager, int maxHeap,
        TableContext tableContext, Filter filter) throws CorruptIndexException, IOException {
    try {
        fetchResult.table = table;
        String locationId = selector.locationId;
        int lastSlash = locationId.lastIndexOf('/');
        int docId = Integer.parseInt(locationId.substring(lastSlash + 1));
        if (docId >= reader.maxDoc()) {
            throw new RuntimeException("Location id [" + locationId + "] with docId [" + docId + "] is not valid.");
        }

        boolean returnIdsOnly = false;
        if (selector.columnFamiliesToFetch != null && selector.columnsToFetch != null
                && selector.columnFamiliesToFetch.isEmpty() && selector.columnsToFetch.isEmpty()) {
            // exit early
            returnIdsOnly = true;
        }

        Tracer t1 = Trace.trace("fetchRow - live docs");
        Bits liveDocs = MultiFields.getLiveDocs(reader);
        t1.done();

        ResetableDocumentStoredFieldVisitor fieldVisitor = getFieldSelector(selector);
        if (selector.isRecordOnly()) {
            // select only the row for the given data or location id.
            if (isFiltered(docId, reader, filter)) {
                fetchResult.exists = false;
                fetchResult.deleted = false;
                return;
            } else if (liveDocs != null && !liveDocs.get(docId)) {
                fetchResult.exists = false;
                fetchResult.deleted = true;
                return;
            } else {
                fetchResult.exists = true;
                fetchResult.deleted = false;
                reader.document(docId, fieldVisitor);
                Document document = fieldVisitor.getDocument();
                if (highlightQuery != null && fieldManager != null) {
                    HighlightOptions highlightOptions = selector.getHighlightOptions();
                    String preTag = highlightOptions.getPreTag();
                    String postTag = highlightOptions.getPostTag();
                    try {
                        document = HighlightHelper.highlight(docId, document, highlightQuery, fieldManager,
                                reader, preTag, postTag);
                    } catch (InvalidTokenOffsetsException e) {
                        LOG.error("Unknown error while trying to highlight", e);
                    }
                }
                fieldVisitor.reset();
                fetchResult.recordResult = getRecord(document);
                return;
            }
        } else {
            Tracer trace = Trace.trace("fetchRow - Row read");
            try {
                if (liveDocs != null && !liveDocs.get(docId)) {
                    fetchResult.exists = false;
                    fetchResult.deleted = true;
                    return;
                } else {
                    fetchResult.exists = true;
                    fetchResult.deleted = false;
                    if (returnIdsOnly) {
                        String rowId = selector.getRowId();
                        if (rowId == null) {
                            rowId = getRowId(reader, docId);
                        }
                        fetchResult.rowResult = new FetchRowResult();
                        fetchResult.rowResult.row = new Row(rowId, null);
                    } else {
                        List<Document> docs;
                        AtomicBoolean moreDocsToFetch = new AtomicBoolean(false);
                        AtomicInteger totalRecords = new AtomicInteger();
                        BlurHighlighter highlighter = new BlurHighlighter(highlightQuery, fieldManager, selector);
                        Tracer docTrace = Trace.trace("fetchRow - Document read");
                        docs = BlurUtil.fetchDocuments(reader, fieldVisitor, selector, maxHeap, table + "/" + shard,
                                tableContext.getDefaultPrimeDocTerm(), filter, moreDocsToFetch, totalRecords,
                                highlighter);
                        docTrace.done();
                        Tracer rowTrace = Trace.trace("fetchRow - Row create");
                        Row row = getRow(docs);
                        if (row == null) {
                            String rowId = selector.getRowId();
                            if (rowId == null) {
                                rowId = getRowId(reader, docId);
                            }
                            row = new Row(rowId, null);
                        }
                        fetchResult.rowResult = new FetchRowResult(row, selector.getStartRecord(),
                                selector.getMaxRecordsToFetch(), moreDocsToFetch.get(), totalRecords.get());
                        rowTrace.done();
                    }
                    return;
                }
            } finally {
                trace.done();
            }
        }
    } finally {
        if (fetchResult.rowResult != null) {
            if (fetchResult.rowResult.row != null && fetchResult.rowResult.row.records != null) {
                _readRecordsMeter.mark(fetchResult.rowResult.row.records.size());
            }
            _readRowMeter.mark();
        } else if (fetchResult.recordResult != null) {
            _readRecordsMeter.mark();
        }
    }
}