Usage examples for org.apache.lucene.index.IndexReader.getTermVectors
public abstract Fields getTermVectors(int docID) throws IOException;
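The method returns all term vectors stored for the given document (one Fields instance covering every field that had term vectors enabled at index time), or null if none were stored. Before the full examples, here is a minimal, self-contained sketch of the call; the index path "/path/to/index" and the field name "content" are placeholders, and the code assumes a Lucene 5.x+ index built with term vectors enabled.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermVectorDump {
    public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            int docId = 0; // any valid document id
            Fields vectors = reader.getTermVectors(docId);
            if (vectors == null) {
                return; // no term vectors stored for this document
            }
            Terms terms = vectors.terms("content");
            if (terms == null) {
                return; // this field has no term vector for this document
            }
            TermsEnum termsEnum = terms.iterator();
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                // for term vectors, totalTermFreq() is the frequency within this document
                System.out.println(term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
            }
        }
    }
}

Note that both the Fields result and the per-field Terms can be null when vectors were not stored; every full example below guards against that before iterating.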
From source file:com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java
License:Apache License
/**
 * a constructor.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName);

    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
        return;

    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        // null snippet
        return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null) {
        // null snippet
        return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.maxDoc();

    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (!termSet.contains(term)) {
            continue;
        }
        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        if (dpEnum == null) {
            // null snippet
            return;
        }

        dpEnum.nextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final float weight = (float) (Math
                .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

        final int freq = dpEnum.freq();

        for (int i = 0; i < freq; i++) {
            int pos = dpEnum.nextPosition();

            if (dpEnum.startOffset() < 0) {
                return; // no offsets, null snippet
            }

            if (alwaysHighlightTermSet.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) {
                termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
            }
        }
    }

    // sort by position
    Collections.sort(termList);
}
From source file:it.cnr.ilc.lc.clavius.search.Tester.java
private static void searchWithContext(String term) {
    try {
        logger.info("searchWithContext(" + term + ")");
        SpanQuery spanQuery = new SpanTermQuery(new Term("content", term));
        Directory indexDirectory = FSDirectory.open(
                Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText"));
        DirectoryReader indexReader = DirectoryReader.open(indexDirectory);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        IndexReader reader = searcher.getIndexReader();
        //spanQuery = (SpanQuery) spanQuery.rewrite(reader);
        //SpanWeight weight = (SpanWeight) searcher.createWeight(spanQuery, false);
        Spans spans = spanQuery.createWeight(searcher, false)
                .getSpans(searcher.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS);
        // Spans spans2 = weight.getSpans(reader.leaves().get(0), SpanWeight.Postings.OFFSETS);
        // Spans spans = weight.getSpans(reader.leaves().get(0), SpanWeight.Postings.POSITIONS);
        ScoreDoc[] sc = searcher.search(spanQuery, 10).scoreDocs;
        logger.info("hits :" + sc.length);
        int i;
        if (null != spans) {
            // while ((nextDoc = spans.nextDoc()) != Spans.NO_MORE_DOCS) {
            for (int k = 0; k < sc.length; k++) {
                int docId = sc[k].doc;
                logger.info("docID: " + docId);
                int newDocID = spans.advance(docId);
                logger.info("newDocID: " + newDocID);
                int nextSpan = -1;
                while ((nextSpan = spans.nextStartPosition()) != Spans.NO_MORE_POSITIONS) {
                    logger.info("nextSpan : " + nextSpan);
                    logger.info("spans.startPosition(): " + spans.startPosition());
                    logger.info("spans.endPosition() : " + spans.endPosition());
                    logger.info("spans.width() : " + spans.width());
                    Fields fields = reader.getTermVectors(docId);
                    Terms terms = fields.terms("content");
                    TermsEnum termsEnum = terms.iterator();
                    BytesRef text;
                    PostingsEnum postingEnum = null;
                    int start = spans.startPosition() - 3;
                    int end = spans.endPosition() + 3;
                    while ((text = termsEnum.next()) != null) {
                        // could store the BytesRef here, but String is easier for this example
                        String s = new String(text.bytes, text.offset, text.length);
                        // DocsAndPositionsEnum positionsEnum = termsEnum.docsAndPositions(null, null);
                        postingEnum = termsEnum.postings(postingEnum);
                        if (postingEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            i = 0;
                            int position = -1;
                            while (i < postingEnum.freq() && (position = postingEnum.nextPosition()) != -1) {
                                if (position >= start && position <= end) {
                                    logger.info("pos: " + position + ", term: " + s + " offset: " + text.offset
                                            + " length: " + text.length);
                                }
                                i++;
                            }
                        }
                    }
                }
            }
        } else {
            logger.info("no " + term + " found!");
        }
    } catch (IOException e) {
        logger.error(e.getMessage());
    }
    logger.info("End.");
}
From source file:org.apache.solr.handler.component.AlfrescoSolrHighlighter.java
License:Open Source License
/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query,
        IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();

    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
            (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);

    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }

    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) { // e.g. -1
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }

    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }

    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);

    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);

    List<TextFragment> frags = new ArrayList<>();

    // Try term vectors, which is faster
    // note: offsets are minimally sufficient for this HL.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields,
            maxCharsToAnalyze - 1);
    // We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }

    for (String thisText : fieldValues) {
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }

        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            tstream = tvStream; // single-valued with term vectors
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schemaField, thisText);
        }

        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does,
            // the tokenStream needs to implement reset() efficiently.

            // If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
            // It should be okay if OffsetLimit won't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    tempTokenStream = new CachingTokenFilter(
                            new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }

            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);

            // if the CachingTokenFilter was consumed then use it going forward.
            if (tempTokenStream instanceof CachingTokenFilter
                    && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
            // tstream.reset(); not needed; getBestTextFragments will reset it.
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }

        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        maxCharsToAnalyze -= thisText.length();

        // Highlight!
        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream,
                    fixLocalisedText(thisText), mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment == null) // can happen via mergeContiguousFragments
                    continue;
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0)
                        --mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    } // end field value loop

    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }
        // Truncate list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    return null; // no highlights for this field
}
From source file:org.apache.solr.handler.component.TermVectorComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
        return;
    }

    NamedList<Object> termVectors = new NamedList<Object>();
    rb.rsp.add(TERM_VECTORS, termVectors);

    IndexSchema schema = rb.req.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    String uniqFieldName = null;
    if (keyField != null) {
        uniqFieldName = keyField.getName();
        termVectors.add("uniqueKeyFieldName", uniqFieldName);
    }

    FieldOptions allFields = new FieldOptions();
    // figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    //boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
    // shortcut to all values.
    if (params.getBool(TermVectorParams.ALL, false)) {
        allFields.termFreq = true;
        allFields.positions = true;
        allFields.offsets = true;
        allFields.docFreq = true;
        allFields.tfIdf = true;
    }

    // Build up our per-field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList<List<String>> warnings = new NamedList<List<String>>();
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    Set<String> fields = getFields(rb);
    if (null != fields) {
        // we have specific fields to retrieve, or no fields
        for (String field : fields) {

            // workaround SOLR-3523
            if (null == field || "score".equals(field))
                continue;

            // we don't want to issue warnings about the uniqueKey field
            // since it can cause lots of confusion in distributed requests
            // where the uniqueKey field is injected into the fl for merging
            final boolean fieldIsUniqueKey = field.equals(uniqFieldName);

            SchemaField sf = schema.getFieldOrNull(field);
            if (sf != null) {
                if (sf.storeTermVector()) {
                    FieldOptions option = fieldOptions.get(field);
                    if (option == null) {
                        option = new FieldOptions();
                        option.fieldName = field;
                        fieldOptions.put(field, option);
                    }
                    // get the per-field mappings
                    option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
                    option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
                    option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
                    // Validate these are even an option
                    option.positions = params.getFieldBool(field, TermVectorParams.POSITIONS,
                            allFields.positions);
                    if (option.positions && !sf.storeTermPositions() && !fieldIsUniqueKey) {
                        noPos.add(field);
                    }
                    option.offsets = params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
                    if (option.offsets && !sf.storeTermOffsets() && !fieldIsUniqueKey) {
                        noOff.add(field);
                    }
                } else { // field doesn't have term vectors
                    if (!fieldIsUniqueKey)
                        noTV.add(field);
                }
            } else {
                // field doesn't exist
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
            }
        }
    } // else, deal with all fields

    // NOTE: currently all types of warnings are schema driven, and guaranteed
    // to be consistent across all shards - if additional types of warnings
    // are added that might be different between shards, finishStage() needs
    // to be changed to account for that.
    boolean hasWarnings = false;
    if (!noTV.isEmpty()) {
        warnings.add("noTermVectors", noTV);
        hasWarnings = true;
    }
    if (!noPos.isEmpty()) {
        warnings.add("noPositions", noPos);
        hasWarnings = true;
    }
    if (!noOff.isEmpty()) {
        warnings.add("noOffsets", noOff);
        hasWarnings = true;
    }
    if (hasWarnings) {
        termVectors.add("warnings", warnings);
    }

    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && !docIds.isEmpty()) {
        iter = docIds.iterator();
    } else {
        DocList list = listAndSet.docList;
        iter = list.iterator();
    }

    SolrIndexSearcher searcher = rb.req.getSearcher();
    IndexReader reader = searcher.getIndexReader();
    // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors
    // Only load the id field to get the uniqueKey of that field
    final String finalUniqFieldName = uniqFieldName;

    final List<String> uniqValues = new ArrayList<String>();

    // TODO: is this required to be single-valued? if so, we should STOP
    // once we find it...
    final StoredFieldVisitor getUniqValue = new StoredFieldVisitor() {
        @Override
        public void stringField(FieldInfo fieldInfo, String value) {
            uniqValues.add(value);
        }

        @Override
        public void intField(FieldInfo fieldInfo, int value) {
            uniqValues.add(Integer.toString(value));
        }

        @Override
        public void longField(FieldInfo fieldInfo, long value) {
            uniqValues.add(Long.toString(value));
        }

        @Override
        public Status needsField(FieldInfo fieldInfo) {
            return (fieldInfo.name.equals(finalUniqFieldName)) ? Status.YES : Status.NO;
        }
    };

    TermsEnum termsEnum = null;

    while (iter.hasNext()) {
        Integer docId = iter.next();
        NamedList<Object> docNL = new NamedList<Object>();

        if (keyField != null) {
            reader.document(docId, getUniqValue);
            String uniqVal = null;
            if (uniqValues.size() != 0) {
                uniqVal = uniqValues.get(0);
                uniqValues.clear();
                docNL.add("uniqueKey", uniqVal);
                termVectors.add(uniqVal, docNL);
            }
        } else {
            // support for schemas w/o a unique key,
            termVectors.add("doc-" + docId, docNL);
        }

        if (null != fields) {
            for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
                final String field = entry.getKey();
                final Terms vector = reader.getTermVector(docId, field);
                if (vector != null) {
                    termsEnum = vector.iterator(termsEnum);
                    mapOneVector(docNL, entry.getValue(), reader, docId, vector.iterator(termsEnum), field);
                }
            }
        } else {
            // extract all fields
            final Fields vectors = reader.getTermVectors(docId);
            for (String field : vectors) {
                Terms terms = vectors.terms(field);
                if (terms != null) {
                    termsEnum = terms.iterator(termsEnum);
                    mapOneVector(docNL, allFields, reader, docId, termsEnum, field);
                }
            }
        }
    }
}
From source file:org.eclipse.che.api.search.server.impl.LuceneSearcher.java
License:Open Source License
@Override
public SearchResult search(QueryExpression query) throws InvalidQueryException, QueryExecutionException {
    IndexSearcher luceneSearcher = null;
    try {
        final long startTime = System.currentTimeMillis();
        searcherManager.maybeRefresh();
        luceneSearcher = searcherManager.acquire();

        Query luceneQuery = createLuceneQuery(query);

        ScoreDoc after = null;
        final int numSkipDocs = Math.max(0, query.getSkipCount());
        if (numSkipDocs > 0) {
            after = skipScoreDocs(luceneSearcher, luceneQuery, numSkipDocs);
        }

        final int numDocs = query.getMaxItems() > 0 ? Math.min(query.getMaxItems(), RESULT_LIMIT) : RESULT_LIMIT;

        TopDocs topDocs = luceneSearcher.searchAfter(after, luceneQuery, numDocs, sort, true, true);
        final long totalHitsNum = topDocs.totalHits;

        List<SearchResultEntry> results = newArrayList();
        List<OffsetData> offsetData = Collections.emptyList();

        for (int i = 0; i < topDocs.scoreDocs.length; i++) {
            ScoreDoc scoreDoc = topDocs.scoreDocs[i];
            int docId = scoreDoc.doc;
            Document doc = luceneSearcher.doc(docId);
            if (query.isIncludePositions()) {
                offsetData = new ArrayList<>();
                String txt = doc.get(TEXT_FIELD);
                if (txt != null) {
                    IndexReader reader = luceneSearcher.getIndexReader();
                    TokenStream tokenStream = TokenSources.getTokenStream(TEXT_FIELD,
                            reader.getTermVectors(docId), txt, luceneIndexWriter.getAnalyzer(), -1);
                    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
                    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);

                    QueryScorer queryScorer = new QueryScorer(luceneQuery);
                    // TODO think about this constant
                    queryScorer.setMaxDocCharsToAnalyze(1_000_000);

                    TokenStream newStream = queryScorer.init(tokenStream);
                    if (newStream != null) {
                        tokenStream = newStream;
                    }
                    queryScorer.startFragment(null);
                    tokenStream.reset();

                    int startOffset, endOffset;
                    // TODO think about this constant
                    for (boolean next = tokenStream.incrementToken();
                            next && (offsetAtt.startOffset() < 1_000_000);
                            next = tokenStream.incrementToken()) {
                        startOffset = offsetAtt.startOffset();
                        endOffset = offsetAtt.endOffset();

                        if ((endOffset > txt.length()) || (startOffset > txt.length())) {
                            throw new QueryExecutionException("Token " + termAtt.toString()
                                    + " exceeds length of provided text size " + txt.length());
                        }

                        float res = queryScorer.getTokenScore();
                        if (res > 0.0F && startOffset <= endOffset) {
                            String tokenText = txt.substring(startOffset, endOffset);
                            Scanner sc = new Scanner(txt);
                            int lineNum = 1;
                            long len = 0;
                            String foundLine = "";
                            while (sc.hasNextLine()) {
                                foundLine = sc.nextLine();
                                len += foundLine.length();
                                if (len > startOffset) {
                                    break;
                                }
                                lineNum++;
                            }
                            offsetData.add(
                                    new OffsetData(tokenText, startOffset, endOffset, res, lineNum, foundLine));
                        }
                    }
                }
            }
            String filePath = doc.getField(PATH_FIELD).stringValue();
            LOG.debug("Doc {} path {} score {} ", docId, filePath, scoreDoc.score);
            results.add(new SearchResultEntry(filePath, offsetData));
        }

        final long elapsedTimeMillis = System.currentTimeMillis() - startTime;
        boolean hasMoreToRetrieve = numSkipDocs + topDocs.scoreDocs.length + 1 < totalHitsNum;
        QueryExpression nextPageQueryExpression = null;
        if (hasMoreToRetrieve) {
            nextPageQueryExpression = createNextPageQuery(query, numSkipDocs + topDocs.scoreDocs.length);
        }

        return SearchResult.aSearchResult()
                .withResults(results)
                .withTotalHits(totalHitsNum)
                .withNextPageQueryExpression(nextPageQueryExpression)
                .withElapsedTimeMillis(elapsedTimeMillis)
                .build();
    } catch (ParseException e) {
        throw new InvalidQueryException(e.getMessage(), e);
    } catch (IOException e) {
        throw new QueryExecutionException(e.getMessage(), e);
    } finally {
        try {
            searcherManager.release(luceneSearcher);
        } catch (IOException e) {
            LOG.error(e.getMessage());
        }
    }
}
From source file:org.meresco.lucene.Lucene.java
License:Open Source License
public LuceneResponse similarDocuments(String identifier) throws Throwable {
    SearcherAndTaxonomy reference = data.getManager().acquire();
    try {
        Query idQuery = new TermQuery(new Term(ID_FIELD, identifier));
        TopDocs topDocs = reference.searcher.search(idQuery, 1);
        if (topDocs.totalHits == 0)
            return new LuceneResponse(0);
        int docId = topDocs.scoreDocs[0].doc;

        IndexReader reader = reference.searcher.getIndexReader();
        CommonTermsQuery commonQuery = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, 0.1f);
        Fields termVectors = reader.getTermVectors(docId);
        if (termVectors == null)
            return new LuceneResponse(0);
        for (String field : termVectors) {
            TermsEnum iterator = termVectors.terms(field).iterator(null);
            BytesRef b;
            while ((b = iterator.next()) != null) {
                Term term = new Term(field, b.utf8ToString());
                commonQuery.add(term);
            }
        }
        BooleanQuery query = new BooleanQuery();
        query.add(idQuery, Occur.MUST_NOT);
        query.add(commonQuery, Occur.MUST);
        return executeQuery(query);
    } finally {
        data.getManager().release(reference);
    }
}
From source file:pretraga.IsolationSimilarity.java
public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);

        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);

        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            Terms terms = ff.terms(CONTENT);
            TermsEnum te = terms.iterator();
            Object tmp = te.next();
            while (tmp != null) {
                BytesRef by = (BytesRef) tmp;
                String term = by.utf8ToString();
                ClassicSimilarity sim = null;
                if (searcher.getSimilarity(true) instanceof ClassicSimilarity) {
                    sim = (ClassicSimilarity) searcher.getSimilarity(true);
                }
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                //System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq());

                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(),
                        terms.getDocCount(), terms.getSumTotalTermFreq(), terms.getSumDocFreq());

                Document d = reader.document(docId.get(i));
                if (vector.contains(term)) {
                    float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
                }
                tmp = te.next();
            }
            /*
            Iterator<String> ss = ff.iterator();
            while (ss.hasNext()) {
                String fieldString = ss.next();
                System.out.println(fieldString);
            }
            */
        }
    } catch (Exception e) {
    }
}