Usage examples for org.apache.lucene.index.PostingsEnum.OFFSETS (declared as `short OFFSETS`).
Each example below is taken from an open-source project; to view the full source code of an example, follow its Source link.
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/**
 * Checks the two terms enums sequentially, asserting that they enumerate the same
 * terms with the same statistics.
 *
 * <p>If {@code deep} is false, a 'shallow' test is done that doesn't go down to the
 * docs/positions enums; if true, postings are compared at every feature level
 * (payloads+offsets, payloads only, offsets only, positions only, freqs, no freqs),
 * both by sequential iteration and by skipping.
 *
 * @param leftTermsEnum  the expected enum
 * @param rightTermsEnum the actual enum, compared term-by-term against the left
 * @param deep           whether to also compare the underlying postings
 * @param hasPositions   whether the field indexed positions (enables the positional checks)
 */
public void assertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean deep, boolean hasPositions) throws Exception {
    BytesRef term;
    // PostingsEnum instances are deliberately reused (passed back into postings())
    // to also exercise the enum-reuse code path of the codec.
    PostingsEnum leftPositions = null;
    PostingsEnum rightPositions = null;
    PostingsEnum leftDocs = null;
    PostingsEnum rightDocs = null;
    while ((term = leftTermsEnum.next()) != null) {
        assertEquals(term, rightTermsEnum.next());
        assertTermStats(leftTermsEnum, rightTermsEnum);
        if (deep) {
            if (hasPositions) {
                // with payloads + offsets
                assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.ALL),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.ALL));
                assertPositionsSkipping(leftTermsEnum.docFreq(),
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.ALL),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.ALL));
                // with payloads only
                assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.PAYLOADS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.PAYLOADS));
                assertPositionsSkipping(leftTermsEnum.docFreq(),
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.PAYLOADS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.PAYLOADS));
                // with offsets only
                assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.OFFSETS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.OFFSETS));
                assertPositionsSkipping(leftTermsEnum.docFreq(),
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.OFFSETS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.OFFSETS));
                // with positions only
                assertDocsAndPositionsEnum(leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.POSITIONS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.POSITIONS));
                assertPositionsSkipping(leftTermsEnum.docFreq(),
                        leftPositions = leftTermsEnum.postings(leftPositions, PostingsEnum.POSITIONS),
                        rightPositions = rightTermsEnum.postings(rightPositions, PostingsEnum.POSITIONS));
            }
            // with freqs:
            assertDocsEnum(leftDocs = leftTermsEnum.postings(leftDocs),
                    rightDocs = rightTermsEnum.postings(rightDocs));
            // w/o freqs:
            assertDocsEnum(leftDocs = leftTermsEnum.postings(leftDocs, PostingsEnum.NONE),
                    rightDocs = rightTermsEnum.postings(rightDocs, PostingsEnum.NONE));
            // with freqs:
            assertDocsSkipping(leftTermsEnum.docFreq(),
                    leftDocs = leftTermsEnum.postings(leftDocs),
                    rightDocs = rightTermsEnum.postings(rightDocs));
            // w/o freqs:
            assertDocsSkipping(leftTermsEnum.docFreq(),
                    leftDocs = leftTermsEnum.postings(leftDocs, PostingsEnum.NONE),
                    rightDocs = rightTermsEnum.postings(rightDocs, PostingsEnum.NONE));
        }
    }
    // the right enum must be exhausted at the same time as the left one
    assertNull(rightTermsEnum.next());
}
From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java
License:Open Source License
private void fillTermsOfInterest(LeafReader LeafReader, int luceneDoc, Map<Integer, TermInfo> termsOfInterest) throws IOException { // fill in terms of interest Terms terms = LeafReader.getTermVector(luceneDoc, tokenType.name()); TermsEnum termsEnum = terms.iterator(); while (true) { BytesRef term = termsEnum.next(); if (term != null) { String termString = term.utf8ToString(); PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS); if (postingsEnum != null) { postingsEnum.nextDoc();/*from w w w . j a va 2 s . c o m*/ for (int i = 0, len = postingsEnum.freq(); i < len; i++) { int pos = postingsEnum.nextPosition(); if (termsOfInterest.containsKey(pos)) { termsOfInterest.put(pos, new TermInfo(termString, postingsEnum.startOffset(), postingsEnum.endOffset(), pos, 1)); } } } } else { break; } } }
From source file:org.voyanttools.trombone.tool.corpus.DocumentNgrams.java
License:Open Source License
private SimplifiedTermInfo[] getSparseSimplifiedTermInfoArray(CorpusMapper corpusMapper, int luceneDoc, int lastTokenOffset) throws IOException { Keywords stopwords = this.getStopwords(corpusMapper.getCorpus()); Terms terms = corpusMapper.getLeafReader().getTermVector(luceneDoc, tokenType.name()); TermsEnum termsEnum = terms.iterator(); SimplifiedTermInfo[] simplifiedTermInfoArray = new SimplifiedTermInfo[lastTokenOffset + 1]; while (true) { BytesRef term = termsEnum.next(); if (term != null) { String termString = term.utf8ToString(); //if (stopwords.isKeyword(termString)) {continue;} // treat as whitespace or punctuation PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS); while (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) { int freq = postingsEnum.freq(); for (int i = 0, len = freq; i < len; i++) { int pos = postingsEnum.nextPosition(); new SimplifiedTermInfo(termString, pos, 1, freq, postingsEnum.startOffset(), postingsEnum.endOffset()); simplifiedTermInfoArray[pos] = freq > 1 ? new SimplifiedTermInfo(termString, pos, 1, freq, postingsEnum.startOffset(), postingsEnum.endOffset()) : new SimplifiedTermInfo(""); // empty string if not repeating }/*from ww w .java 2s. c o m*/ } } else { break; } } return simplifiedTermInfoArray; }
From source file:org.voyanttools.trombone.tool.corpus.DocumentTerms.java
License:Open Source License
/**
 * Collects {@link DocumentTerm}s for the selected corpus documents from their term vectors.
 *
 * <p>For each selected document, iterates its term vector, applies the white-list and
 * stopword filters, optionally gathers per-occurrence positions/offsets, computes a
 * z-score against the document's type-count mean/std-dev, and keeps up to
 * {@code perDocLimit} terms per document; the surviving terms are merged into
 * {@code this.terms}. Also increments the {@code total} counter for each kept term.
 *
 * @param corpusMapper maps between stored document ids and Lucene ids
 * @param stopwords    terms to exclude from the results
 * @throws IOException if term vectors cannot be read
 */
private void runAllTermsFromDocumentTermVectors(CorpusMapper corpusMapper, Keywords stopwords) throws IOException {
    // global queue across all documents; sized to hold the requested page (start + limit)
    FlexibleQueue<DocumentTerm> queue = new FlexibleQueue<DocumentTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Corpus corpus = corpusMapper.getCorpus();
    CorpusTermMinimalsDB corpusTermMinimalsDB = CorpusTermMinimalsDB.getInstance(corpusMapper, tokenType);
    TermsEnum termsEnum = null; // reused across documents via terms.iterator()
    Bits docIdBitSet = corpusMapper
            .getBitSetFromDocumentIds(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    Bits allBits = new Bits.MatchAllBits(reader.numDocs()); // NOTE(review): unused — candidate for removal
    int[] tokenCounts = corpus.getTokensCounts(tokenType);
    float[] typesCountMeans = corpus.getTypesCountMeans(tokenType);
    float[] typesCountStdDev = corpus.getTypesCountStdDevs(tokenType);
    for (int doc : corpusMapper.getLuceneIds()) {
        // skip documents not selected by the request parameters
        if (!docIdBitSet.get(doc)) {
            continue;
        }
        // per-document queue so each document contributes at most perDocLimit terms
        FlexibleQueue<DocumentTerm> docQueue = new FlexibleQueue<DocumentTerm>(comparator,
                limit * docIdBitSet.length());
        int documentPosition = corpusMapper.getDocumentPositionFromLuceneId(doc);
        String docId = corpusMapper.getDocumentIdFromLuceneId(doc);
        float mean = typesCountMeans[documentPosition];
        float stdDev = typesCountStdDev[documentPosition];
        int totalTokensCount = tokenCounts[documentPosition];
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    String termString = bytesRef.utf8ToString();
                    // a non-empty white-list restricts results to its keywords only
                    if (whiteList.isEmpty() == false && whiteList.isKeyword(termString) == false) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    if (!stopwords.isKeyword(termString)) {
                        CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(termString);
                        int[] positions = null;
                        int[] offsets = null;
                        int freq;
                        if (isNeedsPositions || isNeedsOffsets) {
                            // walk the (single-document) postings to record each occurrence
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            postingsEnum.nextDoc();
                            freq = postingsEnum.freq();
                            positions = new int[freq];
                            offsets = new int[freq];
                            for (int i = 0; i < freq; i++) {
                                positions[i] = postingsEnum.nextPosition();
                                offsets[i] = postingsEnum.startOffset();
                            }
                        } else {
                            // cheaper path: frequency only, no per-occurrence data
                            freq = (int) termsEnum.totalTermFreq();
                        }
                        if (freq >= minRawFreq) {
                            total++;
                            // z-score of this term's frequency within the document; NaN when std-dev is 0
                            float zscore = stdDev != 0 ? ((freq - mean) / stdDev) : Float.NaN;
                            DocumentTerm documentTerm = new DocumentTerm(documentPosition, docId, termString,
                                    freq, totalTokensCount, zscore, positions, offsets, corpusTermMinimal);
                            docQueue.offer(documentTerm);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
        // promote at most perDocLimit best terms from this document into the global queue
        int i = 0;
        for (DocumentTerm docTerm : docQueue.getOrderedList()) {
            queue.offer(docTerm);
            if (++i >= perDocLimit) {
                break;
            }
        }
    }
    corpusTermMinimalsDB.close();
    this.terms.addAll(queue.getOrderedList(start));
}