List of usage examples for org.apache.lucene.index LeafReader getTermVector
public final Terms getTermVector(int docID, String field) throws IOException
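A minimal standalone sketch of the call itself (the index path and the "content" field name are hypothetical; the field must have been indexed with term vectors, otherwise getTermVector returns null):

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermVectorDump {
    public static void main(String[] args) throws IOException {
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            // use the first leaf; a real application would walk reader.leaves()
            LeafReader leafReader = reader.leaves().get(0).reader();
            Terms terms = leafReader.getTermVector(0, "content"); // doc 0, hypothetical field "content"
            if (terms == null) {
                return; // no term vector stored for this document/field
            }
            TermsEnum termsEnum = terms.iterator();
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                // for a single-document term vector, totalTermFreq() is the within-document frequency
                System.out.println(term.utf8ToString() + " -> " + termsEnum.totalTermFreq());
            }
        }
    }
}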
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
private void updateWeights(LeafReader leafReader, int docId, Boolean assignedClass,
        SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
    TermsEnum cte = textTerms.iterator();
    // get the doc term vectors
    Terms terms = leafReader.getTermVector(docId, textFieldName);
    if (terms == null) {
        throw new IOException("term vectors must be stored for field " + textFieldName);
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        cte.seekExact(term);
        if (assignedClass != null) {
            long termFreqLocal = termsEnum.totalTermFreq();
            // update weights
            Long previousValue = Util.get(fst, term);
            String termString = term.utf8ToString();
            weights.put(termString, previousValue + modifier * termFreqLocal);
        }
    }
    if (updateFST) {
        updateFST(weights);
    }
}
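The IOException above is thrown when the field was not indexed with term vectors. A minimal, hypothetical indexing-time sketch that stores term vectors so getTermVector returns a non-null Terms:

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;

public class TermVectorIndexingSketch {
    public static void main(String[] args) throws IOException {
        // store term vectors for the hypothetical "text" field so getTermVector(docId, "text") is non-null
        FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
        fieldType.setStoreTermVectors(true);
        fieldType.freeze();

        try (IndexWriter writer = new IndexWriter(new RAMDirectory(),
                new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new Field("text", "some analyzed text", fieldType));
            writer.addDocument(doc);
        }
    }
}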
From source file:main.BM25VASimilarity.java
License:Apache License
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
    BM25Stats bm25stats = (BM25Stats) stats;
    LeafReader reader = context.reader();
    //int docCount = reader.getDocCount(bm25stats.field);
    // BVA calculated for each document
    float[] BVA = new float[reader.maxDoc()];
    float sumOfAverageTermFrequencies = 0.0f;
    // length of each doc
    float[] Ld = new float[reader.maxDoc()];
    // the number of unique terms in the doc
    float[] Td = new float[reader.maxDoc()];
    NumericDocValues norms = reader.getNormValues(bm25stats.field);
    // int nulldocs = 0;
    for (int i = 0; i < reader.maxDoc(); i++) {
        Terms terms = reader.getTermVector(i, bm25stats.field);
        // norm should be the decoded length of doc d, Ld
        float norm = norms == null ? k1 : bm25stats.cache[(byte) norms.get(i) & 0xFF];
        Ld[i] = norm;
        // using terms.size() returns Td, the number of unique terms in the doc
        Td[i] = terms.size();
        // if (terms == null) {
        //     nulldocs++;
        //     continue;
        // }
        float averageTermFrequency = Ld[i] / Td[i];
        sumOfAverageTermFrequencies += averageTermFrequency;
    }
    // calculate mean average term frequency of all documents
    float mavgtf = sumOfAverageTermFrequencies / reader.maxDoc();
    // calculate B_VA for each document
    for (int i = 0; i < reader.maxDoc(); i++) {
        BVA[i] = 1 / (mavgtf * mavgtf) * Ld[i] / Td[i] + (1 - 1 / mavgtf) * Ld[i] / bm25stats.avgdl;
    }
    // System.out.println("Null docs: " + nulldocs);
    // System.out.println("Max docs: " + reader.maxDoc());
    // System.out.println("Doc count: " + reader.getDocCount(bm25stats.field));
    // System.out.println("max docs minus null docs: " + (reader.maxDoc() - nulldocs));
    return new BM25DocScorer(bm25stats, BVA);
}
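For reference, the per-document length-normalization factor computed in the second loop can be written as follows (reconstructed from the code above: Ld is the decoded document length, Td the number of unique terms taken from the term vector, mavgtf the mean average term frequency over all documents, and avgdl the average document length):

B_{VA}(d) = \frac{1}{\mathit{mavgtf}^{\,2}} \cdot \frac{L_d}{T_d}
          + \left(1 - \frac{1}{\mathit{mavgtf}}\right) \cdot \frac{L_d}{\mathit{avgdl}}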
From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java
License:Open Source License
private void fillTermsOfInterest(LeafReader LeafReader, int luceneDoc, Map<Integer, TermInfo> termsOfInterest)
        throws IOException {
    // fill in terms of interest
    Terms terms = LeafReader.getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    while (true) {
        BytesRef term = termsEnum.next();
        if (term != null) {
            String termString = term.utf8ToString();
            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
            if (postingsEnum != null) {
                postingsEnum.nextDoc();
                for (int i = 0, len = postingsEnum.freq(); i < len; i++) {
                    int pos = postingsEnum.nextPosition();
                    if (termsOfInterest.containsKey(pos)) {
                        termsOfInterest.put(pos, new TermInfo(termString, postingsEnum.startOffset(),
                                postingsEnum.endOffset(), pos, 1));
                    }
                }
            }
        } else {
            break;
        }
    }
}
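termsEnum.postings(null, PostingsEnum.OFFSETS) only reports positions and offsets when they were recorded in the term vector at index time. A hypothetical FieldType configuration that satisfies this example:

import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

public class TermVectorOffsetsFieldType {
    // Hypothetical field configuration: term vectors with positions and offsets, so a
    // term-vector PostingsEnum can report nextPosition(), startOffset() and endOffset().
    public static FieldType withPositionsAndOffsets() {
        FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
        fieldType.setStoreTermVectorOffsets(true);
        fieldType.freeze();
        return fieldType;
    }
}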
From source file:org.voyanttools.trombone.tool.corpus.CorpusTerms.java
License:Open Source License
private FlexibleQueue<CorpusTerm> runAllTermsWithDistributionsDocumentTermVectors(CorpusMapper corpusMapper,
        Keywords stopwords) throws IOException {
    FlexibleQueue<CorpusTerm> queue = new FlexibleQueue<CorpusTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Map<String, Map<Integer, Integer>> rawFreqsMap = new HashMap<String, Map<Integer, Integer>>();
    TermsEnum termsEnum = null;
    for (int doc : corpusMapper.getLuceneIds()) {
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    String term = bytesRef.utf8ToString();
                    if (!stopwords.isKeyword(term)) {
                        if (!rawFreqsMap.containsKey(term)) {
                            rawFreqsMap.put(term, new HashMap<Integer, Integer>());
                        }
                        int rawF = (int) termsEnum.totalTermFreq();
                        if (rawF > minRawFreq) {
                            rawFreqsMap.get(term).put(corpusMapper.getDocumentPositionFromLuceneId(doc), rawF);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
    }
    int corpusSize = corpusMapper.getCorpus().size();
    int[] tokensCounts = corpusMapper.getCorpus().getTokensCounts(tokenType);
    int totalCorpusTokens = corpusMapper.getCorpus().getTokensCount(tokenType);
    int bins = parameters.getParameterIntValue("bins", corpusSize);
    int[] documentRawFreqs;
    float[] documentRelativeFreqs;
    int documentPosition;
    int termFreq;
    int freq;
    for (Map.Entry<String, Map<Integer, Integer>> termsMap : rawFreqsMap.entrySet()) {
        String termString = termsMap.getKey();
        documentRawFreqs = new int[corpusSize];
        documentRelativeFreqs = new float[corpusSize];
        termFreq = 0;
        for (Map.Entry<Integer, Integer> docsMap : termsMap.getValue().entrySet()) {
            documentPosition = docsMap.getKey();
            freq = docsMap.getValue();
            termFreq += freq;
            totalTokens += freq;
            documentRawFreqs[documentPosition] = freq;
            documentRelativeFreqs[documentPosition] = (float) freq / tokensCounts[documentPosition];
        }
        //total++;
        if (termFreq > minRawFreq) {
            CorpusTerm corpusTerm = new CorpusTerm(termString, termFreq, totalCorpusTokens,
                    termsMap.getValue().size(), corpusSize, documentRawFreqs, documentRelativeFreqs, bins);
            offer(queue, corpusTerm);
        }
        // queue.offer(new CorpusTerm(termString, termFreq, totalTokens, termsMap.getValue().size(), corpusSize, documentRawFreqs, documentRelativeFreqs, bins));
    }
    return queue;
}
From source file:org.voyanttools.trombone.tool.corpus.DocumentCollocates.java
License:Open Source License
private FlexibleQueue<DocumentCollocate> getCollocates(LeafReader LeafReader, int luceneDoc, int corpusDocIndex,
        int lastToken, List<DocumentSpansData> documentSpansData, Keywords stopwords) throws IOException {

    Map<Integer, TermInfo> termsOfInterest = getTermsOfInterest(LeafReader, luceneDoc, lastToken,
            documentSpansData, true);

    Map<String, Map<String, AtomicInteger>> mapOfTermsMap = new HashMap<String, Map<String, AtomicInteger>>();
    Map<String, Integer> queryStringFrequencyMap = new HashMap<String, Integer>();

    // this keeps track of the terms we want to lookup total document frequencies
    Map<String, Integer> stringsOfInterestMap = new HashMap<String, Integer>();

    // Map<String, Map<String, Integer>>
    for (DocumentSpansData dsd : documentSpansData) {
        Map<String, AtomicInteger> termsMap = new HashMap<String, AtomicInteger>();
        queryStringFrequencyMap.put(dsd.queryString, dsd.spansData.length);
        int contextTotalTokens = 0;
        for (int[] data : dsd.spansData) {
            int keywordstart = data[0];
            int keywordend = data[1];

            int leftstart = keywordstart - context;
            if (leftstart < 0) {
                leftstart = 0;
            }
            for (int i = leftstart; i < keywordstart - 1; i++) {
                contextTotalTokens++;
                String term = termsOfInterest.get(i).getText();
                if (stopwords.isKeyword(term)) {
                    continue;
                }
                if (collocatesWhitelist.isEmpty() == false && collocatesWhitelist.isKeyword(term) == false) {
                    continue;
                }
                stringsOfInterestMap.put(term, 0);
                if (termsMap.containsKey(term)) {
                    termsMap.get(term).getAndIncrement();
                } else {
                    termsMap.put(term, new AtomicInteger(1));
                }
            }

            for (int i = keywordstart; i < keywordend; i++) {
                String term = termsOfInterest.get(i).getText();
                if (stopwords.isKeyword(term)) {
                    continue;
                }
                if (collocatesWhitelist.isEmpty() == false && collocatesWhitelist.isKeyword(term) == false) {
                    continue;
                }
                stringsOfInterestMap.put(term, 0);
            }

            int rightend = keywordend + context;
            if (rightend > lastToken) {
                rightend = lastToken;
            }
            for (int i = keywordend; i < rightend; i++) {
                contextTotalTokens++;
                String term = termsOfInterest.get(i).getText();
                if (stopwords.isKeyword(term)) {
                    continue;
                }
                if (collocatesWhitelist.isEmpty() == false && collocatesWhitelist.isKeyword(term) == false) {
                    continue;
                }
                stringsOfInterestMap.put(term, 0);
                if (termsMap.containsKey(term)) {
                    termsMap.get(term).getAndIncrement();
                } else {
                    termsMap.put(term, new AtomicInteger(1));
                }
            }
        }
        mapOfTermsMap.put(dsd.queryString, termsMap);
    }

    // gather document frequency for strings of interest
    int documentTotalTokens = 0;
    Terms terms = LeafReader.getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    while (true) {
        BytesRef term = termsEnum.next();
        if (term != null) {
            String termString = term.utf8ToString();
            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.FREQS);
            postingsEnum.nextDoc();
            int freq = postingsEnum.freq();
            documentTotalTokens += freq;
            if (stringsOfInterestMap.containsKey(termString)) {
                stringsOfInterestMap.put(termString, freq);
            }
        } else {
            break;
        }
    }

    FlexibleQueue<DocumentCollocate> documentCollocatesQueue = new FlexibleQueue(comparator, limit);
    for (Map.Entry<String, Map<String, AtomicInteger>> keywordMapEntry : mapOfTermsMap.entrySet()) {
        String keyword = keywordMapEntry.getKey();
        int keywordContextRawFrequency = queryStringFrequencyMap.get(keyword);
        Map<String, AtomicInteger> termsMap = keywordMapEntry.getValue();

        // once through to determine contextTotalTokens
        int contextTotalTokens = 0;
        for (Map.Entry<String, AtomicInteger> termsMapEntry : termsMap.entrySet()) {
            contextTotalTokens += termsMapEntry.getValue().intValue();
        }

        /*
         * public DocumentCollocate(int corpusDocumentIndex, String keyword, String term,
         *         int keywordContextRawFrequency, int termContextRawFrequency,
         *         int termDocumentRawFrequency, int totalContextTokens, int totalDocumentTokens) {
         */

        // and now to create document collocate objects
        for (Map.Entry<String, AtomicInteger> termsMapEntry : termsMap.entrySet()) {
            String term = termsMapEntry.getKey();
            int termDocumentRawFrequency = stringsOfInterestMap.get(term);
            int termContextRawFrequency = termsMapEntry.getValue().intValue();
            DocumentCollocate documentCollocate = new DocumentCollocate(corpusDocIndex, keyword, term,
                    keywordContextRawFrequency, termContextRawFrequency, termDocumentRawFrequency,
                    contextTotalTokens, documentTotalTokens);
            // DocumentCollocate documentCollocate = new DocumentCollocate(corpusDocIndex, keyword, term, contextTermRawFrequency, ((float) contextTermRawFrequency)/contextTotalTokens, documentTermRawFrequency, ((float) documentTermRawFrequency)/documentTotalTokens);
            documentCollocatesQueue.offer(documentCollocate);
        }
    }
    return documentCollocatesQueue;
}
From source file:org.voyanttools.trombone.tool.corpus.DocumentTerms.java
License:Open Source License
private void runAllTermsFromDocumentTermVectors(CorpusMapper corpusMapper, Keywords stopwords) throws IOException {
    FlexibleQueue<DocumentTerm> queue = new FlexibleQueue<DocumentTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Corpus corpus = corpusMapper.getCorpus();
    CorpusTermMinimalsDB corpusTermMinimalsDB = CorpusTermMinimalsDB.getInstance(corpusMapper, tokenType);
    TermsEnum termsEnum = null;
    Bits docIdBitSet = corpusMapper
            .getBitSetFromDocumentIds(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    Bits allBits = new Bits.MatchAllBits(reader.numDocs());
    int[] tokenCounts = corpus.getTokensCounts(tokenType);
    float[] typesCountMeans = corpus.getTypesCountMeans(tokenType);
    float[] typesCountStdDev = corpus.getTypesCountStdDevs(tokenType);
    for (int doc : corpusMapper.getLuceneIds()) {
        if (!docIdBitSet.get(doc)) {
            continue;
        }
        FlexibleQueue<DocumentTerm> docQueue = new FlexibleQueue<DocumentTerm>(comparator,
                limit * docIdBitSet.length());
        int documentPosition = corpusMapper.getDocumentPositionFromLuceneId(doc);
        String docId = corpusMapper.getDocumentIdFromLuceneId(doc);
        float mean = typesCountMeans[documentPosition];
        float stdDev = typesCountStdDev[documentPosition];
        int totalTokensCount = tokenCounts[documentPosition];
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    String termString = bytesRef.utf8ToString();
                    if (whiteList.isEmpty() == false && whiteList.isKeyword(termString) == false) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    if (!stopwords.isKeyword(termString)) {
                        CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(termString);
                        int[] positions = null;
                        int[] offsets = null;
                        int freq;
                        if (isNeedsPositions || isNeedsOffsets) {
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            postingsEnum.nextDoc();
                            freq = postingsEnum.freq();
                            positions = new int[freq];
                            offsets = new int[freq];
                            for (int i = 0; i < freq; i++) {
                                positions[i] = postingsEnum.nextPosition();
                                offsets[i] = postingsEnum.startOffset();
                            }
                        } else {
                            freq = (int) termsEnum.totalTermFreq();
                        }
                        if (freq >= minRawFreq) {
                            total++;
                            float zscore = stdDev != 0 ? ((freq - mean) / stdDev) : Float.NaN;
                            DocumentTerm documentTerm = new DocumentTerm(documentPosition, docId, termString,
                                    freq, totalTokensCount, zscore, positions, offsets, corpusTermMinimal);
                            docQueue.offer(documentTerm);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
        int i = 0;
        for (DocumentTerm docTerm : docQueue.getOrderedList()) {
            queue.offer(docTerm);
            if (++i >= perDocLimit) {
                break;
            }
        }
    }
    corpusTermMinimalsDB.close();
    this.terms.addAll(queue.getOrderedList(start));
}