List of usage examples for the `org.apache.lucene.index.TermsEnum.next()` method
BytesRef next() throws IOException;
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/** * Calculate probabilities for all classes for a given input text * @param inputDocument the input text as a {@code String} * @return a {@code List} of {@code ClassificationResult}, one for each existing class * @throws IOException if assigning probabilities fails *//*from w w w.ja v a 2 s .co m*/ protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException { List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>(); Terms classes = MultiFields.getTerms(leafReader, classFieldName); TermsEnum classesEnum = classes.iterator(); BytesRef next; String[] tokenizedText = tokenize(inputDocument); int docsWithClassSize = countDocsWithClass(); while ((next = classesEnum.next()) != null) { if (next.length > 0) { Term term = new Term(this.classFieldName, next); double clVal = calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(tokenizedText, term, docsWithClassSize); assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal)); } } // normalization; the values transforms to a 0-1 range return normClassificationResults(assignedClasses); }
From source file:VocabDumper.java
void printWords() throws IOException { IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir)); Fields fields = MultiFields.getFields(indexReader); String[] fieldNames = { MSIRDoc.FIELD_TITLE_EN, MSIRDoc.FIELD_EN, MSIRDoc.FIELD_TITLE_HN, MSIRDoc.FIELD_HN };//from w w w. j av a2s . c om for (String fieldName : fieldNames) { Terms terms = fields.terms(fieldName); TermsEnum iterator = terms.iterator(null); BytesRef byteRef = null; while ((byteRef = iterator.next()) != null) { String term = new String(byteRef.bytes, byteRef.offset, byteRef.length); if (term.indexOf('#') == -1) System.out.println(term); } } }
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Scores the given document against every class term in the index, weighting
 * each text field's likelihood by its boost, and returns the normalized results.
 *
 * @param inputDocument the document to classify
 * @return one {@code ClassificationResult} per class, normalized to the 0-1 range
 * @throws IOException if the index cannot be read
 */
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument)
        throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>();
    Map<String, Float> fieldName2boost = new LinkedHashMap<>();
    Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum classesEnum = classes.iterator();
    analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost);
    int docsWithClassSize = countDocsWithClass();
    BytesRef classBytes;
    while ((classBytes = classesEnum.next()) != null) {
        Term term = new Term(this.classFieldName, classBytes);
        double classScore = 0;
        for (String fieldName : textFieldNames) {
            double fieldScore = 0;
            for (String[] fieldTokensArray : fieldName2tokensArray.get(fieldName)) {
                // the likelihood is scaled by the per-field boost; the prior is not
                fieldScore += calculateLogPrior(term, docsWithClassSize)
                        + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize)
                                * fieldName2boost.get(fieldName);
            }
            classScore += fieldScore;
        }
        assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore));
    }
    return normClassificationResults(assignedClasses);
}
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field *//*from w ww . j ava 2 s . co m*/ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Print a term vector for debugging//from w ww . j av a 2 s . c o m * * @param vector List of terms and their frequencies for a doc/field * @throws IOException */ @SuppressWarnings("unused") private void print(Terms vector) throws IOException { if (vector == null) return; final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; // termsEnum.docFreq() = 1, // The returned Fields instance acts like a single-document inverted index HashMap<String, Long> map = new HashMap<String, Long>(); while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); map.put(spare.toString(), termsEnum.totalTermFreq()); } @SuppressWarnings("unchecked") Map.Entry<String, Long>[] a = map.entrySet().toArray(new Map.Entry[0]); Arrays.sort(a, new Comparator<Map.Entry<String, Long>>() { public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) { return o2.getValue().compareTo(o1.getValue()); } }); for (Map.Entry<String, Long> e : a) { System.out.print(e.getKey() + ":" + e.getValue() + " "); } System.out.println(); }
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.NewsItemToTermsBolt.java
License:Apache License
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, String id, String field, double weight) throws IOException { Query query = new TermQuery(new Term("id", id)); TopDocs topdocs = searcher.search(query, 1); if (topdocs.totalHits > 0) { int docNr = topdocs.scoreDocs[0].doc; Terms vector = reader.getTermVector(docNr, field); if (vector != null) { TermsEnum termsEnum; termsEnum = vector.iterator(TermsEnum.EMPTY); BytesRef text;/*from w ww . ja v a 2 s.c om*/ while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int docFreq = reader.docFreq(new Term(field, text)); // ignore really rare terms and really common terms double minFreq = reader.numDocs() * 0.0001; double maxFreq = reader.numDocs() / 3; //double minFreq = 0; //double maxFreq = Double.MAX_VALUE; if (docFreq > minFreq && docFreq < maxFreq) { double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field); double idf = Math.log((double) reader.numDocs() / docFreq); if (!Double.isInfinite(idf)) { if (!termMap.containsKey(term)) { termMap.put(term, tf * idf * weight); } else { termMap.put(term, termMap.get(term) + tf * idf * weight); } } } } } else { logger.debug("no term available for doc=" + docNr + " and field=" + field); } } else { logger.warn("No documents found with id=" + id); } }
From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.NewsItemToTermsBolt.java
License:Apache License
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, long id, String field, double weight) throws IOException { Query query = NumericRangeQuery.newLongRange("id", id, id, true, true); TopDocs topdocs = searcher.search(query, 1); if (topdocs.totalHits > 0) { int docNr = topdocs.scoreDocs[0].doc; Terms vector = reader.getTermVector(docNr, field); if (vector != null) { TermsEnum termsEnum; termsEnum = vector.iterator(TermsEnum.EMPTY); BytesRef text;/*from w w w. j a v a2 s. c o m*/ while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int docFreq = reader.docFreq(new Term(field, text)); // ignore really rare terms and really common terms //double minFreq = reader.numDocs() * 0.0001; //double maxFreq = reader.numDocs() / 3; double minFreq = 0; double maxFreq = Double.MAX_VALUE; if (docFreq > minFreq && docFreq < maxFreq) { double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field); double idf = Math.log((double) reader.numDocs() / docFreq); if (!Double.isInfinite(idf)) { if (!termMap.containsKey(term)) { termMap.put(term, tf * idf * weight); } else { termMap.put(term, termMap.get(term) + tf * idf * weight); } } } } } else { logger.debug("no term available for doc=" + docNr + " and field=" + field); } } else { logger.warn("No documents found with id=" + id); } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) { try {//from www. java 2 s .c om int[] documentIds = getDocumentIds(d1Index); final Map<String, int[]> hashedBlocks = new HashMap<>(); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { // check whether it is a common term int d2DocFrequency = d2Index.docFreq(new Term(field, text)); if (d2DocFrequency == 0) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); hashedBlocks.put(text.utf8ToString(), idsArray); } } return hashedBlocks; } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); return null; } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) { try {// w ww .j av a2s . c o m int[] documentIds = getDocumentIds(d2Index); Fields fields = MultiFields.getFields(d2Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (!hashedBlocks.containsKey(text.utf8ToString())) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); int[] d1Entities = hashedBlocks.get(text.utf8ToString()); blocks.add(new BilateralBlock(d1Entities, idsArray)); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected void parseIndex(IndexReader d1Index) { try {//from w w w. ja v a 2s . co m int[] documentIds = getDocumentIds(d1Index); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (termsEnum.docFreq() < 2) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); UnilateralBlock block = new UnilateralBlock(idsArray); blocks.add(block); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }