Example usage for org.apache.lucene.index TermsEnum next

List of usage examples for org.apache.lucene.index TermsEnum next

Introduction

In this page you can find the example usage for org.apache.lucene.index TermsEnum next.

Prototype

BytesRef next() throws IOException;

Source Link

Document

Advances the iterator to the next BytesRef in the enumeration and returns it, or null when the iteration is exhausted.

Usage

From source file:SimpleNaiveBayesClassifier.java

License:Apache License

/**
 * Calculate probabilities for all classes for a given input text
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument)
        throws IOException {
    final List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();

    final String[] tokenizedText = tokenize(inputDocument);
    final int docsWithClassSize = countDocsWithClass();

    final Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    final TermsEnum classesEnum = classes.iterator();
    for (BytesRef classValue = classesEnum.next(); classValue != null; classValue = classesEnum.next()) {
        if (classValue.length == 0) {
            continue; // skip the empty class value
        }
        final Term classTerm = new Term(this.classFieldName, classValue);
        final double logScore = calculateLogPrior(classTerm, docsWithClassSize)
                + calculateLogLikelihood(tokenizedText, classTerm, docsWithClassSize);
        assignedClasses.add(new ClassificationResult<>(classTerm.bytes(), logScore));
    }

    // normalization: transform the raw log scores into a 0-1 range
    return normClassificationResults(assignedClasses);
}

From source file:VocabDumper.java

/**
 * Dumps every term of the configured English/Hindi fields to stdout,
 * skipping terms that contain a '#' character.
 *
 * @throws IOException if the index cannot be opened or read
 */
void printWords() throws IOException {
    // try-with-resources: the original version leaked the IndexReader on every call
    try (IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir))) {
        Fields fields = MultiFields.getFields(indexReader);

        String[] fieldNames = { MSIRDoc.FIELD_TITLE_EN, MSIRDoc.FIELD_EN, MSIRDoc.FIELD_TITLE_HN,
                MSIRDoc.FIELD_HN };

        for (String fieldName : fieldNames) {
            Terms terms = fields.terms(fieldName);
            if (terms == null) {
                continue; // field absent from this index — the original would have thrown NPE
            }
            TermsEnum iterator = terms.iterator(null);
            BytesRef byteRef;
            while ((byteRef = iterator.next()) != null) {
                // Lucene stores terms as UTF-8; decoding with the platform default
                // charset (the original `new String(bytes, off, len)`) corrupts
                // non-ASCII terms, which matters for the Hindi fields here.
                String term = byteRef.utf8ToString();
                if (term.indexOf('#') == -1) {
                    System.out.println(term);
                }
            }
        }
    }
}

From source file:SimpleNaiveBayesDocumentClassifier.java

License:Apache License

/**
 * Scores every class term in {@code classFieldName} against the given seed
 * document and returns the normalized per-class results.
 * <p>
 * For each class, the score is accumulated over all configured text fields:
 * each field contributes {@code logPrior + logLikelihood * fieldBoost} per
 * analyzed token array of that field.
 *
 * @param inputDocument the seed document whose analyzed field tokens are scored
 * @return one {@code ClassificationResult} per class, normalized to a 0-1 range
 * @throws IOException if reading terms or statistics from the index fails
 */
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    // field name -> one String[] of tokens per (multi-valued) field instance
    Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>();
    // field name -> boost applied to that field's likelihood contribution
    Map<String, Float> fieldName2boost = new LinkedHashMap<>();
    Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum classesEnum = classes.iterator();
    BytesRef c;

    // populates the two maps above from the seed document's analyzed content
    analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost);

    int docsWithClassSize = countDocsWithClass();
    while ((c = classesEnum.next()) != null) {
        double classScore = 0;
        Term term = new Term(this.classFieldName, c);
        for (String fieldName : textFieldNames) {
            List<String[]> tokensArrays = fieldName2tokensArray.get(fieldName);
            double fieldScore = 0;
            for (String[] fieldTokensArray : tokensArrays) {
                // NOTE: the boost multiplies only the likelihood term, not the
                // prior, and the prior is re-added once per token array.
                fieldScore += calculateLogPrior(term, docsWithClassSize)
                        + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize)
                                * fieldName2boost.get(fieldName);
            }
            classScore += fieldScore;
        }
        assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore));
    }
    // normalization: transform the raw scores into a 0-1 range
    return normClassificationResults(assignedClasses);
}

From source file:alix.lucene.MoreLikeThis.java

License:Apache License

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder charsBuilder = new CharsRefBuilder();
    for (BytesRef termBytes = termsEnum.next(); termBytes != null; termBytes = termsEnum.next()) {
        charsBuilder.copyUTF8Bytes(termBytes);
        final String term = charsBuilder.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        final int freq = (int) termsEnum.totalTermFreq();

        // accumulate the frequency, creating the counter on first sight
        Int counter = termFreqMap.get(term);
        if (counter != null) {
            counter.x += freq;
        } else {
            counter = new Int();
            counter.x = freq;
            termFreqMap.put(term, counter);
        }
    }
}

From source file:alix.lucene.MoreLikeThis.java

License:Apache License

/**
 * Print a term vector for debugging.
 *
 * @param vector List of terms and their frequencies for a doc/field
 * @throws IOException if iterating the term vector fails
 */
@SuppressWarnings("unused")
private void print(Terms vector) throws IOException {
    if (vector == null) {
        return;
    }
    // A term vector acts like a single-document inverted index, so
    // totalTermFreq() here is the term's frequency within this one document.
    final HashMap<String, Long> freqByTerm = new HashMap<String, Long>();
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder charsBuilder = new CharsRefBuilder();
    for (BytesRef termBytes = termsEnum.next(); termBytes != null; termBytes = termsEnum.next()) {
        charsBuilder.copyUTF8Bytes(termBytes);
        freqByTerm.put(charsBuilder.toString(), termsEnum.totalTermFreq());
    }
    @SuppressWarnings("unchecked")
    final Map.Entry<String, Long>[] entries = freqByTerm.entrySet().toArray(new Map.Entry[0]);
    // order by descending frequency
    Arrays.sort(entries, new Comparator<Map.Entry<String, Long>>() {
        @Override
        public int compare(Map.Entry<String, Long> left, Map.Entry<String, Long> right) {
            return right.getValue().compareTo(left.getValue());
        }
    });
    for (Map.Entry<String, Long> entry : entries) {
        System.out.print(entry.getKey() + ":" + entry.getValue() + " ");
    }
    System.out.println();
}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.NewsItemToTermsBolt.java

License:Apache License

/**
 * Accumulates tf-idf-weighted term scores from the term vector of the
 * document whose "id" field equals {@code id} into {@code termMap}.
 * Terms that are very rare or very common across the index are skipped.
 *
 * @param reader   index reader used for term vectors and statistics
 * @param searcher searcher used to locate the document by id
 * @param termMap  term -> accumulated weight; updated in place
 * @param id       value of the "id" field identifying the document
 * @param field    field whose term vector is processed
 * @param weight   multiplier applied to each term's tf-idf score
 * @throws IOException if reading from the index fails
 */
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap,
        String id, String field, double weight) throws IOException {
    Query query = new TermQuery(new Term("id", id));
    TopDocs topdocs = searcher.search(query, 1);

    if (topdocs.totalHits > 0) {
        int docNr = topdocs.scoreDocs[0].doc;
        Terms vector = reader.getTermVector(docNr, field);
        if (vector != null) {
            // Frequency bounds are loop-invariant: hoisted out of the term loop.
            // Use floating-point division so maxFreq is not silently truncated
            // by integer division (the original computed numDocs()/3 in int).
            double minFreq = reader.numDocs() * 0.0001;
            double maxFreq = reader.numDocs() / 3.0;
            TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int docFreq = reader.docFreq(new Term(field, text));
                // ignore really rare terms and really common terms
                if (docFreq > minFreq && docFreq < maxFreq) {
                    double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field);
                    double idf = Math.log((double) reader.numDocs() / docFreq);
                    if (!Double.isInfinite(idf)) {
                        // single-lookup accumulation instead of containsKey + get
                        Double previous = termMap.get(term);
                        double base = (previous == null) ? 0 : previous;
                        termMap.put(term, base + tf * idf * weight);
                    }
                }
            }
        } else {
            logger.debug("no term available for doc=" + docNr + " and field=" + field);
        }
    } else {
        logger.warn("No documents found with id=" + id);
    }
}

From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.NewsItemToTermsBolt.java

License:Apache License

/**
 * Accumulates tf-idf-weighted term scores from the term vector of the
 * document whose numeric "id" field equals {@code id} into {@code termMap}.
 * The frequency filter is currently wide open (0 .. Double.MAX_VALUE), so
 * effectively every term with docFreq &gt; 0 passes.
 *
 * @param reader   index reader used for term vectors and statistics
 * @param searcher searcher used to locate the document by id
 * @param termMap  term -> accumulated weight; updated in place
 * @param id       value of the numeric "id" field identifying the document
 * @param field    field whose term vector is processed
 * @param weight   multiplier applied to each term's tf-idf score
 * @throws IOException if reading from the index fails
 */
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, long id,
        String field, double weight) throws IOException {
    Query query = NumericRangeQuery.newLongRange("id", id, id, true, true);
    TopDocs topdocs = searcher.search(query, 1);

    if (topdocs.totalHits > 0) {
        int docNr = topdocs.scoreDocs[0].doc;
        Terms vector = reader.getTermVector(docNr, field);
        if (vector != null) {
            // Loop-invariant bounds hoisted out of the term loop; these were
            // redeclared on every iteration in the original.
            final double minFreq = 0;
            final double maxFreq = Double.MAX_VALUE;
            TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int docFreq = reader.docFreq(new Term(field, text));
                if (docFreq > minFreq && docFreq < maxFreq) {
                    double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field);
                    double idf = Math.log((double) reader.numDocs() / docFreq);
                    if (!Double.isInfinite(idf)) {
                        // single-lookup accumulation instead of containsKey + get
                        Double previous = termMap.get(term);
                        double base = (previous == null) ? 0 : previous;
                        termMap.put(term, base + tf * idf * weight);
                    }
                }
            }
        } else {
            logger.debug("no term available for doc=" + docNr + " and field=" + field);
        }
    } else {
        logger.warn("No documents found with id=" + id);
    }
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License

/**
 * Builds a map from each term shared by both indexes to the array of D1
 * entity ids whose documents contain that term.
 *
 * @param d1Index the index being parsed
 * @param d2Index the second index, used to drop terms it never contains
 * @return term -> D1 entity ids, or null if reading the index fails
 */
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) {
    try {
        final int[] documentIds = getDocumentIds(d1Index);
        final Map<String, int[]> hashedBlocks = new HashMap<>();
        final Fields fields = MultiFields.getFields(d1Index);
        for (String field : fields) {
            final TermsEnum termsEnum = fields.terms(field).iterator();
            for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                // only keep terms that also occur in the second index
                if (d2Index.docFreq(new Term(field, text)) == 0) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                final PostingsEnum postings = MultiFields.getTermDocsEnum(d1Index, field, text);
                for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                    entityIds.add(documentIds[doc]);
                }

                hashedBlocks.put(text.utf8ToString(), Converter.convertCollectionToArray(entityIds));
            }
        }
        return hashedBlocks;
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        return null;
    }
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License

/**
 * Walks the second index and, for every term already present in
 * {@code hashedBlocks}, emits a bilateral block pairing the D1 entity ids
 * with the D2 entity ids containing that term.
 *
 * @param d2Index      the second index being parsed
 * @param hashedBlocks term -> D1 entity ids produced by parseD1Index
 */
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) {
    try {
        final int[] documentIds = getDocumentIds(d2Index);
        final Fields fields = MultiFields.getFields(d2Index);
        for (String field : fields) {
            final TermsEnum termsEnum = fields.terms(field).iterator();
            for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                final String termString = text.utf8ToString();
                // only terms already seen in the first index form blocks
                if (!hashedBlocks.containsKey(termString)) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                final PostingsEnum postings = MultiFields.getTermDocsEnum(d2Index, field, text);
                for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                    entityIds.add(documentIds[doc]);
                }

                final int[] d2Entities = Converter.convertCollectionToArray(entityIds);
                final int[] d1Entities = hashedBlocks.get(termString);
                blocks.add(new BilateralBlock(d1Entities, d2Entities));
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License

/**
 * Builds a unilateral block for every term that occurs in at least two
 * documents of the given index.
 *
 * @param d1Index the index being parsed
 */
protected void parseIndex(IndexReader d1Index) {
    try {
        final int[] documentIds = getDocumentIds(d1Index);
        final Fields fields = MultiFields.getFields(d1Index);
        for (String field : fields) {
            final TermsEnum termsEnum = fields.terms(field).iterator();
            for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                // singleton terms cannot produce comparisons: skip them
                if (termsEnum.docFreq() < 2) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                final PostingsEnum postings = MultiFields.getTermDocsEnum(d1Index, field, text);
                for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                    entityIds.add(documentIds[doc]);
                }

                blocks.add(new UnilateralBlock(Converter.convertCollectionToArray(entityIds)));
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}