Example usage for org.apache.lucene.util CharsRefBuilder toString

List of usage examples for org.apache.lucene.util CharsRefBuilder toString

Introduction

In this page you can find the example usage for org.apache.lucene.util CharsRefBuilder toString.

Prototype

@Override
    public String toString() 

Source Link

Usage

From source file: alix.lucene.MoreLikeThis.java

License: Apache License

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap.
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum iter = vector.iterator();
    final CharsRefBuilder charsRef = new CharsRefBuilder();
    for (BytesRef bytes = iter.next(); bytes != null; bytes = iter.next()) {
        // decode the UTF-8 term bytes via the reusable char buffer
        charsRef.copyUTF8Bytes(bytes);
        final String term = charsRef.toString();
        // skip terms flagged as noise
        if (isNoiseWord(term)) {
            continue;
        }
        final int freq = (int) iter.totalTermFreq();

        // accumulate this term's frequency into the shared map
        Int counter = termFreqMap.get(term);
        if (counter == null) {
            counter = new Int();
            counter.x = freq;
            termFreqMap.put(term, counter);
        } else {
            counter.x += freq;
        }
    }
}

From source file: alix.lucene.MoreLikeThis.java

License: Apache License

/**
 * Print a term vector for debugging/*  w w w . j  ava2  s . co m*/
 * 
 * @param vector List of terms and their frequencies for a doc/field
 * @throws IOException 
 */
@SuppressWarnings("unused")
private void print(Terms vector) throws IOException {
    if (vector == null)
        return;
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    // termsEnum.docFreq() = 1, 
    // The returned Fields instance acts like a single-document inverted index
    HashMap<String, Long> map = new HashMap<String, Long>();
    while ((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        map.put(spare.toString(), termsEnum.totalTermFreq());
    }
    @SuppressWarnings("unchecked")
    Map.Entry<String, Long>[] a = map.entrySet().toArray(new Map.Entry[0]);
    Arrays.sort(a, new Comparator<Map.Entry<String, Long>>() {
        public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
            return o2.getValue().compareTo(o1.getValue());
        }
    });
    for (Map.Entry<String, Long> e : a) {
        System.out.print(e.getKey() + ":" + e.getValue() + " ");
    }
    System.out.println();
}

From source file: com.nlp.mlt.SimQuery.java

License: Apache License

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap.
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    // reusable UTF-8 -> char buffer, avoids allocating one per term
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        // skip terms flagged as noise
        if (isNoiseWord(term)) {
            continue;
        }
        final int freq = (int) termsEnum.totalTermFreq();

        // debug output of every retained term and its frequency
        System.out.println(term + " = " + freq);

        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}

From source file: org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java

License: Open Source License

/**
 * Builds a map describing every stored field of one document: its schema type,
 * schema/index flags, external and internal values, binary value (base64),
 * boost, docFreq and — when term vectors are stored — the field's term vector.
 *
 * @param doc the stored document to inspect
 * @param docId the document's id, used to fetch its term vectors
 * @param reader index reader used for docFreq and term vector lookup
 * @param schema schema used to resolve each field's type
 * @return an ordered map of field name to its details
 * @throws IOException on low-level index errors
 */
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    // reusable UTF-8 -> char buffer for decoding term vector terms
    final CharsRefBuilder spare = new CharsRefBuilder();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();

        // field may be absent from the schema; ftype stays null in that case
        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        // term in indexed form, used below only for the docFreq lookup
        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue()); // may be a binary number

        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
                    final TermsEnum termsEnum = v.iterator();
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        spare.copyUTF8Bytes(text);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                // term vector extraction is best effort — log and continue with other fields
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}

From source file: org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java

License: Open Source License

/**
 * Collects detailed statistics for one indexed field: the number of distinct
 * terms, the top-N terms by document frequency, and a histogram of document
 * frequencies bucketed by powers of two.
 *
 * @param req current request; supplies the searcher and the NUMTERMS parameter
 * @param field the field to analyze
 * @param fieldMap output map receiving "distinct", "topTerms" and "histogram"
 * @throws IOException on low-level index errors
 */
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
        throws IOException {

    SolrParams params = req.getParams();
    final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

    TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to
    // collect the top N
    // terms in.

    // reusable UTF-8 -> char buffer for decoding terms
    final CharsRefBuilder spare = new CharsRefBuilder();

    Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
    if (terms == null) { // field does not exist
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int[] buckets = new int[HIST_ARRAY_SIZE];
    while ((text = termsEnum.next()) != null) {
        ++tiq.distinctTerms;
        int freq = termsEnum.docFreq(); // This calculation seems odd, but
        // it gives the same results as it
        // used to.
        // histogram bucket = ceil(log2(freq)); freq 1 lands in slot 0
        int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
        buckets[slot] = buckets[slot] + 1;
        if (numTerms > 0 && freq > tiq.minFreq) {
            spare.copyUTF8Bytes(text);
            String t = spare.toString();

            tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
            if (tiq.size() > numTerms) { // if tiq full
                tiq.pop(); // remove lowest in tiq
                tiq.minFreq = tiq.getTopTermInfo().docFreq;
            }
        }
    }
    tiq.histogram.add(buckets);
    fieldMap.add("distinct", tiq.distinctTerms);

    // Include top terms
    fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

    // Add a histogram
    fieldMap.add("histogram", tiq.histogram.toNamedList());
}

From source file: org.apache.solr.search.join.BlockJoinFieldFacetAccumulator.java

License: Apache License

/**
 * Copy paste from {@link DocValuesFacets}: converts the accumulated per-ordinal
 * counts into a named list of (readable term, count) facet entries. Index i
 * maps to ordinal i-1, so i == 0 (ordinal -1, docs with no value) is skipped.
 *
 * @return facet counts keyed by the field's readable term representation
 * @throws IOException on low-level index errors
 */
NamedList<Integer> getFacetValue() throws IOException {
    NamedList<Integer> facetValue = new NamedList<>();
    final CharsRefBuilder charsRef = new CharsRefBuilder(); // if there is no globs, take segment's ones
    for (int i = 1; i < (globalCounts != null ? globalCounts.length : segmentAccums.length); i++) {
        // merged global counts when present; otherwise the high 32 bits of
        // this segment's accumulator hold the count
        int count = globalCounts != null ? globalCounts[i] : (int) (segmentAccums[i] >> 32);
        if (count > 0) {
            BytesRef term = topSSDV.lookupOrd(-1 + i);
            fieldType.indexedToReadable(term, charsRef);
            facetValue.add(charsRef.toString(), count);
        }
    }
    return facetValue;
}

From source file: org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java

License: Apache License

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap.
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName)
        throws IOException {
    final TermsEnum iter = vector.iterator();
    final CharsRefBuilder charsRef = new CharsRefBuilder();
    for (BytesRef bytes = iter.next(); bytes != null; bytes = iter.next()) {
        charsRef.copyUTF8Bytes(bytes);
        final String term = charsRef.toString();
        // skip terms flagged as noise or configured to be skipped for this field
        if (isNoiseWord(term) || isSkipTerm(fieldName, term)) {
            continue;
        }

        // sum this term's frequency over all postings
        int freq = 0;
        final PostingsEnum postings = iter.postings(null);
        while (postings != null && postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += postings.freq();
        }

        // accumulate into the shared map
        Int counter = termFreqMap.get(term);
        if (counter == null) {
            counter = new Int();
            counter.x = freq;
            termFreqMap.put(term, counter);
        } else {
            counter.x += freq;
        }
    }
}

From source file: org.elasticsearch.action.allterms.TransportAllTermsShardAction.java

License: Apache License

/**
 * Collects up to {@code request.size()} terms for {@code request.field()} from
 * all segments of the shard, in term order, starting at {@code request.from()}
 * (when given) and keeping only terms whose summed doc freq is at least
 * {@code request.minDocFreq()}.
 *
 * Fixes over the previous version: a segment whose seek/next yields no term no
 * longer causes an NPE in the initial minimum search (it is marked exhausted
 * instead), and IOExceptions are logged rather than silently swallowed.
 *
 * @param request the shard-level all-terms request
 * @param shardId shard to read from
 * @return the collected terms for this shard
 */
@Override
protected AllTermsSingleShardResponse shardOperation(AllTermsShardRequest request, ShardId shardId)
        throws ElasticsearchException {
    List<String> terms = new ArrayList<>();
    IndexService indexService = indicesService.indexServiceSafe(request.index());
    IndexShard indexShard = indexService.shardSafe(shardId.id());
    final Engine.Searcher searcher = indexShard.acquireSearcher("all_terms");
    IndexReader topLevelReader = searcher.reader();

    List<AtomicReaderContext> leaves = topLevelReader.leaves();

    try {
        if (leaves.size() == 0) {
            return new AllTermsSingleShardResponse(terms);
        }
        List<TermsEnum> termIters = new ArrayList<>();

        try {
            for (AtomicReaderContext reader : leaves) {
                termIters.add(reader.reader().terms(request.field()).iterator(null));
            }
        } catch (IOException e) {
            // previously swallowed silently; log so missing segments are visible
            logger.warn("[{}] failed to open terms iterator for field [{}]", e, shardId, request.field());
        }
        CharsRefBuilder spare = new CharsRefBuilder();
        BytesRef lastTerm = null;
        // 1 == segment fully consumed; Java zero-initializes the array, so no
        // explicit fill loop is needed
        int[] exhausted = new int[termIters.size()];
        try {
            // first find smallest term across all segments
            for (int i = 0; i < termIters.size(); i++) {
                BytesRef curTerm = null;
                if (request.from() != null) {
                    TermsEnum.SeekStatus seekStatus = termIters.get(i).seekCeil(new BytesRef(request.from()));
                    if (seekStatus.equals(TermsEnum.SeekStatus.END) == false) {
                        curTerm = termIters.get(i).term();
                    }
                } else {
                    curTerm = termIters.get(i).next();
                }

                if (curTerm == null || curTerm.length == 0) {
                    // This segment has no candidate term. The old code only
                    // detected this when no minimum had been found yet; a null
                    // curTerm on a later segment hit an NPE in compareTo below.
                    exhausted[i] = 1;
                    continue;
                }
                if (lastTerm == null || curTerm.compareTo(lastTerm) < 0) {
                    lastTerm = curTerm;
                }
            }
            if (lastTerm == null) {
                return new AllTermsSingleShardResponse(terms);
            }
            if (getDocFreq(termIters, lastTerm, request.field(), exhausted) >= request.minDocFreq()) {
                spare.copyUTF8Bytes(lastTerm);
                terms.add(spare.toString());
            }
            // deep copy: lastTerm currently aliases a BytesRef owned by a
            // TermsEnum and would be clobbered by the next advance
            BytesRef copy = new BytesRef();
            copy.copyBytes(lastTerm);
            lastTerm = copy;

            while (terms.size() < request.size() && lastTerm != null) {
                moveIterators(exhausted, termIters, lastTerm, shardId);
                lastTerm = findMinimum(exhausted, termIters, shardId);

                if (lastTerm != null) {

                    if (getDocFreq(termIters, lastTerm, request.field(), exhausted) >= request.minDocFreq()) {
                        spare.copyUTF8Bytes(lastTerm);
                        terms.add(spare.toString());
                    }
                }
            }
        } catch (IOException e) {
            // best effort: return whatever was collected, but leave a trace
            logger.warn("[{}] error while collecting terms for field [{}]", e, shardId, request.field());
        }

        logger.trace("[{}], final terms list: {}", shardId, terms);

        return new AllTermsSingleShardResponse(terms);
    } finally {
        searcher.close();
    }
}

From source file: org.elasticsearch.action.allterms.TransportAllTermsShardAction.java

License: Apache License

/**
 * Sums the document frequency of {@code lastTerm} over all non-exhausted
 * segment iterators that are currently positioned exactly on that term.
 *
 * @param termIters per-segment term iterators, already positioned
 * @param lastTerm the term whose doc freq is accumulated
 * @param field field name (currently unused here; kept for the callers)
 * @param exhausted per-segment flags, 1 == segment fully consumed
 * @return the summed doc freq; segments that fail to read are skipped
 */
private long getDocFreq(List<TermsEnum> termIters, BytesRef lastTerm, String field, int[] exhausted) {
    long docFreq = 0;
    if (logger.isTraceEnabled()) {
        CharsRefBuilder b = new CharsRefBuilder();
        b.copyUTF8Bytes(lastTerm);
        logger.trace("Compute doc freq for {}", b.toString());
    }

    for (int i = 0; i < termIters.size(); i++) {
        if (exhausted[i] == 0) {
            TermsEnum iter = termIters.get(i);
            try {
                if (logger.isTraceEnabled()) {
                    CharsRefBuilder b = new CharsRefBuilder();
                    b.copyUTF8Bytes(iter.term());
                    logger.trace("Doc freq on seg {} for term {} is {}", i, b.toString(), iter.docFreq());
                }

                // only segments positioned on the exact term contribute
                if (iter.term().compareTo(lastTerm) == 0) {
                    docFreq += iter.docFreq();
                }
            } catch (IOException e) {
                // previously an empty catch — log so a partial count is at least visible
                logger.warn("error reading doc freq on segment {}", e, i);
            }
        }
    }
    return docFreq;
}

From source file: org.elasticsearch.action.allterms.TransportAllTermsShardAction.java

License: Apache License

/**
 * Returns a deep copy of the smallest current term across all non-exhausted
 * segment iterators, or null when every segment is exhausted.
 *
 * Fixes over the previous version: a failed {@code term()} read (IOException)
 * left {@code candidate} null and then either NPE'd in compareTo or silently
 * replaced the minimum with null; such segments are now skipped and the
 * exception is logged instead of swallowed.
 *
 * @param exhausted per-segment flags, 1 == segment fully consumed
 * @param termIters per-segment term iterators, already positioned
 * @param shardId shard id, for log context only
 * @return copy of the minimum term, or null if none
 */
private BytesRef findMinimum(int[] exhausted, List<TermsEnum> termIters, ShardId shardId) {
    BytesRef minTerm = null;
    for (int i = 0; i < termIters.size(); i++) {
        if (exhausted[i] == 1) {
            continue;
        }
        BytesRef candidate = null;
        try {
            candidate = termIters.get(i).term();
        } catch (IOException e) {
            // previously an empty catch — log before skipping this segment
            logger.warn("[{}] error reading current term on segment {}", e, shardId, i);
        }
        if (candidate == null) {
            // nothing readable here; skip rather than dereference null below
            continue;
        }
        if (minTerm == null) {
            minTerm = candidate;
        } else if (minTerm.compareTo(candidate) > 0) {
            // candidate is actually smaller, so it becomes the new minimum
            minTerm = candidate;
            if (logger.isTraceEnabled()) {
                CharsRefBuilder toiString = new CharsRefBuilder();
                toiString.copyUTF8Bytes(minTerm);
                logger.trace("{} Setting min to  {} from segment {}", shardId, toiString.toString(), i);
            }
        }
    }
    if (minTerm != null) {
        if (logger.isTraceEnabled()) {
            CharsRefBuilder toiString = new CharsRefBuilder();
            toiString.copyUTF8Bytes(minTerm);
            logger.trace("{} final min term {}", shardId, toiString.toString());
        }
        // deep copy: minTerm aliases a BytesRef owned by a TermsEnum
        BytesRef ret = new BytesRef();
        ret.copyBytes(minTerm);
        return ret;
    }
    return null;
}