Example usage for org.apache.lucene.util CharsRefBuilder copyUTF8Bytes

List of usage examples for org.apache.lucene.util CharsRefBuilder copyUTF8Bytes

Introduction

On this page you can find example usage for org.apache.lucene.util CharsRefBuilder.copyUTF8Bytes.

Prototype

public void copyUTF8Bytes(BytesRef bytes) 

Source Link

Document

Copy the provided bytes, interpreted as UTF-8 bytes.

Usage

From source file: alix.lucene.MoreLikeThis.java

License: Apache License

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap.
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @throws IOException if the term enumeration fails
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    for (BytesRef bytes = termsEnum.next(); bytes != null; bytes = termsEnum.next()) {
        // Convert the UTF-8 term bytes into a String key.
        spare.copyUTF8Bytes(bytes);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        final int freq = (int) termsEnum.totalTermFreq();

        // Accumulate the frequency, creating the counter on first sight.
        Int counter = termFreqMap.get(term);
        if (counter != null) {
            counter.x += freq;
        } else {
            counter = new Int();
            counter.x = freq;
            termFreqMap.put(term, counter);
        }
    }
}

From source file: alix.lucene.MoreLikeThis.java

License: Apache License

/**
 * Print a term vector for debugging.
 *
 * @param vector List of terms and their frequencies for a doc/field
 * @throws IOException if reading the term vector fails
 */
@SuppressWarnings("unused")
private void print(Terms vector) throws IOException {
    if (vector == null) {
        return;
    }
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    // The vector acts like a single-document inverted index (docFreq() = 1),
    // so totalTermFreq() is the in-document frequency of each term.
    final HashMap<String, Long> freqByTerm = new HashMap<String, Long>();
    BytesRef bytes;
    while ((bytes = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(bytes);
        freqByTerm.put(spare.toString(), termsEnum.totalTermFreq());
    }
    @SuppressWarnings("unchecked")
    Map.Entry<String, Long>[] entries = freqByTerm.entrySet().toArray(new Map.Entry[0]);
    // Highest frequency first.
    Arrays.sort(entries, new Comparator<Map.Entry<String, Long>>() {
        public int compare(Map.Entry<String, Long> left, Map.Entry<String, Long> right) {
            return right.getValue().compareTo(left.getValue());
        }
    });
    for (Map.Entry<String, Long> entry : entries) {
        System.out.print(entry.getKey() + ":" + entry.getValue() + " ");
    }
    System.out.println();
}

From source file: com.nlp.mlt.SimQuery.java

License: Apache License

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap,
 * printing each retained term and its frequency to stdout.
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @throws IOException if the term enumeration fails
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    for (BytesRef bytes = termsEnum.next(); bytes != null; bytes = termsEnum.next()) {
        spare.copyUTF8Bytes(bytes);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        final int freq = (int) termsEnum.totalTermFreq();

        System.out.println(term + " = " + freq);

        // Accumulate the frequency, creating the counter on first sight.
        Int counter = termFreqMap.get(term);
        if (counter != null) {
            counter.x += freq;
        } else {
            counter = new Int();
            counter.x = freq;
            termFreqMap.put(term, counter);
        }
    }
}

From source file: org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java

License: Open Source License

/**
 * Builds a per-field information map (type, schema/index flags, stored and
 * internal values, docFreq and, when stored, the term vector) for every
 * field of the given document.
 *
 * @param doc the stored document to describe
 * @param docId the document's id, used to fetch its term vectors
 * @param reader the index reader docFreq and term vectors are read from
 * @param schema the schema used to resolve field types (entries may be null)
 * @return a map keyed by field name with one info map per field
 * @throws IOException if reading from the index fails
 */
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRefBuilder spare = new CharsRefBuilder();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();

        // The schema entry may be null for fields not declared in the schema.
        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        // Use the indexed form of the value when the type can produce one.
        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue()); // may be a binary number

        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
                    final TermsEnum termsEnum = v.iterator();
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        // Convert the UTF-8 term bytes to a String key.
                        spare.copyUTF8Bytes(text);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                // Term-vector extraction is best-effort: log and keep going
                // with the remaining fields.
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}

From source file: org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java

License: Open Source License

/**
 * Adds detailed statistics about one field to {@code fieldMap}: the number
 * of distinct terms, the top-N terms by document frequency, and a histogram
 * of document frequencies bucketed by power of two.
 *
 * @param req the request, supplying the searcher and the NUMTERMS parameter
 * @param field the field whose terms are inspected
 * @param fieldMap the response map the statistics are appended to
 * @throws IOException if reading terms from the index fails
 */
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
        throws IOException {

    SolrParams params = req.getParams();
    final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

    TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to collect the top N terms in.

    final CharsRefBuilder spare = new CharsRefBuilder();

    Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
    if (terms == null) { // field does not exist
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int[] buckets = new int[HIST_ARRAY_SIZE];
    while ((text = termsEnum.next()) != null) {
        ++tiq.distinctTerms;
        int freq = termsEnum.docFreq(); // This calculation seems odd, but it gives the same results as it used to.
        // Histogram slot is ceil(log2(freq)); a freq of 1 lands in slot 0.
        int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
        buckets[slot] = buckets[slot] + 1;
        if (numTerms > 0 && freq > tiq.minFreq) {
            // Only materialize the term string when it may enter the top-N queue.
            spare.copyUTF8Bytes(text);
            String t = spare.toString();

            tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
            if (tiq.size() > numTerms) { // if tiq full
                tiq.pop(); // remove lowest in tiq
                tiq.minFreq = tiq.getTopTermInfo().docFreq;
            }
        }
    }
    tiq.histogram.add(buckets);
    fieldMap.add("distinct", tiq.distinctTerms);

    // Include top terms
    fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

    // Add a histogram
    fieldMap.add("histogram", tiq.histogram.toNamedList());
}

From source file: org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java

License: Apache License

/**
 * Adds terms and frequencies found in vector into the Map termFreqMap.
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 * @throws IOException if the term enumeration or postings access fails
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName)
        throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    for (BytesRef bytes = termsEnum.next(); bytes != null; bytes = termsEnum.next()) {
        spare.copyUTF8Bytes(bytes);
        final String term = spare.toString();
        if (isNoiseWord(term) || isSkipTerm(fieldName, term)) {
            continue;
        }

        // Sum the within-document frequency over all postings of this term.
        int freq = 0;
        final PostingsEnum postings = termsEnum.postings(null);
        if (postings != null) {
            while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                freq += postings.freq();
            }
        }

        // Accumulate the frequency, creating the counter on first sight.
        Int counter = termFreqMap.get(term);
        if (counter != null) {
            counter.x += freq;
        } else {
            counter = new Int();
            counter.x = freq;
            termFreqMap.put(term, counter);
        }
    }
}

From source file: org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.java

License: Apache License

/**
 * Runs the analyzer over the UTF-8 bytes in {@code toAnalyze} for the given
 * field and forwards the resulting token stream to {@code consumer}.
 *
 * @param analyzer the analyzer producing the token stream
 * @param toAnalyze UTF-8 bytes to analyze
 * @param field the field name that selects the analysis chain
 * @param consumer receives the tokens
 * @param spare scratch buffer used for the UTF-8 to char conversion
 * @return the value returned by the token-stream overload of {@code analyze}
 * @throws IOException if tokenization fails
 */
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer,
        CharsRefBuilder spare) throws IOException {
    spare.copyUTF8Bytes(toAnalyze);
    final CharsRef chars = spare.get();
    final FastCharArrayReader reader = new FastCharArrayReader(chars.chars, chars.offset, chars.length);
    try (TokenStream stream = analyzer.tokenStream(field, reader)) {
        return analyze(stream, consumer);
    }
}

From source file: org.codelibs.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java

License: Apache License

/**
 * Builds a token stream for {@code field} over the UTF-8 bytes of the query.
 *
 * @param analyzer the analyzer to tokenize with
 * @param query UTF-8 encoded query bytes
 * @param spare scratch buffer reused for the byte-to-char conversion
 * @param field field whose analysis chain is applied
 * @return the (unconsumed) token stream
 * @throws IOException if the analyzer fails to open the stream
 */
public TokenStream tokenStream(Analyzer analyzer, BytesRef query, CharsRefBuilder spare, String field)
        throws IOException {
    spare.copyUTF8Bytes(query);
    final FastCharArrayReader reader = new FastCharArrayReader(spare.chars(), 0, spare.length());
    return analyzer.tokenStream(field, reader);
}

From source file: org.elasticsearch.action.allterms.TransportAllTermsShardAction.java

License: Apache License

/**
 * Collects distinct terms of {@code request.field()} across all segments of
 * the shard, in term order, starting at {@code request.from()} (when set)
 * and keeping only terms whose summed document frequency reaches
 * {@code request.minDocFreq()}, until {@code request.size()} terms are found.
 *
 * Fixes over the previous version: IOExceptions are logged instead of being
 * silently swallowed, and a segment whose seek finds no term no longer
 * causes a NullPointerException when another segment already supplied a
 * candidate term.
 *
 * @param request the all-terms request (field, from, size, minDocFreq)
 * @param shardId the shard to read from
 * @return the collected terms for this shard
 */
@Override
protected AllTermsSingleShardResponse shardOperation(AllTermsShardRequest request, ShardId shardId)
        throws ElasticsearchException {
    List<String> terms = new ArrayList<>();
    IndexService indexService = indicesService.indexServiceSafe(request.index());
    IndexShard indexShard = indexService.shardSafe(shardId.id());
    final Engine.Searcher searcher = indexShard.acquireSearcher("all_terms");
    IndexReader topLevelReader = searcher.reader();

    List<AtomicReaderContext> leaves = topLevelReader.leaves();

    try {
        if (leaves.size() == 0) {
            return new AllTermsSingleShardResponse(terms);
        }
        List<TermsEnum> termIters = new ArrayList<>();

        try {
            for (AtomicReaderContext reader : leaves) {
                termIters.add(reader.reader().terms(request.field()).iterator(null));
            }
        } catch (IOException e) {
            // Previously swallowed silently; record the failure so missing
            // segments are explainable.
            logger.warn("[{}] failed to obtain terms iterator for field [{}]", e, shardId, request.field());
        }
        CharsRefBuilder spare = new CharsRefBuilder();
        BytesRef lastTerm = null;
        // 0 = iterator still has terms, 1 = exhausted. Java zero-initializes
        // the array, so no explicit init loop is needed.
        int[] exhausted = new int[termIters.size()];
        try {
            // First find the smallest candidate term across all segments.
            for (int i = 0; i < termIters.size(); i++) {
                BytesRef curTerm = null;
                if (request.from() != null) {
                    TermsEnum.SeekStatus seekStatus = termIters.get(i).seekCeil(new BytesRef(request.from()));
                    if (seekStatus.equals(TermsEnum.SeekStatus.END) == false) {
                        curTerm = termIters.get(i).term();
                    }
                } else {
                    curTerm = termIters.get(i).next();
                }

                if (lastTerm == null) {
                    lastTerm = curTerm;
                    if (lastTerm == null || lastTerm.length == 0) {
                        lastTerm = null;
                        exhausted[i] = 1;
                    }
                } else if (curTerm == null) {
                    // This segment has no term at or after `from`; previously
                    // this dereferenced curTerm and threw an NPE.
                    exhausted[i] = 1;
                } else if (curTerm.compareTo(lastTerm) < 0) {
                    lastTerm = curTerm;
                }
            }
            if (lastTerm == null) {
                return new AllTermsSingleShardResponse(terms);
            }
            if (getDocFreq(termIters, lastTerm, request.field(), exhausted) >= request.minDocFreq()) {
                spare.copyUTF8Bytes(lastTerm);
                terms.add(spare.toString());
            }
            // Deep-copy: the BytesRef returned by the enum may be reused on next().
            BytesRef copy = new BytesRef();
            copy.copyBytes(lastTerm);
            lastTerm = copy;

            while (terms.size() < request.size() && lastTerm != null) {
                moveIterators(exhausted, termIters, lastTerm, shardId);
                lastTerm = findMinimum(exhausted, termIters, shardId);

                if (lastTerm != null
                        && getDocFreq(termIters, lastTerm, request.field(), exhausted) >= request.minDocFreq()) {
                    spare.copyUTF8Bytes(lastTerm);
                    terms.add(spare.toString());
                }
            }
        } catch (IOException e) {
            // Previously swallowed silently; the partial term list is still
            // returned, but the failure is now visible in the logs.
            logger.warn("[{}] error while collecting terms for field [{}]", e, shardId, request.field());
        }

        logger.trace("[{}], final terms list: {}", shardId, terms);

        return new AllTermsSingleShardResponse(terms);
    } finally {
        searcher.close();
    }
}

From source file: org.elasticsearch.action.allterms.TransportAllTermsShardAction.java

License: Apache License

/**
 * Sums docFreq() over all non-exhausted segment iterators whose current term
 * equals {@code lastTerm}.
 *
 * @param termIters per-segment term iterators, positioned at their current term
 * @param lastTerm the term whose document frequency is accumulated
 * @param field unused; kept for interface compatibility with existing callers
 * @param exhausted per-iterator flags; non-zero means the iterator is exhausted
 * @return the summed document frequency across matching segments
 */
private long getDocFreq(List<TermsEnum> termIters, BytesRef lastTerm, String field, int[] exhausted) {
    long docFreq = 0;
    if (logger.isTraceEnabled()) {
        CharsRefBuilder b = new CharsRefBuilder();
        b.copyUTF8Bytes(lastTerm);
        logger.trace("Compute doc freq for {}", b.toString());
    }

    for (int i = 0; i < termIters.size(); i++) {
        if (exhausted[i] != 0) {
            continue;
        }
        TermsEnum iter = termIters.get(i);
        try {
            if (logger.isTraceEnabled()) {
                CharsRefBuilder b = new CharsRefBuilder();
                b.copyUTF8Bytes(iter.term());
                logger.trace("Doc freq on seg {} for term {} is {}", i, b.toString(), iter.docFreq());
            }

            if (iter.term().compareTo(lastTerm) == 0) {
                docFreq += iter.docFreq();
            }
        } catch (IOException e) {
            // Previously swallowed silently; surface the failure so an
            // undercounted frequency is explainable from the logs.
            logger.trace("Could not read doc freq on segment {}", e, i);
        }
    }
    return docFreq;
}