List of usage examples for org.apache.lucene.util.BytesRef.deepCopyOf
public static BytesRef deepCopyOf(BytesRef other)
Creates a new BytesRef that points to a copy of the bytes from other. The returned BytesRef will have a length of other.length and an offset of zero.
Parameter: other — the BytesRef to copy.
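Why the deep copy matters: Lucene's TermsEnum (and the TermToBytesRefAttribute used during analysis) reuses a single BytesRef instance across iterations, so any term value you want to keep beyond the next call to next() or incrementToken() must be copied first. A minimal sketch, assuming a Lucene 5.x-style API; the field name "myfield" and the index path are hypothetical:

import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class DeepCopyExample {
    public static void main(String[] args) throws Exception {
        List<BytesRef> collected = new ArrayList<>();
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            Terms terms = MultiFields.getTerms(reader, "myfield");
            if (terms != null) {
                TermsEnum te = terms.iterator();
                BytesRef term;
                while ((term = te.next()) != null) {
                    // te.next() reuses the same BytesRef; without deepCopyOf every
                    // element of 'collected' would end up pointing at the last term.
                    collected.add(BytesRef.deepCopyOf(term));
                }
            }
        }
        System.out.println(collected.size() + " terms copied");
    }
}

The per-project examples below all follow this pattern: deep-copy at the moment a reused BytesRef needs to outlive its enumerator.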
From source file:biospectra.classify.Classifier.java
License:Apache License
private void createNaiveKmerQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream,
        TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    while (stream.incrementToken()) {
        // The attribute reuses its BytesRef for every token, so deep-copy before wrapping it in a Term.
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        builder.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
    }
}
From source file:biospectra.classify.Classifier.java
License:Apache License
private void createChainProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream,
        TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    Term[] termArr = new Term[2];
    long[] offsetArr = new long[2];
    for (int i = 0; i < 2; i++) {
        termArr[i] = null;
        offsetArr[i] = 0;
    }

    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if (termArr[0] == null) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else if (termArr[1] == null) {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        } else {
            // shift
            termArr[0] = termArr[1];
            offsetArr[0] = offsetArr[1];
            // fill
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        }

        if (termArr[0] != null && termArr[1] != null) {
            long offsetDiff = offsetArr[1] - offsetArr[0];
            if (offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();
                pq.setSlop((int) offsetDiff + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);
                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }
        }
    }
}
From source file:biospectra.classify.Classifier.java
License:Apache License
private void createPairedProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream,
        TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    Term[] termArr = new Term[2];
    long[] offsetArr = new long[2];
    for (int i = 0; i < 2; i++) {
        termArr[i] = null;
        offsetArr[i] = 0;
    }

    int count = 0;
    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if (count % 2 == 0) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();

            long offsetDiff = offsetArr[1] - offsetArr[0];
            if (offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();
                pq.setSlop((int) offsetDiff + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);
                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }

            termArr[0] = null;
            termArr[1] = null;
        }
        count++;
    }

    if (termArr[0] != null) {
        // Odd number of tokens: emit the unpaired trailing term on its own.
        builder.add(new TermQuery(termArr[0]), BooleanClause.Occur.SHOULD);
        termArr[0] = null;
    }
}
From source file:com.basho.yokozuna.handler.EntropyData.java
License:Open Source License
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
        throws Exception, InstantiationException, IllegalAccessException {
    String contParam = req.getParams().get("continue");
    BytesRef cont = contParam != null ? decodeCont(contParam) : DEFAULT_CONT;

    // TODO: Make 'before' required in handler config
    String before = req.getParams().get("before");
    if (before == null) {
        throw new Exception("Parameter 'before' is required");
    }

    int n = req.getParams().getInt("n", DEFAULT_N);

    // Add docs here and modify the object inline in code
    SolrDocumentList docs = new SolrDocumentList();
    rsp.add("response", docs);

    try {
        SolrIndexSearcher searcher = req.getSearcher();
        AtomicReader rdr = searcher.getAtomicReader();
        BytesRef tmp = null;
        Terms terms = rdr.terms(ENTROPY_DATA_FIELD);
        TermsEnum te = terms.iterator(null);

        if (isContinue(cont)) {
            log.debug("continue from " + cont);

            TermsEnum.SeekStatus status = te.seekCeil(cont, true);

            if (status == TermsEnum.SeekStatus.END) {
                rsp.add("more", false);
                return;
            } else if (status == TermsEnum.SeekStatus.FOUND) {
                // If this term has already been seen then skip it.
                tmp = te.next();
                if (endOfItr(tmp)) {
                    rsp.add("more", false);
                    return;
                }
            } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
                tmp = te.next();
            }
        } else {
            tmp = te.next();
        }

        String text = null;
        String[] vals = null;
        String ts = null;
        String docId = null;
        String vectorClock = null;
        int count = 0;
        BytesRef current = null;

        while (!endOfItr(tmp) && count < n) {
            // Deep-copy: 'tmp' is reused by the TermsEnum, but 'current' must
            // survive the loop to build the continuation token below.
            current = BytesRef.deepCopyOf(tmp);
            text = tmp.utf8ToString();
            log.debug("text: " + text);
            vals = text.split(" ");
            ts = vals[0];

            // TODO: what if null?
            if (!(ts.compareTo(before) < 0)) {
                rsp.add("more", false);
                docs.setNumFound(count);
                return;
            }

            docId = vals[1];
            vectorClock = vals[2];
            SolrDocument tmpDoc = new SolrDocument();
            tmpDoc.addField("doc_id", docId);
            tmpDoc.addField("base64_vclock", Base64.encodeBase64String(sha(vectorClock)));
            docs.add(tmpDoc);
            count++;
            tmp = te.next();
        }

        if (count < n) {
            rsp.add("more", false);
        } else {
            rsp.add("more", true);
            // deepCopyOf guarantees offset 0 and length == bytes.length,
            // so encoding current.bytes directly is safe here.
            String newCont = Base64.encodeBase64URLSafeString(current.bytes);
            // The continue context for the next request to start where this one finished.
            rsp.add("continuation", newCont);
        }

        docs.setNumFound(count);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java
License:Apache License
/**
 * Builds the frame of the cache. The cache stores word occurrences in
 * memory after they have been searched once. Used properly, this cache can
 * give a 2-100x speedup, but it can consume a lot of memory. To reduce
 * memory consumption, words with a very low occurrence count in the index
 * can be filtered out. The other parameter switches the term searching:
 * if true, only the terms in the skeleton are searched; if false, terms
 * not in the cache are searched as well (but not cached).
 *
 * @param minTermOccurrenceInCache Higher values reduce the cache size.
 * @param justCachedTerms The switch to fully exclude low-occurrence terms.
 * @throws IOException If there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
    this.justCachedTerms = justCachedTerms;
    this.docsWithClassSize = countDocsWithClass();
    termCClassHitCache.clear();
    cclasses.clear();
    classTermFreq.clear();

    // build the cache for the words
    Map<String, Long> frequencyMap = new HashMap<>();
    for (String textFieldName : textFieldNames) {
        TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termText = term.utf8ToString();
            long frequency = termsEnum.docFreq();
            Long lastfreq = frequencyMap.get(termText);
            if (lastfreq != null) {
                frequency += lastfreq;
            }
            frequencyMap.put(termText, frequency);
        }
    }
    for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
        if (entry.getValue() > minTermOccurrenceInCache) {
            termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
        }
    }

    // fill the class list; deep-copy each term because the enum reuses its BytesRef
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    while ((termsEnum.next()) != null) {
        cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
    }

    // fill the classTermFreq map
    for (BytesRef cclass : cclasses) {
        double avgNumberOfUniqueTerms = 0;
        for (String textFieldName : textFieldNames) {
            terms = MultiFields.getTerms(leafReader, textFieldName);
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
        }
        int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
        classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
    }
}
From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java
License:Apache License
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
    if (leafReader == null) {
        throw new IOException("You must first call Classifier#train");
    }

    List<ClassificationResult<BytesRef>> dataList = new ArrayList<>();
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    BytesRef next;
    String[] tokenizedDoc = tokenizeDoc(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    while ((next = termsEnum.next()) != null) {
        double clVal = calculateLogPrior(next, docsWithClassSize)
                + calculateLogLikelihood(tokenizedDoc, next, docsWithClassSize);
        // deep-copy the class term: 'next' is reused on the following iteration
        dataList.add(new ClassificationResult<>(BytesRef.deepCopyOf(next), clVal));
    }

    // normalization: transform the log-scores to a 0-1 range
    ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    if (!dataList.isEmpty()) {
        Collections.sort(dataList);
        // smax is the negative log-score closest to 0 (call it a)
        double smax = dataList.get(0).getScore();

        double sumLog = 0;
        // log(sum(exp(x_n - a)))
        for (ClassificationResult<BytesRef> cr : dataList) {
            // getScore() - smax <= 0 (both negative; smax has the smallest absolute value)
            sumLog += Math.exp(cr.getScore() - smax);
        }
        // loga = a + log(sum(exp(x_n - a))) = log(sum(exp(x_n)))
        double loga = smax;
        loga += Math.log(sumLog);

        // 1/sum * x = exp(log(x)) * 1/sum = exp(log(x) - log(sum))
        for (ClassificationResult<BytesRef> cr : dataList) {
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(cr.getScore() - loga)));
        }
    }
    return returnList;
}
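The normalization in this method (and its twin below) is the standard log-sum-exp trick: subtracting the largest log-score before exponentiating keeps Math.exp from underflowing to zero when every score is a large negative log-probability. A self-contained sketch of the same computation, with a hypothetical scores array:

public class LogSumExpSketch {
    public static void main(String[] args) {
        double[] scores = { -1050.2, -1052.7, -1060.0 }; // hypothetical log-scores

        // a = the largest (closest to zero) log-score
        double max = Double.NEGATIVE_INFINITY;
        for (double s : scores) {
            max = Math.max(max, s);
        }

        // log(sum(exp(x_n))) = a + log(sum(exp(x_n - a))); each exponent is <= 0
        double sum = 0;
        for (double s : scores) {
            sum += Math.exp(s - max);
        }
        double logSum = max + Math.log(sum);

        // probabilities in (0, 1] that sum to 1; Math.exp(scores[i]) alone would underflow to 0.0
        for (double s : scores) {
            System.out.println(Math.exp(s - logSum));
        }
    }
}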
From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java
License:Apache License
private List<ClassificationResult<BytesRef>> assignRankClassNormalizedList(String inputDocument) throws IOException {
    if (leafReader == null) {
        throw new IOException("You must first call Classifier#train");
    }
    if (this.rankClassFieldNames == null || this.rankClassFieldNames.length == 0) {
        throw new IOException("rankClassFieldNames must be defined");
    }

    List<ClassificationResult<BytesRef>> dataList = new ArrayList<>();
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    BytesRef next;
    String[] tokenizedDoc = tokenizeDoc(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    while ((next = termsEnum.next()) != null) {
        double clVal = calculateLogPrior(next, docsWithClassSize)
                + calculateLogLikelihood(tokenizedDoc, next, docsWithClassSize);
        dataList.add(new ClassificationResult<>(BytesRef.deepCopyOf(next), clVal));
    }

    // normalization: transform the log-scores to a 0-1 range (see assignClassNormalizedList)
    ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    if (!dataList.isEmpty()) {
        Collections.sort(dataList);
        // smax is the negative log-score closest to 0 (call it a)
        double smax = dataList.get(0).getScore();

        double sumLog = 0;
        // log(sum(exp(x_n - a)))
        for (ClassificationResult<BytesRef> cr : dataList) {
            sumLog += Math.exp(cr.getScore() - smax);
        }
        // loga = a + log(sum(exp(x_n - a))) = log(sum(exp(x_n)))
        double loga = smax;
        loga += Math.log(sumLog);

        // 1/sum * x = exp(log(x)) * 1/sum = exp(log(x) - log(sum))
        for (ClassificationResult<BytesRef> cr : dataList) {
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(cr.getScore() - loga)));
        }
    }
    return returnList;
}
From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java
License:Apache License
private Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> getOptions(Term text,
        PrefixTermsEnum prefixTermsEnum, IndexReader indexReader, final int size) throws IOException {
    OptionQueue collectionQueue = new OptionQueue(size);
    BytesRef ref;
    while ((ref = prefixTermsEnum.next()) != null) {
        Term term = new Term(text.field(), BytesRef.deepCopyOf(ref));
        collectionQueue.insertWithOverflow(new Suggest.Suggestion.Entry.Option(
                new StringText(term.bytes().utf8ToString()), indexReader.docFreq(term)));
    }
    Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> entry =
            new Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>(
                    new StringText(text.text()), 0, text.bytes().length);
    while (collectionQueue.size() > 0) {
        entry.addOption(collectionQueue.pop());
    }
    return entry;
}
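OptionQueue itself is not shown here, but its insertWithOverflow/pop calls match the bounded-queue pattern of org.apache.lucene.util.PriorityQueue, which keeps only the top-N entries. A minimal sketch of that pattern, with a hypothetical Candidate class standing in for the suggestion options:

import org.apache.lucene.util.PriorityQueue;

// Hypothetical element type standing in for the suggestion options above.
final class Candidate {
    final String term;
    final int docFreq;

    Candidate(String term, int docFreq) {
        this.term = term;
        this.docFreq = docFreq;
    }
}

// A bounded top-N queue: once full, insertWithOverflow evicts the entry that
// lessThan ranks worst (here, the lowest document frequency).
final class TopCandidates extends PriorityQueue<Candidate> {
    TopCandidates(int size) {
        super(size);
    }

    @Override
    protected boolean lessThan(Candidate a, Candidate b) {
        return a.docFreq < b.docFreq;
    }
}

Popping until the queue is empty yields the survivors in worst-to-best order, which is why getOptions above drains collectionQueue with pop() in a loop.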
From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java
License:Apache License
private List<Term> queryTerms(SuggestionSearchContext.SuggestionContext suggestion, CharsRefBuilder spare)
        throws IOException {
    final String field = suggestion.getField();
    final List<Term> ret = new ArrayList<Term>();
    SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field,
            new SuggestUtils.TokenConsumer() {
                @Override
                public void nextToken() {
                    ret.add(new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder()))));
                }
            }, spare);
    return ret;
}
From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java
License:Apache License
private boolean setSavedStartTerm(BytesRef startTerm) {
    // Deep-copy so the saved term is not clobbered if the caller reuses its BytesRef.
    // Returning true allows this to be invoked inside an assert statement.
    savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm);
    return true;
}