Example usage for org.apache.lucene.util BytesRef deepCopyOf

List of usage examples for org.apache.lucene.util BytesRef deepCopyOf

Introduction

On this page you can find example usage for org.apache.lucene.util BytesRef deepCopyOf.

Prototype

public static BytesRef deepCopyOf(BytesRef other) 

Document

Creates a new BytesRef that points to a copy of the bytes from other.

The returned BytesRef will have a length of other.length and an offset of zero.
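
A minimal sketch (not taken from the usage examples below; class and variable names are illustrative) showing why the deep copy matters when the original backing byte[] is later mutated:

import java.nio.charset.StandardCharsets;

import org.apache.lucene.util.BytesRef;

public class DeepCopyOfSketch {
    public static void main(String[] args) {
        byte[] buffer = "hello".getBytes(StandardCharsets.UTF_8);

        // A plain BytesRef is just a view over the shared buffer.
        BytesRef view = new BytesRef(buffer);

        // deepCopyOf allocates its own byte[] of view.length bytes at offset 0.
        BytesRef copy = BytesRef.deepCopyOf(view);

        buffer[0] = 'j'; // mutate the original backing array

        System.out.println(view.utf8ToString()); // prints "jello"
        System.out.println(copy.utf8ToString()); // still prints "hello"
    }
}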

Usage

From source file:biospectra.classify.Classifier.java

License:Apache License

private void createNaiveKmerQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream,
        TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    while (stream.incrementToken()) {
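        // The attribute's BytesRef is reused on every incrementToken() call,
        // so take a deep copy before storing the bytes in a Term.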
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        builder.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
    }
}

From source file:biospectra.classify.Classifier.java

License:Apache License

private void createChainProximityQueryClauses(BooleanQuery.Builder builder, String field,
        CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt)
        throws IOException {
    Term termArr[] = new Term[2];
    long offsetArr[] = new long[2];
    for (int i = 0; i < 2; i++) {
        termArr[i] = null;
        offsetArr[i] = 0;
    }

    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if (termArr[0] == null) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else if (termArr[1] == null) {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        } else {
            // shift
            termArr[0] = termArr[1];
            offsetArr[0] = offsetArr[1];
            // fill
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        }

        if (termArr[0] != null && termArr[1] != null) {
            long offsetDiff = offsetArr[1] - offsetArr[0];
            if (offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();

                pq.setSlop((int) (offsetDiff) + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);

                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }
        }
    }
}

From source file:biospectra.classify.Classifier.java

License:Apache License

private void createPairedProximityQueryClauses(BooleanQuery.Builder builder, String field,
        CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt)
        throws IOException {
    Term termArr[] = new Term[2];
    long offsetArr[] = new long[2];
    for (int i = 0; i < 2; i++) {
        termArr[i] = null;
        offsetArr[i] = 0;
    }

    int count = 0;
    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if (count % 2 == 0) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();

            long offsetDiff = offsetArr[1] - offsetArr[0];
            if (offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();

                pq.setSlop((int) (offsetDiff) + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);

                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }

            termArr[0] = null;
            termArr[1] = null;
        }

        count++;
    }

    if (termArr[0] != null) {
        builder.add(new TermQuery(termArr[0]), BooleanClause.Occur.SHOULD);
        termArr[0] = null;
    }
}

From source file:com.basho.yokozuna.handler.EntropyData.java

License:Open Source License

@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
        throws Exception, InstantiationException, IllegalAccessException {

    String contParam = req.getParams().get("continue");
    BytesRef cont = contParam != null ? decodeCont(contParam) : DEFAULT_CONT;

    // TODO: Make before required in handler config
    String before = req.getParams().get("before");
    if (before == null) {
        throw new Exception("Parameter 'before' is required");
    }
    int n = req.getParams().getInt("n", DEFAULT_N);
    SolrDocumentList docs = new SolrDocumentList();

    // Add docs here and modify object inline in code
    rsp.add("response", docs);

    try {
        SolrIndexSearcher searcher = req.getSearcher();
        AtomicReader rdr = searcher.getAtomicReader();
        BytesRef tmp = null;
        Terms terms = rdr.terms(ENTROPY_DATA_FIELD);
        TermsEnum te = terms.iterator(null);

        if (isContinue(cont)) {
            log.debug("continue from " + cont);

            TermsEnum.SeekStatus status = te.seekCeil(cont, true);

            if (status == TermsEnum.SeekStatus.END) {
                rsp.add("more", false);
                return;
            } else if (status == TermsEnum.SeekStatus.FOUND) {
                // If this term has already been seen then skip it.
                tmp = te.next();

                if (endOfItr(tmp)) {
                    rsp.add("more", false);
                    return;
                }
            } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
                tmp = te.next();
            }
        } else {
            tmp = te.next();
        }

        String text = null;
        String[] vals = null;
        String ts = null;
        String docId = null;
        String vectorClock = null;
        int count = 0;
        BytesRef current = null;

        while (!endOfItr(tmp) && count < n) {
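            // te.next() may reuse the returned BytesRef, so keep a deep copy of
            // the last term seen; it becomes the continuation token below.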
            current = BytesRef.deepCopyOf(tmp);
            text = tmp.utf8ToString();
            log.debug("text: " + text);
            vals = text.split(" ");
            ts = vals[0];

            // TODO: what if null?
            if (!(ts.compareTo(before) < 0)) {
                rsp.add("more", false);
                docs.setNumFound(count);
                return;
            }

            docId = vals[1];
            vectorClock = vals[2];
            SolrDocument tmpDoc = new SolrDocument();
            tmpDoc.addField("doc_id", docId);
            tmpDoc.addField("base64_vclock", Base64.encodeBase64String(sha(vectorClock)));
            docs.add(tmpDoc);
            count++;
            tmp = te.next();
        }

        if (count < n) {
            rsp.add("more", false);
        } else {
            rsp.add("more", true);
            String newCont = Base64.encodeBase64URLSafeString(current.bytes);
            // The continuation context for the next request to start where
            // this one finished.
            rsp.add("continuation", newCont);
        }

        docs.setNumFound(count);

    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java

License:Apache License

/**
 * Builds the frame (skeleton) of the cache. The cache stores word occurrences
 * in memory once they have been searched, which can give a 2-100x speedup when
 * used properly, but it can also consume a lot of memory. To reduce memory use,
 * words with a very low occurrence count in the index can be filtered out. The
 * other parameter switches the term-search behaviour: if it is true, only the
 * terms in the skeleton are searched; if it is false, terms that are not in the
 * cache are searched as well (but not cached).
 *
 * @param minTermOccurrenceInCache A higher value results in a smaller cache.
 * @param justCachedTerms          Whether to fully exclude low-occurrence terms (those not in the cache).
 * @throws IOException If there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
    this.justCachedTerms = justCachedTerms;

    this.docsWithClassSize = countDocsWithClass();
    termCClassHitCache.clear();
    cclasses.clear();
    classTermFreq.clear();

    // build the cache for the word
    Map<String, Long> frequencyMap = new HashMap<>();
    for (String textFieldName : textFieldNames) {
        TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termText = term.utf8ToString();
            long frequency = termsEnum.docFreq();
            Long lastfreq = frequencyMap.get(termText);
            if (lastfreq != null)
                frequency += lastfreq;
            frequencyMap.put(termText, frequency);
        }
    }
    for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
        if (entry.getValue() > minTermOccurrenceInCache) {
            termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
        }
    }

    // fill the class list
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    while ((termsEnum.next()) != null) {
        cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
    }
    // fill the classTermFreq map
    for (BytesRef cclass : cclasses) {
        double avgNumberOfUniqueTerms = 0;
        for (String textFieldName : textFieldNames) {
            terms = MultiFields.getTerms(leafReader, textFieldName);
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
        }
        int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
        classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
    }
}
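
A hypothetical usage sketch for the method above. Constructing and training the classifier is out of scope here (the sibling classifier methods on this page throw "You must first call Classifier#train" when it has not been trained); the helper name and parameter values are illustrative:

// Hypothetical helper; assumes 'classifier' has already been trained.
private void warmCache(CachingNaiveBayesClassifier classifier) throws IOException {
    // Cache only terms that occur in more than 5 documents; with
    // justCachedTerms = true, terms outside the cache skeleton are skipped
    // entirely during classification instead of being searched uncached.
    classifier.reInitCache(5, true);
}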

From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java

License:Apache License

private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument)
        throws IOException {
    if (leafReader == null) {
        throw new IOException("You must first call Classifier#train");
    }
    List<ClassificationResult<BytesRef>> dataList = new ArrayList<>();

    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    BytesRef next;
    String[] tokenizedDoc = tokenizeDoc(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    int count = 0;
    while ((next = termsEnum.next()) != null) {
        double clVal = calculateLogPrior(next, docsWithClassSize)
                + calculateLogLikelihood(tokenizedDoc, next, docsWithClassSize);
        dataList.add(new ClassificationResult<>(BytesRef.deepCopyOf(next), clVal));
        count++;
    }
    // normalization: transform the values to a 0-1 range
    ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    if (!dataList.isEmpty()) {
        Collections.sort(dataList);
        // smax = a: the largest score, a negative number closest to 0
        double smax = dataList.get(0).getScore();

        double sumLog = 0;
        // log(sum(exp(x_n-a)))
        for (ClassificationResult<BytesRef> cr : dataList) {
            // getScore() - smax <= 0 (both are negative; smax has the smallest absolute value)
            sumLog += Math.exp(cr.getScore() - smax);
        }
        // loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n)))
        double loga = smax;
        loga += Math.log(sumLog);

        // 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum))
        for (ClassificationResult<BytesRef> cr : dataList) {
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(cr.getScore() - loga)));
        }
    }

    return returnList;
}

From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java

License:Apache License

private List<ClassificationResult<BytesRef>> assignRankClassNormalizedList(String inputDocument)
        throws IOException {
    if (leafReader == null) {
        throw new IOException("You must first call Classifier#train");
    }
    List<ClassificationResult<BytesRef>> dataList = new ArrayList<>();

    if (this.rankClassFieldNames == null || this.rankClassFieldNames.length == 0) {
        throw new IOException("rankClassField must be defined");
    }

    for (String rankClassName : rankClassFieldNames) {

    }

    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    BytesRef next;
    String[] tokenizedDoc = tokenizeDoc(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    while ((next = termsEnum.next()) != null) {
        double clVal = calculateLogPrior(next, docsWithClassSize)
                + calculateLogLikelihood(tokenizedDoc, next, docsWithClassSize);
        dataList.add(new ClassificationResult<>(BytesRef.deepCopyOf(next), clVal));
    }

    // normalization: transform the values to a 0-1 range
    ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    if (!dataList.isEmpty()) {
        Collections.sort(dataList);
        // smax = a: the largest score, a negative number closest to 0
        double smax = dataList.get(0).getScore();

        double sumLog = 0;
        // log(sum(exp(x_n-a)))
        for (ClassificationResult<BytesRef> cr : dataList) {
            // getScore() - smax <= 0 (both are negative; smax has the smallest absolute value)
            sumLog += Math.exp(cr.getScore() - smax);
        }
        // loga=a+log(sum(exp(x_n-a))) = log(sum(exp(x_n)))
        double loga = smax;
        loga += Math.log(sumLog);

        // 1/sum*x = exp(log(x))*1/sum = exp(log(x)-log(sum))
        for (ClassificationResult<BytesRef> cr : dataList) {
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(cr.getScore() - loga)));
        }
    }

    return returnList;
}

From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java

License:Apache License

private Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> getOptions(Term text,
        PrefixTermsEnum prefixTermsEnum, IndexReader indexReader, final int size) throws IOException {
    OptionQueue collectionQueue = new OptionQueue(size);
    BytesRef ref;
    while ((ref = prefixTermsEnum.next()) != null) {
        Term term = new Term(text.field(), BytesRef.deepCopyOf(ref));
        collectionQueue.insertWithOverflow(new Suggest.Suggestion.Entry.Option(
                new StringText(term.bytes().utf8ToString()), indexReader.docFreq(term)));
    }

    Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> entry = new Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>(
            new StringText(text.text()), 0, text.bytes().length);
    while (collectionQueue.size() > 0) {
        entry.addOption(collectionQueue.pop());
    }
    return entry;
}

From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java

License:Apache License

private List<Term> queryTerms(SuggestionSearchContext.SuggestionContext suggestion, CharsRefBuilder spare)
        throws IOException {
    final String field = suggestion.getField();
    final List<Term> ret = new ArrayList<Term>();
    SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field,
            new SuggestUtils.TokenConsumer() {
                @Override
                public void nextToken() {
                    ret.add(new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder()))));
                }
            }, spare);
    return ret;
}

From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java

License:Apache License

private boolean setSavedStartTerm(BytesRef startTerm) {
    savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm);
    return true;
}