Usage examples for org.apache.lucene.util.BytesRef#utf8ToString()
public String utf8ToString()
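A BytesRef holds a slice of raw UTF-8 bytes; utf8ToString() decodes the referenced range (offset through offset + length) into a Java String. A minimal standalone sketch of the round trip (class and variable names are illustrative, not taken from the sources below):

    import org.apache.lucene.util.BytesRef;

    public class Utf8ToStringExample {
        public static void main(String[] args) {
            // new BytesRef(CharSequence) encodes the text as UTF-8
            BytesRef ref = new BytesRef("caf\u00e9");
            // utf8ToString() decodes only the referenced byte slice back to a String
            String decoded = ref.utf8ToString();
            System.out.println(decoded.equals("caf\u00e9")); // true
        }
    }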
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static String SortableStr2long(BytesRef val) {
    // TODO: operate directly on BytesRef
    return SortableStr2long(val.utf8ToString());
}
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static float SortableStr2float(BytesRef val) {
    // TODO: operate directly on BytesRef
    return SortableStr2float(val.utf8ToString());
}
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static double SortableStr2double(BytesRef val) {
    // TODO: operate directly on BytesRef
    return SortableStr2double(val.utf8ToString());
}
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static int SortableStr2int(BytesRef sval, int offset, int len) {
    // TODO: operate directly on BytesRef
    return SortableStr2int(sval.utf8ToString(), offset, len);
}
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static long SortableStr2long(BytesRef sval, int offset, int len) {
    // TODO: operate directly on BytesRef
    return SortableStr2long(sval.utf8ToString(), offset, len);
}
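Each NumberUtils overload above simply decodes the BytesRef via utf8ToString() and delegates to the matching String-based parser. A hedged round-trip sketch, assuming long2sortableStr(long) exists as the paired encoder in the same Solr NumberUtils class (the BytesRef decoder overload shown earlier returns the decoded value as a String):

    import org.apache.lucene.util.BytesRef;
    import org.apache.solr.util.NumberUtils;

    // Assumption: long2sortableStr(long) is the encoder paired with these decoders.
    BytesRef ref = new BytesRef(NumberUtils.long2sortableStr(42L));
    String decoded = NumberUtils.SortableStr2long(ref); // "42", via utf8ToString() internally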
From source file:org.apache.tika.eval.tokens.LuceneTokenCounter.java
License:Apache License
void count(String field) throws IOException {
    long tokenCount = leafReader.getSumTotalTermFreq(field);
    if (tokenCount > Integer.MAX_VALUE) {
        throw new IllegalArgumentException("can't handle longs");
    }
    int tokenCountInt = (int) tokenCount;
    int uniqueTokenCount = 0;
    SummaryStatistics summStats = new SummaryStatistics();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        // if there were no terms
        fieldStats.put(field,
                new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, summStats));
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    while (bytesRef != null) {
        long termFreq = termsEnum.totalTermFreq();
        if (termFreq > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("Sorry can't handle longs yet");
        }
        int tf = (int) termFreq;
        // TODO: figure out how to avoid Stringifying this to get codepoint count
        String t = bytesRef.utf8ToString();
        int len = t.codePointCount(0, t.length());
        for (int i = 0; i < tf; i++) {
            summStats.addValue(len);
        }
        p = (double) tf / (double) tokenCount;
        ent += p * FastMath.log(base, p);
        if (queue.top() == null || queue.size() < topN || tf >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(t, tf));
        }
        uniqueTokenCount++;
        bytesRef = termsEnum.next();
    }
    if (tokenCountInt > 0) {
        ent = (-1.0d / (double) tokenCountInt) * ent;
    }
    fieldStats.put(field,
            new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, summStats));
}
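The TODO above notes that the term is stringified only to obtain its code point count. If just the length is needed, the decode can be skipped; a sketch assuming Lucene's UnicodeUtil.codePointCount(BytesRef), which counts code points directly on the UTF-8 bytes:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.UnicodeUtil;

    // Sketch: code point count straight from the UTF-8 bytes, no String allocation.
    // UnicodeUtil.codePointCount throws IllegalArgumentException on invalid UTF-8.
    static int codePointLength(BytesRef bytesRef) {
        return UnicodeUtil.codePointCount(bytesRef);
    }

In this particular method the String is still needed for the TokenIntPair, so at best the utf8ToString() call could be deferred into the branch that actually inserts into the queue.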
From source file:org.apache.tika.eval.tools.TopCommonTokenCounter.java
License:Apache License
private void execute(Path inputFile, Path commonTokensFile) throws Exception {
    Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
    AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
    try {
        Directory directory = FSDirectory.open(luceneDir);
        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
        Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        int maxLen = 1000000;
        int len = 0;
        try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
            List<Document> docs = new ArrayList<>();
            try (BufferedReader reader = getReader(inputFile)) {
                String line = reader.readLine();
                while (line != null) {
                    len += line.length();
                    Document document = new Document();
                    document.add(new TextField(FIELD, line, Field.Store.NO));
                    docs.add(document);
                    if (len > maxLen) {
                        writer.addDocuments(docs);
                        docs.clear();
                        len = 0;
                    }
                    line = reader.readLine();
                }
            }
            if (docs.size() > 0) {
                writer.addDocuments(docs);
            }
            writer.commit();
            writer.flush();
        }
        try (IndexReader reader = DirectoryReader.open(directory)) {
            LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
            Terms terms = wrappedReader.terms(FIELD);
            TermsEnum termsEnum = terms.iterator();
            BytesRef bytesRef = termsEnum.next();
            int docsWThisField = wrappedReader.getDocCount(FIELD);
            while (bytesRef != null) {
                int df = termsEnum.docFreq();
                long tf = termsEnum.totalTermFreq();
                if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                    bytesRef = termsEnum.next();
                    continue;
                }
                if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                    String t = bytesRef.utf8ToString();
                    if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                        queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                    }
                }
                bytesRef = termsEnum.next();
            }
        }
    } finally {
        FileUtils.deleteDirectory(luceneDir.toFile());
    }
    writeTopN(commonTokensFile, queue);
}
From source file:org.buzzinate.lezhi.query.LezhiTermsEnum.java
License:Apache License
protected AcceptStatus accept(BytesRef term) throws IOException {
    System.out.println(term.utf8ToString() + ", docfreq=" + docFreq());
    if (StringHelper.startsWith(term, prefixRef)) {
        return AcceptStatus.YES;
    } else {
        return AcceptStatus.END;
    }
}
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude.java
License:Apache License
private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUserFormattedValues,
        DocValueFormat format) {
    SortedSet<BytesRef> result = endUserFormattedValues;
    if (endUserFormattedValues != null) {
        if (format != DocValueFormat.RAW) {
            result = new TreeSet<>();
            for (BytesRef formattedVal : endUserFormattedValues) {
                result.add(format.parseBytesRef(formattedVal.utf8ToString()));
            }
        }
    }
    return result;
}
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude.java
License:Apache License
public LongFilter convertToLongFilter(DocValueFormat format) {
    if (isPartitionBased()) {
        return new PartitionedLongFilter();
    }
    int numValids = includeValues == null ? 0 : includeValues.size();
    int numInvalids = excludeValues == null ? 0 : excludeValues.size();
    SetBackedLongFilter result = new SetBackedLongFilter(numValids, numInvalids);
    if (includeValues != null) {
        for (BytesRef val : includeValues) {
            result.addAccept(format.parseLong(val.utf8ToString(), false, null));
        }
    }
    if (excludeValues != null) {
        for (BytesRef val : excludeValues) {
            result.addReject(format.parseLong(val.utf8ToString(), false, null));
        }
    }
    return result;
}