Example usage for org.apache.lucene.util BytesRef utf8ToString

Introduction

On this page you can find example usages of org.apache.lucene.util.BytesRef.utf8ToString().

Prototype

public String utf8ToString()

Source Link

Document

Interprets the stored bytes as UTF-8 bytes and returns the resulting string.

Usage

From source file:org.codelibs.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude.java

License:Apache License

/**
 * Builds a {@link LongFilter} from the configured include/exclude terms.
 *
 * <p>When this instance is partition based, a {@code PartitionedLongFilter} is returned.
 * Otherwise every include/exclude term is decoded as a UTF-8 string, parsed as a
 * {@code double}, converted with {@code NumericUtils.doubleToSortableLong} and registered
 * as an accepted or rejected value of a {@code SetBackedLongFilter}.
 *
 * @return the filter described above
 * @throws NumberFormatException if a term cannot be parsed by {@link Double#parseDouble(String)}
 */
public LongFilter convertToDoubleFilter() {
    if (isPartitionBased()) {
        return new PartitionedLongFilter();
    }

    final boolean hasIncludes = includeValues != null;
    final boolean hasExcludes = excludeValues != null;

    // Sizes are passed to the filter constructor; a missing set counts as zero.
    int acceptCount = 0;
    if (hasIncludes) {
        acceptCount = includeValues.size();
    }
    int rejectCount = 0;
    if (hasExcludes) {
        rejectCount = excludeValues.size();
    }

    final SetBackedLongFilter filter = new SetBackedLongFilter(acceptCount, rejectCount);
    if (hasIncludes) {
        for (BytesRef term : includeValues) {
            final double parsed = Double.parseDouble(term.utf8ToString());
            filter.addAccept(NumericUtils.doubleToSortableLong(parsed));
        }
    }
    if (hasExcludes) {
        for (BytesRef term : excludeValues) {
            final double parsed = Double.parseDouble(term.utf8ToString());
            filter.addReject(NumericUtils.doubleToSortableLong(parsed));
        }
    }
    return filter;
}

From source file:org.codelibs.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude.java

License:Apache License

/**
 * Writes the include and exclude settings to the given builder.
 *
 * <p>For the include side exactly one of three forms is written, checked in this order:
 * the original string of {@code include}, an array of the {@code includeValues} terms as
 * strings, or (when partition based) an object holding the partition number and the number
 * of partitions. For the exclude side either the original string of {@code exclude} or an
 * array of the {@code excludeValues} terms is written. Nothing is written for a side that
 * has none of these set.
 *
 * @param builder the builder to write to
 * @param params  not read by this method
 * @return the same {@code builder}
 * @throws IOException declared for the builder calls made here
 */
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    // --- include side ---
    if (include != null) {
        final String includeKey = INCLUDE_FIELD.getPreferredName();
        builder.field(includeKey, include.getOriginalString());
    } else if (includeValues != null) {
        final String includeKey = INCLUDE_FIELD.getPreferredName();
        builder.startArray(includeKey);
        for (BytesRef term : includeValues) {
            final String text = term.utf8ToString();
            builder.value(text);
        }
        builder.endArray();
    } else if (isPartitionBased()) {
        final String includeKey = INCLUDE_FIELD.getPreferredName();
        builder.startObject(includeKey);
        builder.field(PARTITION_FIELD.getPreferredName(), incZeroBasedPartition);
        builder.field(NUM_PARTITIONS_FIELD.getPreferredName(), incNumPartitions);
        builder.endObject();
    }

    // --- exclude side ---
    if (exclude != null) {
        final String excludeKey = EXCLUDE_FIELD.getPreferredName();
        builder.field(excludeKey, exclude.getOriginalString());
    } else if (excludeValues != null) {
        final String excludeKey = EXCLUDE_FIELD.getPreferredName();
        builder.startArray(excludeKey);
        for (BytesRef term : excludeValues) {
            final String text = term.utf8ToString();
            builder.value(text);
        }
        builder.endArray();
    }
    return builder;
}

From source file:org.codelibs.fess.helper.QueryHelper.java

License:Apache License

/**
 * Converts a Lucene {@code TermRangeQuery} into a query builder.
 *
 * <p>If the resolved field is not a search field, the query's string form is logged under
 * {@code Constants.DEFAULT_FIELD}, added as a highlighted query, and a default query builder
 * based on a match-phrase query is returned. Otherwise a range query on the field is built:
 * the lower and upper terms (when present) are decoded as UTF-8 strings and applied as
 * inclusive or exclusive bounds, and the given boost is set.
 *
 * @param context        query context used for field resolution and logging
 * @param termRangeQuery the range query to convert
 * @param boost          boost applied to the resulting range query
 * @return the converted query builder
 */
protected QueryBuilder convertTermRangeQuery(final QueryContext context, final TermRangeQuery termRangeQuery,
        final float boost) {
    final String field = getSearchField(context, termRangeQuery.getField());

    if (!isSearchField(field)) {
        // Not searchable as a field: fall back to a phrase match on the query text.
        final String origQuery = termRangeQuery.toString();
        context.addFieldLog(Constants.DEFAULT_FIELD, origQuery);
        context.addHighlightedQuery(origQuery);
        return buildDefaultQueryBuilder((f, b) -> QueryBuilders.matchPhraseQuery(f, origQuery).boost(b));
    }

    context.addFieldLog(field, termRangeQuery.toString(field));
    final RangeQueryBuilder rangeQuery = QueryBuilders.rangeQuery(field);

    final BytesRef lowerTerm = termRangeQuery.getLowerTerm();
    if (lowerTerm != null) {
        final boolean inclusive = termRangeQuery.includesLower();
        final String lowerText = lowerTerm.utf8ToString();
        if (inclusive) {
            rangeQuery.gte(lowerText);
        } else {
            rangeQuery.gt(lowerText);
        }
    }

    final BytesRef upperTerm = termRangeQuery.getUpperTerm();
    if (upperTerm != null) {
        final boolean inclusive = termRangeQuery.includesUpper();
        final String upperText = upperTerm.utf8ToString();
        if (inclusive) {
            rangeQuery.lte(upperText);
        } else {
            rangeQuery.lt(upperText);
        }
    }

    rangeQuery.boost(boost);
    return rangeQuery;
}

From source file:org.deshang.content.indexing.scheduling.ContentIndexingTask.java

License:Apache License

/**
 * Walks every term of every field in each leaf of {@code reader} and records, per term,
 * its document frequency and that frequency divided by {@code reader.numDocs()} in
 * {@code statistics}.
 *
 * <p>Each leaf reader is wrapped in a {@code FilterAtomicReader} which is closed once the
 * leaf has been processed, including when processing fails with an exception.
 *
 * @param statistics receives the per-term frequency information
 * @param reader     the index reader whose leaves are scanned
 * @throws IOException if reading the index fails
 */
private void calcPersonTermDocFreqInfo(TermDocFreqStatistics statistics, IndexReader reader)
        throws IOException {
    long docNum = reader.numDocs();
    LOGGER.debug("Total number of documents is " + docNum + ".");
    for (AtomicReaderContext ctx : reader.leaves()) {
        FilterAtomicReader far = new FilterAtomicReader(ctx.reader());
        try {
            for (String field : far.fields()) {
                Terms terms = far.fields().terms(field);
                if (terms == null) {
                    // No terms available for this field; nothing to record.
                    continue;
                }
                LOGGER.debug("Reader [" + far.toString() + "] totally has " + terms.size() + " term(s).");
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                    String termUtf8String = term.utf8ToString();
                    int existPersonDocFreq = statistics.getTermPersonDocFreq(termUtf8String);
                    int personDocFreq = far.docFreq(new Term(field, term));
                    double personDocFreqPercent = ((double) personDocFreq) / docNum;
                    // NOTE(review): values already stored for the term are added only when the
                    // stored frequency is negative. If a negative value is the "no entry yet"
                    // marker this condition is probably inverted - confirm against
                    // TermDocFreqStatistics before changing it.
                    if (existPersonDocFreq < 0) {
                        personDocFreq += statistics.getTermPersonDocFreq(termUtf8String);
                        personDocFreqPercent += statistics.getTermPersonDocFreqPercent(termUtf8String);
                    }
                    statistics.putTermPersonDocFreqInfo(termUtf8String, personDocFreq, personDocFreqPercent);
                }
            }
        } finally {
            // Close the wrapper even when term enumeration throws, so it is not leaked.
            far.close();
        }
    }
}

From source file:org.dkpro.tc.features.ngram.LuceneNgramDocumentTest.java

License:Apache License

/**
 * Collects the UTF-8 string form of every term in the Lucene index stored in
 * {@code luceneFolder}, skipping the field named {@code "id"}.
 *
 * <p>The index reader is closed before returning, also when reading fails. An index without
 * any fields, or a field without terms, contributes nothing to the result.
 *
 * @param luceneFolder directory containing the Lucene index
 * @return the set of term strings found in the index (possibly empty)
 * @throws Exception if the index cannot be opened or read
 */
private Set<String> getTokensFromIndex(File luceneFolder) throws Exception {
    Set<String> token = new HashSet<>();
    @SuppressWarnings("deprecation")
    IndexReader idxReader = IndexReader.open(FSDirectory.open(luceneFolder));
    try {
        Fields fields = MultiFields.getFields(idxReader);
        if (fields != null) {
            for (String field : fields) {
                if (field.equals("id")) {
                    continue;
                }
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    token.add(text.utf8ToString());
                }
            }
        }
    } finally {
        // The reader was previously never closed.
        idxReader.close();
    }
    return token;
}

From source file:org.dkpro.tc.features.ngram.meta.LuceneCharacterNGramMetaCollectorTest.java

License:Apache License

/**
 * Runs the character n-gram meta collector on {@code charMetaCollectorTest.txt} and checks
 * the index read back from the temporary folder that was passed as the collector's target
 * location: the n-gram field must hold 10 terms, exactly one of which has a total term
 * frequency of 2.
 *
 * @throws Exception if the pipeline or the index access fails
 */
@SuppressWarnings("unused")
@Test
public void luceneCharacterNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "charMetaCollectorTest.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneCharacterNGramMetaCollector.class, LuceneCharacterNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            LuceneCharacterNGram.PARAM_NGRAM_MIN_N, 2, LuceneCharacterNGramMetaCollector.PARAM_TARGET_LOCATION,
            tmpDir, LuceneCharacterNGram.PARAM_SOURCE_LOCATION, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        // Intentionally empty: presumably iterating is what runs the pipeline - confirm.
    }

    Set<String> freq2terms = new HashSet<>();

    int i = 0;
    // try-with-resources so the index reader is closed; it was previously leaked.
    try (IndexReader index = DirectoryReader.open(FSDirectory.open(tmpDir))) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneCharacterNGramMetaCollector.LUCENE_CHAR_NGRAM_FIELD + "123");
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    if (termsEnum.totalTermFreq() == 2) {
                        freq2terms.add(text.utf8ToString());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(10, i);
    assertEquals(1, freq2terms.size());
}

From source file:org.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java

License:Apache License

/**
 * Runs the n-gram meta collector on the {@code text*.txt} test files and checks the index
 * read back from the temporary folder that was passed as the collector's target location:
 * the n-gram field must hold 35 terms, and the term {@code "this"} must have a document
 * frequency of 2 and a total term frequency of 3.
 *
 * @throws Exception if the pipeline or the index access fails
 */
@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramMetaCollector.PARAM_TARGET_LOCATION, tmpDir,
            LuceneNGramMetaCollector.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME);

    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    // try-with-resources so the index reader is closed; it was previously leaked.
    try (IndexReader index = DirectoryReader.open(FSDirectory.open(tmpDir))) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {

                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }

                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
}

From source file:org.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramCPMetaCollectorTest.java

License:Apache License

/**
 * Runs the combined pair n-gram meta collector on {@code textpairs.txt} and checks the
 * index read back from the temporary folder that was passed as the collector's target
 * location: the combo field must hold 65 terms, and the term {@code "mice_ANDcats_."} must
 * have a document frequency of 1 and a total term frequency of 1.
 *
 * @throws Exception if the pipeline or the index access fails
 */
@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);

    // Segmenter and mode annotator are registered once per view of the pair.
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            LuceneNGramCPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION,
            tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // intentionally empty
    }

    int i = 0;
    // try-with-resources so the index reader is closed; it was previously leaked.
    try (IndexReader index = DirectoryReader.open(FSDirectory.open(tmpDir))) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);

                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    // if there were multiple instances of the same ngram,
                    // then this would be relevant
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(65, i);
}

From source file:org.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollectorTest.java

License:Apache License

/**
 * Runs the pair n-gram meta collector on {@code textpairs.txt} and checks the index read
 * back from the temporary folder that was passed as the collector's target location: the
 * n-gram field must hold 16 terms, and the term {@code "this"} must have a document
 * frequency of 2 and a total term frequency of 3.
 *
 * @throws Exception if the pipeline or the index access fails
 */
@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);

    // Segmenter and mode annotator are registered once per view of the pair.
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            LuceneNGramPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION,
            tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // intentionally empty
    }

    int i = 0;
    // try-with-resources so the index reader is closed; it was previously leaked.
    try (IndexReader index = DirectoryReader.open(FSDirectory.open(tmpDir))) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGram.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);

                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }

                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(16, i);
}

From source file:org.elasticsearch.action.termlist.TransportTermlistAction.java

License:Apache License

/**
 * Collects the terms of one shard into a set of strings.
 *
 * <p>Runs under {@code termlistMutex}. Fields whose name starts with {@code '_'} are
 * skipped. When the request names a field, only that field is scanned; otherwise all
 * remaining fields are scanned. Every term is added in its UTF-8 string form.
 *
 * @param request identifies the index, the shard and optionally the field to list
 * @return a response carrying the index name, shard id and the collected terms
 * @throws ElasticSearchException wrapping any {@link IOException} raised while reading terms
 */
@Override
protected ShardTermlistResponse shardOperation(ShardTermlistRequest request) throws ElasticSearchException {
    synchronized (termlistMutex) {
        InternalIndexShard indexShard = (InternalIndexShard) indicesService.indexServiceSafe(request.index())
                .shardSafe(request.shardId());
        // NOTE(review): the returned directory is discarded; the call is kept in case it is
        // relied on for a side effect - confirm and remove if not.
        indexShard.store().directory();
        // NOTE(review): the searcher acquired here is never released/closed in this method;
        // verify against the Engine.Searcher API in use whether it must be released in a
        // finally block.
        Engine.Searcher searcher = indexShard.searcher();
        try {
            Set<String> set = new CompactHashSet();

            Fields fields = MultiFields.getFields(searcher.reader());
            if (fields != null) {
                for (Iterator<String> it = fields.iterator(); it.hasNext();) {
                    String field = it.next();
                    // startsWith instead of charAt(0): an empty field name must not throw.
                    if (field.startsWith("_")) {
                        continue;
                    }
                    if (request.getField() == null || field.equals(request.getField())) {
                        Terms terms = fields.terms(field);
                        if (terms != null) {
                            TermsEnum termsEnum = terms.iterator(null);
                            BytesRef text;
                            while ((text = termsEnum.next()) != null) {
                                // Per-term stdout debug output was removed from this loop.
                                set.add(text.utf8ToString());
                            }
                        }
                    }
                }
            }
            return new ShardTermlistResponse(request.index(), request.shardId(), set);
        } catch (IOException ex) {
            throw new ElasticSearchException(ex.getMessage(), ex);
        }
    }
}