List of usage examples for the org.apache.lucene.util.BytesRef method utf8ToString()
public String utf8ToString()
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude.java
License:Apache License
/**
 * Builds a {@link LongFilter} from the configured include/exclude values, treating each
 * value as the textual form of a double.
 *
 * <p>When this instance is partition based a {@code PartitionedLongFilter} is returned.
 * Otherwise every include/exclude term is decoded from UTF-8, parsed with
 * {@link Double#parseDouble(String)} and registered in its sortable-long form.
 *
 * @return the filter built from the include and exclude values
 * @throws NumberFormatException if a term is not a parsable double
 */
public LongFilter convertToDoubleFilter() {
    if (isPartitionBased()) {
        return new PartitionedLongFilter();
    }

    // Size the backing sets up front; a missing list counts as empty.
    final int acceptCount = (includeValues != null) ? includeValues.size() : 0;
    final int rejectCount = (excludeValues != null) ? excludeValues.size() : 0;
    final SetBackedLongFilter filter = new SetBackedLongFilter(acceptCount, rejectCount);

    if (includeValues != null) {
        for (BytesRef accepted : includeValues) {
            final double parsed = Double.parseDouble(accepted.utf8ToString());
            filter.addAccept(NumericUtils.doubleToSortableLong(parsed));
        }
    }
    if (excludeValues != null) {
        for (BytesRef rejected : excludeValues) {
            final double parsed = Double.parseDouble(rejected.utf8ToString());
            filter.addReject(NumericUtils.doubleToSortableLong(parsed));
        }
    }
    return filter;
}
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude.java
License:Apache License
/**
 * Writes the include and exclude clauses of this object to the given builder.
 *
 * <p>For the include clause exactly one representation is emitted, in this order of
 * precedence: the original string of {@code include}, the array of {@code includeValues},
 * or the partition object (partition index and number of partitions). The exclude clause is
 * emitted as the original string of {@code exclude} or, failing that, the array of
 * {@code excludeValues}. Nothing is written for a clause that has none of these.
 *
 * @param builder the builder to write to
 * @param params  serialization parameters (not read by this method)
 * @return the same {@code builder}, for chaining
 * @throws IOException if the builder fails to write
 */
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    // --- include clause ---
    if (include != null) {
        builder.field(INCLUDE_FIELD.getPreferredName(), include.getOriginalString());
    } else if (includeValues != null) {
        builder.startArray(INCLUDE_FIELD.getPreferredName());
        for (BytesRef term : includeValues) {
            final String text = term.utf8ToString();
            builder.value(text);
        }
        builder.endArray();
    } else if (isPartitionBased()) {
        builder.startObject(INCLUDE_FIELD.getPreferredName());
        builder.field(PARTITION_FIELD.getPreferredName(), incZeroBasedPartition);
        builder.field(NUM_PARTITIONS_FIELD.getPreferredName(), incNumPartitions);
        builder.endObject();
    }

    // --- exclude clause ---
    if (exclude != null) {
        builder.field(EXCLUDE_FIELD.getPreferredName(), exclude.getOriginalString());
    } else if (excludeValues != null) {
        builder.startArray(EXCLUDE_FIELD.getPreferredName());
        for (BytesRef term : excludeValues) {
            final String text = term.utf8ToString();
            builder.value(text);
        }
        builder.endArray();
    }
    return builder;
}
From source file:org.codelibs.fess.helper.QueryHelper.java
License:Apache License
/**
 * Converts a Lucene {@link TermRangeQuery} into a query builder.
 *
 * <p>If the resolved field is a search field, a range query on that field is built from the
 * lower/upper terms (honouring inclusive/exclusive bounds) and the given boost. Otherwise the
 * textual form of the whole range query is logged against the default field, registered as a
 * highlighted query, and turned into a boosted match-phrase query via
 * {@code buildDefaultQueryBuilder}.
 *
 * @param context        the query context that receives field logs and highlighted queries
 * @param termRangeQuery the range query to convert
 * @param boost          the boost applied to the range query
 * @return the resulting query builder
 */
protected QueryBuilder convertTermRangeQuery(final QueryContext context, final TermRangeQuery termRangeQuery,
        final float boost) {
    final String field = getSearchField(context, termRangeQuery.getField());

    // Not a search field: fall back to a phrase match on the query's string form.
    if (!isSearchField(field)) {
        final String origQuery = termRangeQuery.toString();
        context.addFieldLog(Constants.DEFAULT_FIELD, origQuery);
        context.addHighlightedQuery(origQuery);
        return buildDefaultQueryBuilder((f, b) -> QueryBuilders.matchPhraseQuery(f, origQuery).boost(b));
    }

    context.addFieldLog(field, termRangeQuery.toString(field));
    final RangeQueryBuilder rangeQuery = QueryBuilders.rangeQuery(field);

    final BytesRef lowerTerm = termRangeQuery.getLowerTerm();
    if (lowerTerm != null) {
        final String lowerText = lowerTerm.utf8ToString();
        if (termRangeQuery.includesLower()) {
            rangeQuery.gte(lowerText);
        } else {
            rangeQuery.gt(lowerText);
        }
    }

    final BytesRef upperTerm = termRangeQuery.getUpperTerm();
    if (upperTerm != null) {
        final String upperText = upperTerm.utf8ToString();
        if (termRangeQuery.includesUpper()) {
            rangeQuery.lte(upperText);
        } else {
            rangeQuery.lt(upperText);
        }
    }

    rangeQuery.boost(boost);
    return rangeQuery;
}
From source file:org.deshang.content.indexing.scheduling.ContentIndexingTask.java
License:Apache License
private void calcPersonTermDocFreqInfo(TermDocFreqStatistics statistics, IndexReader reader) throws IOException { long docNum = reader.numDocs(); LOGGER.debug("Total number of documents is " + docNum + "."); List<AtomicReaderContext> atomicCtxList = reader.leaves(); for (AtomicReaderContext ctx : atomicCtxList) { FilterAtomicReader far = new FilterAtomicReader(ctx.reader()); for (String field : far.fields()) { Terms terms = far.fields().terms(field); LOGGER.debug("Reader [" + far.toString() + "] totally has " + terms.size() + " term(s)."); TermsEnum termsEnum = terms.iterator(null); BytesRef term = null; while ((term = termsEnum.next()) != null) { String termUtf8String = term.utf8ToString(); int existPersonDocFreq = statistics.getTermPersonDocFreq(termUtf8String); int personDocFreq = far.docFreq(new Term(field, term)); double personDocFreqPercent = ((double) personDocFreq) / docNum; if (existPersonDocFreq < 0) { personDocFreq += statistics.getTermPersonDocFreq(termUtf8String); personDocFreqPercent += statistics.getTermPersonDocFreqPercent(termUtf8String); }//ww w . j a v a 2s . c o m statistics.putTermPersonDocFreqInfo(termUtf8String, personDocFreq, personDocFreqPercent); } } far.close(); } }
From source file:org.dkpro.tc.features.ngram.LuceneNgramDocumentTest.java
License:Apache License
private Set<String> getTokensFromIndex(File luceneFolder) throws Exception { Set<String> token = new HashSet<>(); @SuppressWarnings("deprecation") IndexReader idxReader = IndexReader.open(FSDirectory.open(luceneFolder)); Fields fields = MultiFields.getFields(idxReader); for (String field : fields) { if (field.equals("id")) { continue; }//from w w w . ja v a 2 s . c om Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(null); BytesRef text; while ((text = termsEnum.next()) != null) { token.add(text.utf8ToString()); } } return token; }
From source file:org.dkpro.tc.features.ngram.meta.LuceneCharacterNGramMetaCollectorTest.java
License:Apache License
@SuppressWarnings("unused") @Test/*from w w w . ja v a2s.c om*/ public void luceneCharacterNgramMetaCollectorTest() throws Exception { File tmpDir = folder.newFolder(); CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "charMetaCollectorTest.txt"); AnalysisEngineDescription segmenter = AnalysisEngineFactory .createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT); AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription( LuceneCharacterNGramMetaCollector.class, LuceneCharacterNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneCharacterNGram.PARAM_NGRAM_MIN_N, 2, LuceneCharacterNGramMetaCollector.PARAM_TARGET_LOCATION, tmpDir, LuceneCharacterNGram.PARAM_SOURCE_LOCATION, tmpDir); for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) { // System.out.println(jcas.getDocumentText().length()); } Set<String> freq2terms = new HashSet<>(); int i = 0; IndexReader index; try { index = DirectoryReader.open(FSDirectory.open(tmpDir)); Fields fields = MultiFields.getFields(index); if (fields != null) { Terms terms = fields.terms(LuceneCharacterNGramMetaCollector.LUCENE_CHAR_NGRAM_FIELD + "123"); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); BytesRef text = null; while ((text = termsEnum.next()) != null) { if (termsEnum.totalTermFreq() == 2) { freq2terms.add(text.utf8ToString()); } // System.out.println(text.utf8ToString() + " " + termsEnum.totalTermFreq()); i++; } } } } catch (Exception e) { throw new ResourceInitializationException(e); } assertEquals(10, i); assertEquals(1, freq2terms.size()); }
From source file:org.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java
License:Apache License
@Test public void luceneNgramMetaCollectorTest() throws Exception { File tmpDir = folder.newFolder(); CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "text*.txt"); AnalysisEngineDescription segmenter = AnalysisEngineFactory .createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT); AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription( LuceneNGramMetaCollector.class, LuceneNGramMetaCollector.PARAM_TARGET_LOCATION, tmpDir, LuceneNGramMetaCollector.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME); for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) { System.out.println(jcas.getDocumentText().length()); }// w w w. j av a2 s . c om int i = 0; IndexReader index; try { index = DirectoryReader.open(FSDirectory.open(tmpDir)); Fields fields = MultiFields.getFields(index); if (fields != null) { Terms terms = fields.terms(LuceneNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); BytesRef text = null; while ((text = termsEnum.next()) != null) { if (text.utf8ToString().equals("this")) { assertEquals(2, termsEnum.docFreq()); assertEquals(3, termsEnum.totalTermFreq()); } i++; } } } } catch (Exception e) { throw new ResourceInitializationException(e); } assertEquals(35, i); }
From source file:org.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramCPMetaCollectorTest.java
License:Apache License
@Test public void combinedNgramPairMetaCollectorTest() throws Exception { File tmpDir = folder.newFolder(); CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt"); AnalysisEngineDescription segmenter = AnalysisEngineFactory .createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR); AggregateBuilder builder = new AggregateBuilder(); builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE); builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE); builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO); builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO); AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription( LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneNGramCPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);//from w w w . j a va 2 s. 
c o m // test fails if for-loop removed for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) { // System.out.println(jcas.getDocumentText().length()); } int i = 0; IndexReader index; try { index = DirectoryReader.open(FSDirectory.open(tmpDir)); Fields fields = MultiFields.getFields(index); if (fields != null) { Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); BytesRef text = null; while ((text = termsEnum.next()) != null) { // System.out.println(text.utf8ToString() + " - " + // termsEnum.totalTermFreq()); // System.out.println(termsEnum.docFreq()); // if there were multiple instances of the same ngram, // then this would be relevant if (text.utf8ToString().equals("mice_ANDcats_.")) { assertEquals(1, termsEnum.docFreq()); assertEquals(1, termsEnum.totalTermFreq()); } i++; } } } } catch (Exception e) { throw new ResourceInitializationException(e); } assertEquals(65, i); }
From source file:org.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollectorTest.java
License:Apache License
@Test public void lucenePairNgramMetaCollectorTest() throws Exception { File tmpDir = folder.newFolder(); CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt"); AnalysisEngineDescription segmenter = AnalysisEngineFactory .createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR); AggregateBuilder builder = new AggregateBuilder(); builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE); builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE); builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO); builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO); AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription( LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneNGramPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);// w w w. jav a 2 s . 
com // test fails if for-loop removed for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) { // System.out.println(jcas.getDocumentText().length()); } int i = 0; IndexReader index; try { index = DirectoryReader.open(FSDirectory.open(tmpDir)); Fields fields = MultiFields.getFields(index); if (fields != null) { Terms terms = fields.terms(LuceneNGram.LUCENE_NGRAM_FIELD); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); BytesRef text = null; while ((text = termsEnum.next()) != null) { // System.out.println(text.utf8ToString() + " - " + // termsEnum.totalTermFreq()); // System.out.println(termsEnum.docFreq()); if (text.utf8ToString().equals("this")) { assertEquals(2, termsEnum.docFreq()); assertEquals(3, termsEnum.totalTermFreq()); } i++; } } } } catch (Exception e) { throw new ResourceInitializationException(e); } assertEquals(16, i); }
From source file:org.elasticsearch.action.termlist.TransportTermlistAction.java
License:Apache License
/**
 * Collects the distinct terms of one shard.
 *
 * <p>Fields whose name starts with {@code '_'} are skipped. If the request names a field,
 * only that field is read; otherwise all remaining fields are read. The whole operation runs
 * while holding {@code termlistMutex}.
 *
 * <p>Changes from the previous version: the acquired searcher is now released in a
 * {@code finally} block (it was previously never released), the per-term
 * {@code System.out.println} debug output was removed, and an empty field name no longer
 * triggers a {@link StringIndexOutOfBoundsException}.
 *
 * @param request identifies the index, the shard and optionally the field to read
 * @return the shard response carrying the collected term strings
 * @throws ElasticSearchException if reading the terms fails with an I/O error
 */
@Override
protected ShardTermlistResponse shardOperation(ShardTermlistRequest request) throws ElasticSearchException {
    synchronized (termlistMutex) {
        InternalIndexShard indexShard = (InternalIndexShard) indicesService.indexServiceSafe(request.index())
                .shardSafe(request.shardId());
        // NOTE(review): the result of this call is unused; kept in case it is relied on
        // for a side effect - confirm and remove if not.
        indexShard.store().directory();

        Engine.Searcher searcher = indexShard.searcher();
        try {
            Set<String> set = new CompactHashSet();
            Fields fields = MultiFields.getFields(searcher.reader());
            if (fields != null) {
                for (String field : fields) {
                    // Skip fields whose name starts with an underscore.
                    if (field.startsWith("_")) {
                        continue;
                    }
                    if (request.getField() == null || field.equals(request.getField())) {
                        Terms terms = fields.terms(field);
                        if (terms != null) {
                            TermsEnum termsEnum = terms.iterator(null);
                            BytesRef text;
                            while ((text = termsEnum.next()) != null) {
                                set.add(text.utf8ToString());
                            }
                        }
                    }
                }
            }
            return new ShardTermlistResponse(request.index(), request.shardId(), set);
        } catch (IOException ex) {
            throw new ElasticSearchException(ex.getMessage(), ex);
        } finally {
            // Give the searcher back to the engine; otherwise the reference is leaked.
            searcher.release();
        }
    }
}