List of usage examples for org.apache.lucene.index.Terms#getSumDocFreq
public abstract long getSumDocFreq() throws IOException;
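getSumDocFreq() reports a field-level statistic: the sum of TermsEnum.docFreq() over every term in the field, i.e. the total number of term/document pairs, or -1 if the codec does not make the statistic available. A recurring pattern in the examples below is dividing it by Terms.getDocCount() to get the average number of unique terms per document. Here is a minimal sketch of that pattern; it assumes a Lucene 5.x/6.x index (matching the MultiFields API used in the examples below), and the index path and the "body" field name are placeholders:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.FSDirectory;

public class SumDocFreqExample {
  public static void main(String[] args) throws Exception {
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
      Terms terms = MultiFields.getTerms(reader, "body"); // null if the field does not exist
      if (terms == null) {
        return;
      }
      long sumDocFreq = terms.getSumDocFreq(); // total term/doc pairs, or -1 if unavailable
      int docCount = terms.getDocCount();      // docs with at least one term in the field, or -1
      if (sumDocFreq != -1 && docCount > 0) {
        System.out.println("avg unique terms per doc: " + sumDocFreq / (double) docCount);
      }
    }
  }
}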
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/**
 * Returns the average number of unique terms per document across the text fields,
 * multiplied by the number of docs belonging to the input class.
 *
 * @param term the term representing the class
 * @return the average number of unique terms per doc times the number of docs in the class
 * @throws IOException if a low-level I/O problem happens
 */
private double getTextTermFreqForClass(Term term) throws IOException {
  double avgNumberOfUniqueTerms = 0;
  for (String textFieldName : textFieldNames) {
    Terms terms = MultiFields.getTerms(leafReader, textFieldName);
    long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
    avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
  }
  int docsWithC = leafReader.docFreq(term);
  return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
}
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Returns the average number of unique terms per document in the given field,
 * multiplied by the number of docs belonging to the input class.
 *
 * @param term      the class term
 * @param fieldName the text field to inspect
 * @return the average number of unique terms per doc times the number of docs in the class
 * @throws java.io.IOException if there is a low-level I/O error
 */
private double getTextTermFreqForClass(Term term, String fieldName) throws IOException {
  Terms terms = MultiFields.getTerms(leafReader, fieldName);
  long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
  double avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
  int docsWithC = leafReader.docFreq(term);
  return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in the field per doc * # docs with c
}
From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java
License:Apache License
/**
 * Builds the skeleton of the cache. The cache stores word occurrences in memory
 * once they have been looked up. Used properly, this can yield a 2-100x speedup,
 * but it can also consume a lot of memory. To lower memory consumption, words
 * with very low occurrence in the index can be filtered out. The second
 * parameter switches the term-lookup mode: if true, only terms present in the
 * skeleton are searched; if false, terms missing from the cache are searched
 * too (but not cached).
 *
 * @param minTermOccurrenceInCache higher values shrink the cache by excluding rare terms
 * @param justCachedTerms          whether to fully exclude low-occurrence terms from lookups
 * @throws IOException if there is a low-level I/O error
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
  this.justCachedTerms = justCachedTerms;
  this.docsWithClassSize = countDocsWithClass();
  termCClassHitCache.clear();
  cclasses.clear();
  classTermFreq.clear();
  // build the cache for the words
  Map<String, Long> frequencyMap = new HashMap<>();
  for (String textFieldName : textFieldNames) {
    TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
    while (termsEnum.next() != null) {
      BytesRef term = termsEnum.term();
      String termText = term.utf8ToString();
      long frequency = termsEnum.docFreq();
      Long lastfreq = frequencyMap.get(termText);
      if (lastfreq != null) {
        frequency += lastfreq;
      }
      frequencyMap.put(termText, frequency);
    }
  }
  for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
    if (entry.getValue() > minTermOccurrenceInCache) {
      termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
    }
  }
  // fill the class list
  Terms terms = MultiFields.getTerms(leafReader, classFieldName);
  TermsEnum termsEnum = terms.iterator();
  while ((termsEnum.next()) != null) {
    cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
  }
  // fill the classTermFreq map
  for (BytesRef cclass : cclasses) {
    double avgNumberOfUniqueTerms = 0;
    for (String textFieldName : textFieldNames) {
      terms = MultiFields.getTerms(leafReader, textFieldName);
      long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
      avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
    }
    int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
    classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
  }
}
From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java
License:Apache License
private double getTextTermFreqForClass(BytesRef c) throws IOException {
  double avgNumberOfUniqueTerms = 0;
  for (String textFieldName : textFieldNames) {
    Terms terms = MultiFields.getTerms(leafReader, textFieldName);
    long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
    avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
  }
  int docsWithC = leafReader.docFreq(new Term(classFieldName, c));
  return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text fields per doc * # docs with c
}
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
@Override
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();

  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()
  iwc.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      PostingsFormat p = getCodec().postingsFormat();
      if (p instanceof PerFieldPostingsFormat) {
        p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      if (p instanceof RocanaPerFieldPostingsFormat) {
        p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      final PostingsFormat defaultPostingsFormat = p;

      final Thread mainThread = Thread.currentThread();

      if (field.equals("body")) {
        // A PF that counts up some stats and then in
        // the end we verify the stats match what the
        // final IndexReader says, just to exercise the
        // new freedom of iterating the postings more
        // than once at flush/merge:
        return new PostingsFormat(defaultPostingsFormat.getName()) {
          @Override
          public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {
            final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

            return new FieldsConsumer() {
              @Override
              public void write(Fields fields) throws IOException {
                fieldsConsumer.write(fields);

                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                // We only use one thread for flushing
                // in this test:
                assert isMerge || Thread.currentThread() == mainThread;

                // We iterate the provided TermsEnum
                // twice, so we exercise this new freedom
                // with the inverted API; if
                // addOnSecondPass is true, we add up
                // term stats on the 2nd iteration:
                boolean addOnSecondPass = random().nextBoolean();

                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                // Gather our own stats:
                Terms terms = fields.terms("body");
                assert terms != null;

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docs = null;
                while (termsEnum.next() != null) {
                  BytesRef term = termsEnum.term();
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }

                  String termString = term.utf8ToString();

                  // During merge we should only see terms
                  // we had already seen during a
                  // previous flush:
                  assertTrue(isMerge == false || termFreqs.containsKey(termString));
                  if (isMerge == false) {
                    if (addOnSecondPass == false) {
                      TermFreqs tf = termFreqs.get(termString);
                      if (tf == null) {
                        tf = new TermFreqs();
                        termFreqs.put(termString, tf);
                      }
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    } else if (termFreqs.containsKey(termString) == false) {
                      // Add placeholder (2nd pass will
                      // set its counts):
                      termFreqs.put(termString, new TermFreqs());
                    }
                  }
                }

                // Also test seeking the TermsEnum:
                for (String term : termFreqs.keySet()) {
                  if (termsEnum.seekExact(new BytesRef(term))) {
                    // TODO: also sometimes ask for payloads/offsets?
                    boolean noPositions = random().nextBoolean();
                    if (noPositions) {
                      docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                    } else {
                      docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                    }

                    int docFreq = 0;
                    long totalTermFreq = 0;
                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                      docFreq++;
                      totalTermFreq += docs.freq();
                      int limit = TestUtil.nextInt(random(), 1, docs.freq());
                      if (!noPositions) {
                        for (int i = 0; i < limit; i++) {
                          docs.nextPosition();
                        }
                      }
                    }

                    if (isMerge == false && addOnSecondPass) {
                      TermFreqs tf = termFreqs.get(term);
                      assert tf != null;
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    }

                    //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                    assertTrue(docFreq <= termFreqs.get(term).docFreq);
                    assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                  }
                }

                // Also test seekCeil
                for (int iter = 0; iter < 10; iter++) {
                  BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                  SeekStatus status = termsEnum.seekCeil(term);
                  if (status == SeekStatus.NOT_FOUND) {
                    assertTrue(term.compareTo(termsEnum.term()) < 0);
                  }
                }
              }

              @Override
              public void close() throws IOException {
                fieldsConsumer.close();
              }
            };
          }

          @Override
          public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
            return defaultPostingsFormat.fieldsProducer(state);
          }
        };
      } else {
        return defaultPostingsFormat;
      }
    }
  });

  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    w.addDocument(doc);
    bytesIndexed += RamUsageTester.sizeOf(doc);
  }
  IndexReader r = w.getReader();
  w.close();

  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  assertEquals(termFreqs.size(), termCount);

  r.close();
  dir.close();
}
From source file:com.rocana.lucene.codec.v1.TestBlockPostingsFormat3.java
License:Apache License
/**
 * checks collection-level statistics on Terms
 */
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
  if (leftTerms.getDocCount() != -1 && rightTerms.getDocCount() != -1) {
    assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount());
  }
  if (leftTerms.getSumDocFreq() != -1 && rightTerms.getSumDocFreq() != -1) {
    assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq());
  }
  if (leftTerms.getSumTotalTermFreq() != -1 && rightTerms.getSumTotalTermFreq() != -1) {
    assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq());
  }
  if (leftTerms.size() != -1 && rightTerms.size() != -1) {
    assertEquals(leftTerms.size(), rightTerms.size());
  }
}
From source file:org.apache.solr.uninverting.FieldCacheImpl.java
License:Apache License
public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
  // not a general purpose filtering mechanism...
  assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;

  SortedSetDocValues dv = reader.getSortedSetDocValues(field);
  if (dv != null) {
    return dv;
  }

  SortedDocValues sdv = reader.getSortedDocValues(field);
  if (sdv != null) {
    return DocValues.singleton(sdv);
  }

  final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
  if (info == null) {
    return DocValues.emptySortedSet();
  } else if (info.getDocValuesType() != DocValuesType.NONE) {
    throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
  } else if (info.getIndexOptions() == IndexOptions.NONE) {
    return DocValues.emptySortedSet();
  }

  // ok we need to uninvert. check if we can optimize a bit.
  Terms terms = reader.terms(field);
  if (terms == null) {
    return DocValues.emptySortedSet();
  } else {
    // if #postings = #docswithfield we know that the field is "single valued enough".
    // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
    // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
    long numPostings = terms.getSumDocFreq();
    if (numPostings != -1 && numPostings == terms.getDocCount()) {
      return DocValues.singleton(getTermsIndex(reader, field));
    }
  }

  DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
  return dto.iterator(reader);
}
From source file:org.codelibs.elasticsearch.index.mapper.MappedFieldType.java
License:Apache License
/**
 * @return a {@link FieldStats} instance that maps to the type of this
 * field, or {@code null} if the provided index has no stats about the
 * current field
 */
public FieldStats stats(IndexReader reader) throws IOException {
  int maxDoc = reader.maxDoc();
  FieldInfo fi = MultiFields.getMergedFieldInfos(reader).fieldInfo(name());
  if (fi == null) {
    return null;
  }
  Terms terms = MultiFields.getTerms(reader, name());
  if (terms == null) {
    return new FieldStats.Text(maxDoc, 0, -1, -1, isSearchable(), isAggregatable());
  }
  FieldStats stats = new FieldStats.Text(maxDoc, terms.getDocCount(), terms.getSumDocFreq(),
      terms.getSumTotalTermFreq(), isSearchable(), isAggregatable(), terms.getMin(), terms.getMax());
  return stats;
}
From source file:org.elasticsearch.action.termvector.GetTermVectorCheckDocFreqTests.java
License:Apache License
private void checkWithoutFieldStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
    int[][] startOffset, int[][] endOffset, int i) throws IOException {
  TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
      .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(true)
      .setFieldStatistics(false).setSelectedFields();
  TermVectorResponse response = resp.execute().actionGet();
  assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
  Fields fields = response.getFields();
  assertThat(fields.size(), equalTo(1));
  Terms terms = fields.terms("field");
  assertThat(terms.size(), equalTo(8L));
  assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) -1));
  assertThat(terms.getDocCount(), Matchers.equalTo(-1));
  assertThat(terms.getSumDocFreq(), equalTo((long) -1));
  TermsEnum iterator = terms.iterator(null);
  for (int j = 0; j < values.length; j++) {
    String string = values[j];
    BytesRef next = iterator.next();
    assertThat(next, Matchers.notNullValue());
    assertThat("expected " + string, string, equalTo(next.utf8ToString()));
    assertThat(next, Matchers.notNullValue());
    if (string.equals("the")) {
      assertThat("expected ttf of " + string, numDocs * 2, equalTo((int) iterator.totalTermFreq()));
    } else {
      assertThat("expected ttf of " + string, numDocs, equalTo((int) iterator.totalTermFreq()));
    }
    DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
    assertThat(docsAndPositions.nextDoc(), equalTo(0));
    assertThat(freq[j], equalTo(docsAndPositions.freq()));
    assertThat(iterator.docFreq(), equalTo(numDocs));
    int[] termPos = pos[j];
    int[] termStartOffset = startOffset[j];
    int[] termEndOffset = endOffset[j];
    assertThat(termPos.length, equalTo(freq[j]));
    assertThat(termStartOffset.length, equalTo(freq[j]));
    assertThat(termEndOffset.length, equalTo(freq[j]));
    for (int k = 0; k < freq[j]; k++) {
      int nextPosition = docsAndPositions.nextPosition();
      assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
      assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
      assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
      assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
    }
  }
  assertThat(iterator.next(), Matchers.nullValue());
  XContentBuilder xBuilder = XContentFactory.jsonBuilder();
  response.toXContent(xBuilder, null);
  BytesStream bytesStream = xBuilder.bytesStream();
  String utf8 = bytesStream.bytes().toUtf8();
  String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
      + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"terms\":{\"brown\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"doc_freq\":15,\"ttf\":15,\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"doc_freq\":15,\"ttf\":30,\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
  assertThat(utf8, equalTo(expectedString));
}
From source file:org.elasticsearch.action.termvector.GetTermVectorCheckDocFreqTests.java
License:Apache License
private void checkWithoutTermStatistics(int numDocs, String[] values, int[] freq, int[][] pos,
    int[][] startOffset, int[][] endOffset, int i) throws IOException {
  TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
      .setPayloads(true).setOffsets(true).setPositions(true).setTermStatistics(false)
      .setFieldStatistics(true).setSelectedFields();
  assertThat(resp.request().termStatistics(), equalTo(false));
  TermVectorResponse response = resp.execute().actionGet();
  assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
  Fields fields = response.getFields();
  assertThat(fields.size(), equalTo(1));
  Terms terms = fields.terms("field");
  assertThat(terms.size(), equalTo(8L));
  assertThat(terms.getSumTotalTermFreq(), Matchers.equalTo((long) (9 * numDocs)));
  assertThat(terms.getDocCount(), Matchers.equalTo(numDocs));
  assertThat(terms.getSumDocFreq(), equalTo((long) numDocs * values.length));
  TermsEnum iterator = terms.iterator(null);
  for (int j = 0; j < values.length; j++) {
    String string = values[j];
    BytesRef next = iterator.next();
    assertThat(next, Matchers.notNullValue());
    assertThat("expected " + string, string, equalTo(next.utf8ToString()));
    assertThat(next, Matchers.notNullValue());
    assertThat("expected ttf of " + string, -1, equalTo((int) iterator.totalTermFreq()));
    DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
    assertThat(docsAndPositions.nextDoc(), equalTo(0));
    assertThat(freq[j], equalTo(docsAndPositions.freq()));
    assertThat(iterator.docFreq(), equalTo(-1));
    int[] termPos = pos[j];
    int[] termStartOffset = startOffset[j];
    int[] termEndOffset = endOffset[j];
    assertThat(termPos.length, equalTo(freq[j]));
    assertThat(termStartOffset.length, equalTo(freq[j]));
    assertThat(termEndOffset.length, equalTo(freq[j]));
    for (int k = 0; k < freq[j]; k++) {
      int nextPosition = docsAndPositions.nextPosition();
      assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
      assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
      assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
      assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
    }
  }
  assertThat(iterator.next(), Matchers.nullValue());
  XContentBuilder xBuilder = XContentFactory.jsonBuilder();
  response.toXContent(xBuilder, null);
  BytesStream bytesStream = xBuilder.bytesStream();
  String utf8 = bytesStream.bytes().toUtf8();
  String expectedString = "{\"_index\":\"test\",\"_type\":\"type1\",\"_id\":\"" + i
      + "\",\"_version\":1,\"found\":true,\"term_vectors\":{\"field\":{\"field_statistics\":{\"sum_doc_freq\":120,\"doc_count\":15,\"sum_ttf\":135},\"terms\":{\"brown\":{\"term_freq\":1,\"tokens\":[{\"position\":2,\"start_offset\":10,\"end_offset\":15,\"payload\":\"d29yZA==\"}]},\"dog\":{\"term_freq\":1,\"tokens\":[{\"position\":8,\"start_offset\":40,\"end_offset\":43,\"payload\":\"d29yZA==\"}]},\"fox\":{\"term_freq\":1,\"tokens\":[{\"position\":3,\"start_offset\":16,\"end_offset\":19,\"payload\":\"d29yZA==\"}]},\"jumps\":{\"term_freq\":1,\"tokens\":[{\"position\":4,\"start_offset\":20,\"end_offset\":25,\"payload\":\"d29yZA==\"}]},\"lazy\":{\"term_freq\":1,\"tokens\":[{\"position\":7,\"start_offset\":35,\"end_offset\":39,\"payload\":\"d29yZA==\"}]},\"over\":{\"term_freq\":1,\"tokens\":[{\"position\":5,\"start_offset\":26,\"end_offset\":30,\"payload\":\"d29yZA==\"}]},\"quick\":{\"term_freq\":1,\"tokens\":[{\"position\":1,\"start_offset\":4,\"end_offset\":9,\"payload\":\"d29yZA==\"}]},\"the\":{\"term_freq\":2,\"tokens\":[{\"position\":0,\"start_offset\":0,\"end_offset\":3,\"payload\":\"d29yZA==\"},{\"position\":6,\"start_offset\":31,\"end_offset\":34,\"payload\":\"d29yZA==\"}]}}}}}";
  assertThat(utf8, equalTo(expectedString));
}