Example usage for org.apache.lucene.util BytesRef utf8ToString

List of usage examples for org.apache.lucene.util BytesRef utf8ToString

Introduction

On this page you can find example usage for org.apache.lucene.util BytesRef utf8ToString.

Prototype

public String utf8ToString() 

Document

Interprets the stored bytes as UTF-8, returning the resulting string.
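
A minimal round-trip sketch (assuming only Lucene core on the classpath; the class name is illustrative): the BytesRef(CharSequence) constructor stores the UTF-8 encoding of a string, and utf8ToString() decodes it back.

import org.apache.lucene.util.BytesRef;

public class Utf8RoundTrip {
    public static void main(String[] args) {
        // BytesRef(CharSequence) stores the UTF-8 encoding of the string
        BytesRef bytes = new BytesRef("hello wörld");
        // utf8ToString() decodes those bytes back to a java.lang.String
        String decoded = bytes.utf8ToString();
        System.out.println(decoded); // prints: hello wörld
    }
}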

Usage

From source file:com.github.flaxsearch.api.PositionData.java

License:Apache License

static String payloadToString(BytesRef payload) {
    if (payload == null)
        return null;
    return payload.utf8ToString();
}

From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java

License:Open Source License

public Map<Integer, Set<HighlightTerm>> highlight(Set<Integer> docIds, Set<String> fields) {
    try {
        Map<Integer, Set<HighlightTerm>> termMap = new HashMap<Integer, Set<HighlightTerm>>();
        Map<Integer, Document> idToDocMap = new HashMap<Integer, Document>();
        Map<Integer, CoordinateStorage> idToCoordinateStorageMap = new HashMap<Integer, CoordinateStorage>();

        Map<Integer, Set<Integer>> myLeaves = new HashMap<Integer, Set<Integer>>();
        for (int docId : docIds) {
            Document luceneDoc = indexSearcher.doc(docId);
            idToDocMap.put(docId, luceneDoc);
            JochreIndexDocument jochreDoc = searchService.getJochreIndexDocument(indexSearcher, docId);
            idToCoordinateStorageMap.put(docId, jochreDoc.getCoordinateStorage());
            termMap.put(docId, new TreeSet<HighlightTerm>());
            int leaf = ReaderUtil.subIndex(docId, leaves);
            Set<Integer> docsPerLeaf = myLeaves.get(leaf);
            if (docsPerLeaf == null) {
                docsPerLeaf = new HashSet<Integer>();
                myLeaves.put(leaf, docsPerLeaf);
            }
            docsPerLeaf.add(docId);
        }

        for (int leaf : myLeaves.keySet()) {
            if (LOG.isTraceEnabled())
                LOG.trace("Searching leaf " + leaf);
            Set<Integer> docsPerLeaf = myLeaves.get(leaf);
            AtomicReaderContext subContext = leaves.get(leaf);
            AtomicReader atomicReader = subContext.reader();

            int fieldCounter = 0;
            for (String field : fields) {
                fieldCounter++;
                if (LOG.isTraceEnabled())
                    LOG.trace("Field " + fieldCounter + ": " + field);

                Terms atomicReaderTerms = atomicReader.terms(field);
                if (atomicReaderTerms == null) {
                    continue; // nothing to do
                }
                TermsEnum termsEnum = atomicReaderTerms.iterator(TermsEnum.EMPTY);

                int termCounter = 0;
                for (BytesRef term : terms) {
                    termCounter++;
                    if (LOG.isTraceEnabled())
                        LOG.trace("Searching for term " + termCounter + ": " + term.utf8ToString()
                                + " in field " + field);

                    if (!termsEnum.seekExact(term)) {
                        continue; // term not found
                    }

                    DocsAndPositionsEnum docPosEnum = termsEnum.docsAndPositions(null, null,
                            DocsAndPositionsEnum.FLAG_OFFSETS);
                    int relativeDocId = docPosEnum.nextDoc();
                    while (relativeDocId != DocsAndPositionsEnum.NO_MORE_DOCS) {
                        int docId = subContext.docBase + relativeDocId;
                        if (docsPerLeaf.contains(docId)) {
                            Document doc = idToDocMap.get(docId);
                            Set<HighlightTerm> highlightTerms = termMap.get(docId);
                            //Retrieve the term frequency in the current document
                            int freq = docPosEnum.freq();
                            if (LOG.isTraceEnabled()) {
                                String extId = doc.get("id");
                                String path = doc.get("path");
                                LOG.trace("Found " + freq + " matches for doc " + docId + ", extId: " + extId
                                        + ", path: " + path);
                            }

                            for (int i = 0; i < freq; i++) {
                                int position = docPosEnum.nextPosition();
                                int start = docPosEnum.startOffset();
                                int end = docPosEnum.endOffset();

                                if (LOG.isTraceEnabled())
                                    LOG.trace("Found match " + position + " at docId " + docId + ", field "
                                            + field + " start=" + start + ", end=" + end);

                                CoordinateStorage coordinateStorage = idToCoordinateStorageMap.get(docId);
                                int imageIndex = coordinateStorage.getImageIndex(start);
                                int pageIndex = coordinateStorage.getPageIndex(start);

                                HighlightTerm highlightTerm = new HighlightTerm(docId, field, start, end,
                                        imageIndex, pageIndex);
                                highlightTerm.setWeight(this.weigh(term));
                                if (highlightTerm.getWeight() > 0)
                                    highlightTerms.add(highlightTerm);
                            }
                        }
                        relativeDocId = docPosEnum.nextDoc();
                    }
                } // next term
            } // next field
        } // next index leaf to search

        return termMap;
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}

From source file:com.mathworks.xzheng.advsearching.BooksLikeThis.java

License:Apache License

public Document[] docsLike(int id, int max) throws IOException {
    Document doc = reader.document(id);

    String[] authors = doc.getValues("author");
    BooleanQuery authorQuery = new BooleanQuery(); // #3
    for (String author : authors) { // #3
        authorQuery.add(new TermQuery(new Term("author", author)), // #3
                BooleanClause.Occur.SHOULD); // #3
    }
    authorQuery.setBoost(2.0f);

    Terms terms = reader.getTermVector(id, "subject"); // #4

    BooleanQuery subjectQuery = new BooleanQuery(); // #4
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef text;
    while ((text = termsEnum.next()) != null) { // #4
        TermQuery tq = new TermQuery( // #4
                new Term("subject", text.utf8ToString())); // #4
        subjectQuery.add(tq, BooleanClause.Occur.SHOULD); // #4
    }

    BooleanQuery likeThisQuery = new BooleanQuery(); // #5
    likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD); // #5
    likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD); // #5

    likeThisQuery.add(new TermQuery( // #6
            new Term("isbn", doc.get("isbn"))), BooleanClause.Occur.MUST_NOT); // #6

    // System.out.println("  Query: " +
    //    likeThisQuery.toString("contents"));
    TopDocs hits = searcher.search(likeThisQuery, 10);
    int size = max;
    if (max > hits.scoreDocs.length)
        size = hits.scoreDocs.length;

    Document[] docs = new Document[size];
    for (int i = 0; i < size; i++) {
        docs[i] = reader.document(hits.scoreDocs[i].doc);
    }

    return docs;
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * {@inheritDoc}
 */
@Override
public void train(LeafReader leafReader, String textFieldName, String classFieldName, Analyzer analyzer,
        Query query) throws IOException {
    this.textTerms = MultiFields.getTerms(leafReader, textFieldName);

    if (textTerms == null) {
        throw new IOException("term vectors need to be available for field " + textFieldName);
    }

    this.analyzer = analyzer;
    this.textFieldName = textFieldName;

    if (threshold == null || threshold == 0d) {
        // automatic assign a threshold
        long sumDocFreq = leafReader.getSumDocFreq(textFieldName);
        if (sumDocFreq != -1) {
            this.threshold = (double) sumDocFreq / 2d;
        } else {
            throw new IOException("threshold cannot be assigned since term vectors for field " + textFieldName
                    + " do not exist");
        }
    }

    // TODO : remove this map as soon as we have a writable FST
    SortedMap<String, Double> weights = new TreeMap<>();

    TermsEnum termsEnum = textTerms.iterator();
    BytesRef textTerm;
    while ((textTerm = termsEnum.next()) != null) {
        weights.put(textTerm.utf8ToString(), (double) termsEnum.totalTermFreq());
    }
    updateFST(weights);

    IndexSearcher indexSearcher = new IndexSearcher(leafReader);

    int batchCount = 0;

    BooleanQuery q = new BooleanQuery();
    q.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, "*")), BooleanClause.Occur.MUST));
    if (query != null) {
        q.add(new BooleanClause(query, BooleanClause.Occur.MUST));
    }
    // run the search and use stored field values
    for (ScoreDoc scoreDoc : indexSearcher.search(q, Integer.MAX_VALUE).scoreDocs) {
        Document doc = indexSearcher.doc(scoreDoc.doc);

        IndexableField textField = doc.getField(textFieldName);

        // get the expected result
        IndexableField classField = doc.getField(classFieldName);

        if (textField != null && classField != null) {
            // assign class to the doc
            ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue());
            Boolean assignedClass = classificationResult.getAssignedClass();

            Boolean correctClass = Boolean.valueOf(classField.stringValue());
            long modifier = correctClass.compareTo(assignedClass);
            if (modifier != 0) {
                updateWeights(leafReader, scoreDoc.doc, assignedClass, weights, modifier,
                        batchCount % batchSize == 0);
            }
            batchCount++;
        }
    }
    weights.clear(); // free memory while waiting for GC
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

private void updateWeights(LeafReader leafReader, int docId, Boolean assignedClass,
        SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
    TermsEnum cte = textTerms.iterator();

    // get the doc term vectors
    Terms terms = leafReader.getTermVector(docId, textFieldName);

    if (terms == null) {
        throw new IOException("term vectors must be stored for field " + textFieldName);
    }

    TermsEnum termsEnum = terms.iterator();

    BytesRef term;

    while ((term = termsEnum.next()) != null) {
        cte.seekExact(term);
        if (assignedClass != null) {
            long termFreqLocal = termsEnum.totalTermFreq();
            // update weights
            Long previousValue = Util.get(fst, term);
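            // previousValue is expected to be non-null here: train() seeded
            // every index term into the FST before updateWeights() runs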
            String termString = term.utf8ToString();
            weights.put(termString, previousValue + modifier * termFreqLocal);
        }
    }
    if (updateFST) {
        updateFST(weights);
    }
}

From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java

License:Apache License

/**
 * Builds the skeleton of the cache. The cache keeps word occurrences in
 * memory once they have been searched, which in proper use can yield a
 * 2-100x speedup, but can also consume a lot of memory. To lower the memory
 * consumption, words with a very low occurrence count in the index can be
 * filtered out. The other parameter switches the term searching: if true,
 * only the terms in the skeleton are searched; if false, terms not in the
 * cache are searched as well (but not cached).
 *
 * @param minTermOccurrenceInCache Higher values lower the cache size.
 * @param justCachedTerms          If true, fully exclude low-occurrence terms.
 * @throws IOException If there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
    this.justCachedTerms = justCachedTerms;

    this.docsWithClassSize = countDocsWithClass();
    termCClassHitCache.clear();
    cclasses.clear();
    classTermFreq.clear();

    // build the cache for the word
    Map<String, Long> frequencyMap = new HashMap<>();
    for (String textFieldName : textFieldNames) {
        TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termText = term.utf8ToString();
            long frequency = termsEnum.docFreq();
            Long lastfreq = frequencyMap.get(termText);
            if (lastfreq != null)
                frequency += lastfreq;
            frequencyMap.put(termText, frequency);
        }
    }
    for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
        if (entry.getValue() > minTermOccurrenceInCache) {
            termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
        }
    }

    // fill the class list
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    while ((termsEnum.next()) != null) {
        cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
    }
    // fill the classTermFreq map
    for (BytesRef cclass : cclasses) {
        double avgNumberOfUniqueTerms = 0;
        for (String textFieldName : textFieldNames) {
            terms = MultiFields.getTerms(leafReader, textFieldName);
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
        }
        int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
        classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
    }
}
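
A usage sketch under stated assumptions (the no-arg constructor and the train(...) signature are illustrative, mirroring the BooleanPerceptronClassifier example above):

// Illustrative only: constructor and train(...) signature are assumed
// to match this fork's classifier API.
CachingNaiveBayesClassifier classifier = new CachingNaiveBayesClassifier();
classifier.train(leafReader, "text", "category", analyzer, null);
// Keep only terms occurring in more than 5 documents in the cache;
// with justCachedTerms = true, terms outside the cache are skipped.
classifier.reInitCache(5, true);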

From source file:com.meltwater.elasticsearch.shard.BatchQueriesLoaderCollector.java

License:Apache License

@Override
public void collect(int doc) throws IOException {
    idValues.setDocument(doc);
    if (idValues.count() > 0) {
        assert idValues.count() == 1;
        BytesRef id = idValues.valueAt(0);
        fieldsVisitor.reset();
        reader.document(doc, fieldsVisitor);
        try {
            // id is only used for logging, if we fail we log the id in the catch statement
            final QueryAndSource queryAndSource = percolator.parsePercolatorDocument(null,
                    fieldsVisitor.source());
            queries.put(id.utf8ToString(), queryAndSource);
        } catch (Exception e) {
            logger.warn("failed to add query [{}]", e, id.utf8ToString());
        }

    } else {
        logger.error("failed to load query since field [{}] not present", ID_FIELD);
    }
}

From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java

License:Apache License

@Override
public void testInvertedWrite() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

    // Must be concurrent because thread(s) can be merging
    // while up to one thread flushes, and each of those
    // threads iterates over the map while the flushing
    // thread might be adding to it:
    final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

    final AtomicLong sumDocFreq = new AtomicLong();
    final AtomicLong sumTotalTermFreq = new AtomicLong();

    // TODO: would be better to use / delegate to the current
    // Codec returned by getCodec()

    iwc.setCodec(new AssertingCodec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {

            PostingsFormat p = getCodec().postingsFormat();
            if (p instanceof PerFieldPostingsFormat) {
                p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            if (p instanceof RocanaPerFieldPostingsFormat) {
                p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            final PostingsFormat defaultPostingsFormat = p;

            final Thread mainThread = Thread.currentThread();

            if (field.equals("body")) {

                // A PF that counts up some stats and then in
                // the end we verify the stats match what the
                // final IndexReader says, just to exercise the
                // new freedom of iterating the postings more
                // than once at flush/merge:

                return new PostingsFormat(defaultPostingsFormat.getName()) {

                    @Override
                    public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

                        final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

                        return new FieldsConsumer() {
                            @Override
                            public void write(Fields fields) throws IOException {
                                fieldsConsumer.write(fields);

                                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                                // We only use one thread for flushing
                                // in this test:
                                assert isMerge || Thread.currentThread() == mainThread;

                                // We iterate the provided TermsEnum
                                // twice, so we exercise this new freedom
                                // with the inverted API; if
                                // addOnSecondPass is true, we add up
                                // term stats on the 2nd iteration:
                                boolean addOnSecondPass = random().nextBoolean();

                                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                                // Gather our own stats:
                                Terms terms = fields.terms("body");
                                assert terms != null;

                                TermsEnum termsEnum = terms.iterator();
                                PostingsEnum docs = null;
                                while (termsEnum.next() != null) {
                                    BytesRef term = termsEnum.term();
                                    // TODO: also sometimes ask for payloads/offsets?
                                    boolean noPositions = random().nextBoolean();
                                    if (noPositions) {
                                        docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                    } else {
                                        docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                    }
                                    int docFreq = 0;
                                    long totalTermFreq = 0;
                                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                        docFreq++;
                                        totalTermFreq += docs.freq();
                                        int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                        if (!noPositions) {
                                            for (int i = 0; i < limit; i++) {
                                                docs.nextPosition();
                                            }
                                        }
                                    }

                                    String termString = term.utf8ToString();

                                    // During merge we should only see terms
                                    // we had already seen during a
                                    // previous flush:
                                    assertTrue(isMerge == false || termFreqs.containsKey(termString));

                                    if (isMerge == false) {
                                        if (addOnSecondPass == false) {
                                            TermFreqs tf = termFreqs.get(termString);
                                            if (tf == null) {
                                                tf = new TermFreqs();
                                                termFreqs.put(termString, tf);
                                            }
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        } else if (termFreqs.containsKey(termString) == false) {
                                            // Add placeholder (2nd pass will
                                            // set its counts):
                                            termFreqs.put(termString, new TermFreqs());
                                        }
                                    }
                                }

                                // Also test seeking the TermsEnum:
                                for (String term : termFreqs.keySet()) {
                                    if (termsEnum.seekExact(new BytesRef(term))) {
                                        // TODO: also sometimes ask for payloads/offsets?
                                        boolean noPositions = random().nextBoolean();
                                        if (noPositions) {
                                            docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                        } else {
                                            docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                        }

                                        int docFreq = 0;
                                        long totalTermFreq = 0;
                                        while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                            docFreq++;
                                            totalTermFreq += docs.freq();
                                            int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                            if (!noPositions) {
                                                for (int i = 0; i < limit; i++) {
                                                    docs.nextPosition();
                                                }
                                            }
                                        }

                                        if (isMerge == false && addOnSecondPass) {
                                            TermFreqs tf = termFreqs.get(term);
                                            assert tf != null;
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        }

                                        //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                                        assertTrue(docFreq <= termFreqs.get(term).docFreq);
                                        assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                                    }
                                }

                                // Also test seekCeil
                                for (int iter = 0; iter < 10; iter++) {
                                    BytesRef term = new BytesRef(
                                            TestUtil.randomRealisticUnicodeString(random()));
                                    SeekStatus status = termsEnum.seekCeil(term);
                                    if (status == SeekStatus.NOT_FOUND) {
                                        assertTrue(term.compareTo(termsEnum.term()) < 0);
                                    }
                                }
                            }

                            @Override
                            public void close() throws IOException {
                                fieldsConsumer.close();
                            }
                        };
                    }

                    @Override
                    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
                        return defaultPostingsFormat.fieldsProducer(state);
                    }
                };
            } else {
                return defaultPostingsFormat;
            }
        }
    });

    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

    LineFileDocs docs = new LineFileDocs(random());
    int bytesToIndex = atLeast(100) * 1024;
    int bytesIndexed = 0;
    while (bytesIndexed < bytesToIndex) {
        Document doc = docs.nextDoc();
        w.addDocument(doc);
        bytesIndexed += RamUsageTester.sizeOf(doc);
    }

    IndexReader r = w.getReader();
    w.close();

    Terms terms = MultiFields.getTerms(r, "body");
    assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
    assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

    TermsEnum termsEnum = terms.iterator();
    long termCount = 0;
    boolean supportsOrds = true;
    while (termsEnum.next() != null) {
        BytesRef term = termsEnum.term();
        assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
        assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
        if (supportsOrds) {
            long ord;
            try {
                ord = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                ord = -1;
            }
            if (ord != -1) {
                assertEquals(termCount, ord);
            }
        }
        termCount++;
    }
    assertEquals(termFreqs.size(), termCount);

    r.close();
    dir.close();
}

From source file:com.rocana.lucene.codec.v1.RocanaBlockTreeTermsReader.java

License:Apache License

String brToString(BytesRef b) {
    if (b == null) {
        return "null";
    } else {
        try {
            return b.utf8ToString() + " " + b;
        } catch (Throwable t) {
            // If BytesRef isn't actually UTF8, or it's eg a
            // prefix of UTF8 that ends mid-unicode-char, we
            // fallback to hex:
            return b.toString();
        }
    }
}

From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java

License:Apache License

@SuppressWarnings("unused")
static String brToString(BytesRef b) {
    try {
        return b.utf8ToString() + " " + b;
    } catch (Throwable t) {
        // If BytesRef isn't actually UTF8, or it's eg a
        // prefix of UTF8 that ends mid-unicode-char, we
        // fallback to hex:
        return b.toString();
    }
}
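
As the two brToString helpers above note, a BytesRef need not contain valid UTF-8. A minimal sketch of the same fallback pattern on arbitrary bytes (the class name is illustrative):

import org.apache.lucene.util.BytesRef;

public class NonUtf8Fallback {
    public static void main(String[] args) {
        // 0xFF never occurs in valid UTF-8, so decoding may fail
        BytesRef raw = new BytesRef(new byte[] { (byte) 0xff, (byte) 0xfe });
        String printable;
        try {
            printable = raw.utf8ToString();
        } catch (Throwable t) {
            // fall back to the hex form, as the helpers above do
            printable = raw.toString();
        }
        System.out.println(printable);
    }
}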