Example usage for org.apache.lucene.util BytesRef utf8ToString

List of usage examples for org.apache.lucene.util BytesRef utf8ToString

Introduction

On this page you can find example usage of org.apache.lucene.util BytesRef utf8ToString.

Prototype

public String utf8ToString() 

Source Link

Document

Interprets the stored bytes as UTF-8 bytes, returning the resulting String.
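
A minimal sketch of the round trip between a Java String and a BytesRef; the class and variable names below are purely illustrative:

import org.apache.lucene.util.BytesRef;

public class BytesRefUtf8Example {
    public static void main(String[] args) {
        // BytesRef(CharSequence) stores the given text as UTF-8 bytes.
        BytesRef bytes = new BytesRef("caf\u00e9");

        // utf8ToString() decodes those bytes back into a java.lang.String.
        String text = bytes.utf8ToString();

        System.out.println(text);         // prints: café
        System.out.println(bytes.length); // 5 UTF-8 bytes ('c', 'a', 'f' plus two bytes for 'é')
    }
}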

Usage

From source file:org.opengrok.suggest.SuggesterSearcher.java

License:Open Source License

private List<LookupResultItem> suggest(final Query query, final LeafReaderContext leafReaderContext,
        final String project, final SuggesterQuery suggesterQuery, final PopularityCounter searchCounts)
        throws IOException {
    if (Thread.currentThread().isInterrupted()) {
        interrupted = true;
        return Collections.emptyList();
    }

    boolean shouldLeaveOutSameTerms = shouldLeaveOutSameTerms(query, suggesterQuery);
    Set<BytesRef> tokensAlreadyIncluded = null;
    if (shouldLeaveOutSameTerms) {
        tokensAlreadyIncluded = SuggesterUtils.intoTermsExceptPhraseQuery(query).stream()
                .filter(t -> t.field().equals(suggesterQuery.getField())).map(Term::bytes)
                .collect(Collectors.toSet());
    }

    boolean needsDocumentIds = query != null && !(query instanceof MatchAllDocsQuery);

    ComplexQueryData complexQueryData = null;
    if (needsDocumentIds) {
        complexQueryData = getComplexQueryData(query, leafReaderContext);
        if (interrupted) {
            return Collections.emptyList();
        }
    }

    Terms terms = leafReaderContext.reader().terms(suggesterQuery.getField());

    TermsEnum termsEnum = suggesterQuery.getTermsEnumForSuggestions(terms);

    LookupPriorityQueue queue = new LookupPriorityQueue(resultSize);

    boolean needPositionsAndFrequencies = needPositionsAndFrequencies(query);

    PostingsEnum postingsEnum = null;

    BytesRef term = termsEnum.next();
    while (term != null) {
        if (Thread.currentThread().isInterrupted()) {
            interrupted = true;
            break;
        }

        if (needPositionsAndFrequencies) {
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.POSITIONS | PostingsEnum.FREQS);
        } else {
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        }

        int score;
        if (!needsDocumentIds) {
            score = normalizeDocumentFrequency(termsEnum.docFreq(), numDocs);
        } else if (needPositionsAndFrequencies) {
            score = getPhraseScore(complexQueryData, leafReaderContext.docBase, postingsEnum);
        } else {
            score = getDocumentFrequency(complexQueryData.documentIds, leafReaderContext.docBase, postingsEnum);
        }

        if (score > 0) {
            if (!shouldLeaveOutSameTerms || !tokensAlreadyIncluded.contains(term)) {
                score += searchCounts.get(term) * TERM_ALREADY_SEARCHED_MULTIPLIER;

                if (queue.canInsert(score)) {
                    queue.insertWithOverflow(new LookupResultItem(term.utf8ToString(), project, score));
                }
            }
        }

        term = termsEnum.next();
    }

    return queue.getResult();
}

From source file:org.opensextant.solrtexttagger.Tagger.java

License:Open Source License

public void process() throws IOException {
    if (terms == null)
        return;

    //a shared pointer to the head used by this method and each Tag instance.
    final TagLL[] head = new TagLL[1];

    TermPrefixCursor cursor = null;//re-used
    TermsEnum termsEnum = null;//re-used

    //boolean switch used to log warnings in case tokens were skipped during
    //tagging.
    boolean skippedTokens = false;
    while (tokenStream.incrementToken()) {
        if (log.isTraceEnabled()) {
            log.trace("Token: {}, posInc: {},  offset: [{},{}]", byteRefAtt, posIncAtt.getPositionIncrement(),
                    offsetAtt.startOffset(), offsetAtt.endOffset());
        }
        //check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
        if (posIncAtt.getPositionIncrement() < 1) {
            //(a) Deal with this as a configuration issue and throw an exception
            if (!skipAltTokens) {
                //TODO throw UnsupportedTokenException when PhraseBuilder is ported
                throw new IllegalStateException("Query Analyzer generates alternate "
                        + "Tokens (posInc == 0). Please adapt your Analyzer configuration or " + "enable '"
                        + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such " + "tokens. NOTE: enabling '"
                        + TaggerRequestHandler.SKIP_ALT_TOKENS
                        + "' might result in wrong tagging results if the index time analyzer "
                        + "is not configured accordingly. For detailed information see "
                        + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
            } else {
                //(b) If the index-time analyzer has indexed all variants (users
                //    need to ensure that), processing of alternate tokens can be skipped,
                //    as all alternatives will already be contained in the FST.
                skippedTokens = true;
                log.trace("  ... ignored token");
                continue;
            }
        }
        //-- If PositionIncrement > 1 (stopwords)
        if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
            log.trace("   - posInc > 1 ... mark cluster as done");
            advanceTagsAndProcessClusterIfDone(head, null);
        }

        final BytesRef term;
        //NOTE: we need to lookup tokens if
        // * the LookupAtt is true OR
        // * there are still advancing tags (to find the longest possible match)
        if (lookupAtt.isTaggable() || head[0] != null) {
            //-- Lookup the term id from the next token
            term = byteRefAtt.getBytesRef();
            if (term.length == 0) {
                throw new IllegalArgumentException(
                        "term: " + term.utf8ToString() + " analyzed to a zero-length token");
            }
        } else { //no current cluster AND lookup == false ...
            term = null; //skip this token
        }

        //-- Process tag
        advanceTagsAndProcessClusterIfDone(head, term);

        //-- only create new Tags for Tokens we need to lookup
        if (lookupAtt.isTaggable() && term != null) {

            //determine if the terms index has a term starting with the provided term
            // TODO cache hashcodes of valid first terms (directly from char[]?) to skip lookups?
            termsEnum = terms.iterator();
            if (cursor == null)//re-usable
                cursor = new TermPrefixCursor(termsEnum, liveDocs, docIdsCache);
            if (cursor.advance(term)) {
                TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
                termsEnum = null;//because the cursor now "owns" this instance
                cursor = null;//because the new tag now "owns" this instance
                //and add it to the end
                if (head[0] == null) {
                    head[0] = newTail;
                } else {
                    for (TagLL t = head[0]; true; t = t.nextTag) {
                        if (t.nextTag == null) {
                            t.addAfterLL(newTail);
                            break;
                        }
                    }
                }
            }
        } //if termId >= 0
    } //end while(incrementToken())

    //-- Finish all tags
    advanceTagsAndProcessClusterIfDone(head, null);
    assert head[0] == null;

    if (!loggedSkippedAltTokenWarning && skippedTokens) {
        loggedSkippedAltTokenWarning = true; //only log once
        log.warn("The Tagger skiped some alternate tokens (tokens with posInc == 0) "
                + "while processing text. This may cause problems with some Analyer "
                + "configurations (e.g. query time synonym expansion). For details see "
                + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
    }

    tokenStream.end();
    //tokenStream.close(); caller closes because caller acquired it
}

From source file:org.pageseeder.flint.lucene.search.Terms.java

License:Apache License

/**
 * Loads all the fuzzy terms in the list of terms given the reader.
 *
 * @param reader Index reader to use.
 * @param values The list of terms to load.
 * @param term   The term to use.
 *
 * @throws IOException If an error is thrown by the fuzzy term enumeration.
 */
public static void fuzzy(IndexReader reader, List<String> values, Term term, int minSimilarity)
        throws IOException {
    AttributeSource atts = new AttributeSource();
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, atts, term, minSimilarity, 0, false);
    BytesRef val;
    BytesRef searched = term.bytes();
    while ((val = fuzzy.next()) != null) {
        if (!searched.bytesEquals(val))
            values.add(val.utf8ToString());
    }
}

From source file:org.pageseeder.flint.lucene.search.Terms.java

License:Apache License

/**
 * Loads all the prefix terms in the list of terms given the reader.
 *
 * @param reader  Index reader to use.
 * @param values  The list of values to load.
 * @param term    The term to use.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, List<String> values, Term term) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), null);
    BytesRef val;
    while ((val = prefixes.next()) != null) {
        values.add(val.utf8ToString());
    }
}

From source file:org.pageseeder.flint.lucene.search.Terms.java

License:Apache License

/**
 * Returns the list of term values for the specified field.
 *
 * @param reader The index reader to use
 * @param field  The field
 *
 * @return the list of terms for this field
 *
 * @throws IOException should any IO error be reported.
 */
@Beta
public static List<String> values(IndexReader reader, String field) throws IOException {
    LOGGER.debug("Loading term values for field {}", field);
    List<String> values = new ArrayList<String>();
    org.apache.lucene.index.Terms terms = MultiFields.getTerms(reader, field);
    if (terms == null)
        return values;
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum == TermsEnum.EMPTY)
        return values;
    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t == null)
            break;
        values.add(t.utf8ToString());
    }
    return values;
}
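
The three static helpers above (fuzzy, prefix and values) all collect term text via utf8ToString(). Below is a minimal sketch of how they might be called, assuming the pageseeder Terms class shown above is on the classpath; the index path, field name and query terms are placeholders:

import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.pageseeder.flint.lucene.search.Terms;

public class TermsHelperExample {
    public static void main(String[] args) throws Exception {
        // "index" and "content" are placeholder values for this sketch.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("index")))) {

            // All distinct term values for the field.
            List<String> all = Terms.values(reader, "content");

            // Terms starting with the given prefix.
            List<String> prefixed = new ArrayList<>();
            Terms.prefix(reader, prefixed, new Term("content", "luc"));

            // Terms similar to the given term (the searched term itself is excluded).
            List<String> similar = new ArrayList<>();
            Terms.fuzzy(reader, similar, new Term("content", "lucene"), 2);

            System.out.println(all.size() + " values, " + prefixed.size() + " prefixed, " + similar.size() + " fuzzy");
        }
    }
}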

From source file:org.solr.classtify.SimpleNaiveBayesClassifierTest.java

License:Apache License

@Test
public void classtify() throws IOException {
    SimpleNaiveBayesClassifier classifier = new SimpleNaiveBayesClassifier();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(dir)));

    AtomicReader wrap = SlowCompositeReaderWrapper.wrap(reader);
    classifier.train(wrap, textFieldName, categoryFieldName, analyzer);
    ClassificationResult<BytesRef> assignClass = classifier.assignClass(newText);
    BytesRef assignedClass = assignClass.getAssignedClass();

    double score = assignClass.getScore();
    System.out.println(assignedClass.utf8ToString() + "," + score);
}

From source file:org.splevo.vpm.analyzer.semantic.lucene.finder.SharedTermFinder.java

License:Open Source License

/**
 * Determine the terms shared by the related variation points by looking up all terms included
 * in the search query AND a found document.
 *
 * @param referenceDocTerms
 *            The terms of the reference doc, as used in the search query.
 * @param foundDoc
 *            A specific document found by the query.
 * @param foundDocId
 *            The id of the found document, used to get its index terms.
 * @param field
 *            The field to get the terms for.
 *
 * @return The {@link Set} of terms shared between the query and the document.
 * @throws IOException
 */
private Set<String> determineSharedTerms(Set<Term> referenceDocTerms, Document foundDoc, int foundDocId,
        String field) throws IOException {
    Set<String> sharedTerms = new TreeSet<String>();
    Terms termVector = reader.getTermVector(foundDocId, field);
    TermsEnum termsEnum = null;
    TermsEnum iterator = termVector.iterator(termsEnum);
    BytesRef br = null;
    while ((br = iterator.next()) != null) {
        String term = br.utf8ToString();
        for (Term t : referenceDocTerms) {
            if (t.text().equals(term)) {
                sharedTerms.add(term);
            }
        }
    }
    return sharedTerms;
}

From source file:org.splevo.vpm.analyzer.semantic.lucene.finder.SharedTermFinder.java

License:Open Source License

/**
 * Extracts the frequencies of all {@link Term}s in the specified {@link Document}. Uses the
 * member reader.
 *
 * @param docId
 *            The ID of the {@link Document} to extract the {@link Term}s from.
 * @param fieldName
 *            The name of the field to extract frequencies from.
 * @return A {@link Map} containing the terms as the key and the related frequencies as
 *         {@link Integer} value.
 */
private Map<String, Integer> getTermFrequencies(int docId, String fieldName) {
    Map<String, Integer> frequencies = new HashMap<String, Integer>();

    try {
        Terms vector = reader.getTermVector(docId, fieldName);
        if (vector == null) {
            return frequencies;
        }
        TermsEnum termsEnum = null;
        termsEnum = vector.iterator(termsEnum);
        BytesRef text = null;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            int freq = (int) termsEnum.totalTermFreq();
            frequencies.put(term, freq);
        }
    } catch (IOException e) {
        logger.error("Failure while extracting Term Frequencies.");
    }
    return frequencies;
}

From source file:org.splevo.vpm.analyzer.semantic.SemanticVPMAnalyzer.java

License:Open Source License

private Map<String, Integer> getTermsFromIndex() {
    Map<String, Integer> indexedTerms = Maps.newLinkedHashMap();
    try {
        DirectoryReader indexReader = indexer.getIndexReader();
        Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(Indexer.INDEX_CONTENT);
        if (terms == null) {
            return indexedTerms;
        }

        TermsEnum termEnum = terms.iterator(null);
        BytesRef byteRef = null;

        while ((byteRef = termEnum.next()) != null) {
            String term = byteRef.utf8ToString();
            int count = indexReader.docFreq(new Term(Indexer.INDEX_CONTENT, byteRef));
            indexedTerms.put(term, Integer.valueOf(count));
        }
        indexReader.close();
    } catch (Exception e) {
        logger.error("Failed to dump index", e);
    }
    return indexedTerms;
}

From source file:org.tallison.gramreaper.terms.DumpTerms.java

License:Apache License

private void dumpTopNField(LeafReader leafReader, String field) throws IOException {
    AbstractTokenTFDFPriorityQueue queue = config.sort.equals(DumpTermsConfig.SORT.DF)
            ? new TokenDFPriorityQueue(config.topN)
            : new TokenTFPriorityQueue(config.topN);
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        StringBuilder sb = new StringBuilder();
        int i = 0;
        for (FieldInfo fieldInfo : leafReader.getFieldInfos()) {
            if (i++ > 0) {
                sb.append("\n");
            }
            sb.append(fieldInfo.name);

        }
        throw new RuntimeException("I can't find field \"" + field + "\".\n" + "I only see:\n" + sb.toString());
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int docsWThisField = leafReader.getDocCount(field);
    while (bytesRef != null) {
        int df = termsEnum.docFreq();
        long tf = termsEnum.totalTermFreq();
        if (config.minDocFreq > -1 && df < config.minDocFreq) {
            bytesRef = termsEnum.next();
            continue;
        }
        if (config.minDocPercentage > -1.0d
                && (double) df / (double) docsWThisField < config.minDocPercentage) {
            bytesRef = termsEnum.next();
            continue;
        }

        if (queue.top() == null || queue.size() < config.topN
                || (config.sort.equals(DumpTermsConfig.SORT.DF) ? df >= queue.top().df : tf > queue.top().tf)) {
            String t = bytesRef.utf8ToString();
            if (!config.stopWords.contains(t) && !config.startWords.contains(t)) {

                queue.insertWithOverflow(new TokenDFTF(t, df, tf));
            }
        }
        bytesRef = termsEnum.next();
    }
    if (config.outputFile == null) {
        StringBuilder sb = new StringBuilder();
        for (TokenDFTF tp : queue.getArray()) {
            System.out.println(getRow(sb, tp));
        }
    } else if (Files.isDirectory(config.outputFile)) {
        writeTopN(config.outputFile.resolve(field), queue);
    } else {
        writeTopN(config.outputFile, queue);
    }
}