Example usage for org.apache.lucene.index Term bytes

List of usage examples for org.apache.lucene.index Term bytes

Introduction

In this page you can find the example usage for org.apache.lucene.index Term bytes.

Prototype

BytesRef bytes

The source code for org.apache.lucene.index Term bytes can be viewed via the Source Link below.

Click Source Link

Usage

From source file:SimpleNaiveBayesClassifier.java

License:Apache License

/**
 * Calculate probabilities for all classes for a given input text.
 *
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument)
        throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();

    // MultiFields.getTerms returns null when the field has no indexed terms;
    // guard against it so a missing class field yields an empty result instead of an NPE.
    Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    if (classes != null) {
        TermsEnum classesEnum = classes.iterator();
        BytesRef next;
        String[] tokenizedText = tokenize(inputDocument);
        int docsWithClassSize = countDocsWithClass();
        while ((next = classesEnum.next()) != null) {
            // skip the empty term, which does not denote a real class label
            if (next.length > 0) {
                Term term = new Term(this.classFieldName, next);
                // log(prior) + log(likelihood) == log(prior * likelihood); computed in
                // log space, presumably to avoid floating-point underflow on long inputs
                double clVal = calculateLogPrior(term, docsWithClassSize)
                        + calculateLogLikelihood(tokenizedText, term, docsWithClassSize);
                assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal));
            }
        }
    }

    // normalization; the values are transformed to a 0-1 range
    return normClassificationResults(assignedClasses);
}

From source file:SimpleNaiveBayesDocumentClassifier.java

License:Apache License

/**
 * Scores every known class against the given seed document and returns the
 * normalized results.
 *
 * @param inputDocument the document to classify
 * @return a normalized {@code ClassificationResult} per class (empty if the
 *         class field has no terms)
 * @throws IOException if index access fails
 */
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>();
    Map<String, Float> fieldName2boost = new LinkedHashMap<>();
    // MultiFields.getTerms returns null when the class field has no indexed terms;
    // return an (empty) normalized result instead of hitting an NPE below.
    Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    if (classes == null) {
        return normClassificationResults(assignedClasses);
    }
    TermsEnum classesEnum = classes.iterator();
    BytesRef c;

    // populates the token arrays and per-field boosts from the seed document
    analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost);

    int docsWithClassSize = countDocsWithClass();
    while ((c = classesEnum.next()) != null) {
        double classScore = 0;
        Term term = new Term(this.classFieldName, c);
        for (String fieldName : textFieldNames) {
            List<String[]> tokensArrays = fieldName2tokensArray.get(fieldName);
            double fieldScore = 0;
            for (String[] fieldTokensArray : tokensArrays) {
                // NOTE(review): the boost multiplies only the likelihood term, not the
                // prior, and the prior is re-added once per tokens array — confirm this
                // weighting is intended before changing it.
                fieldScore += calculateLogPrior(term, docsWithClassSize)
                        + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize)
                                * fieldName2boost.get(fieldName);
            }
            classScore += fieldScore;
        }
        assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore));
    }
    return normClassificationResults(assignedClasses);
}

From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java

License:Open Source License

/**
 * Collects the entity ids of all live documents containing the given blocking key.
 *
 * @param docIds      maps a Lucene doc id to its entity id
 * @param iReader     the index to search
 * @param blockingKey the key to look up under {@code VALUE_LABEL}
 * @return the matching entity ids, or {@code null} if index access fails
 */
protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) {
    try {
        final Term keyTerm = new Term(VALUE_LABEL, blockingKey);
        final List<Integer> entityIds = new ArrayList<>();
        if (iReader.docFreq(keyTerm) > 0) {
            final DocsEnum postings = MultiFields.getTermDocsEnum(iReader, MultiFields.getLiveDocs(iReader),
                    VALUE_LABEL, keyTerm.bytes());
            // walk the postings list and translate each doc id to its entity id
            for (int doc = postings.nextDoc(); doc != DocsEnum.NO_MORE_DOCS; doc = postings.nextDoc()) {
                entityIds.add(docIds[doc]);
            }
        }
        return entityIds;
    } catch (IOException ex) {
        ex.printStackTrace();
        return null;
    }
}

From source file:BlockBuilding.SortedNeighborhoodBlocking.java

License:Apache License

/**
 * Collects the entity ids of all documents containing the given blocking key.
 *
 * @param docIds      maps a Lucene doc id to its entity id
 * @param iReader     the index to search
 * @param blockingKey the key to look up under {@code VALUE_LABEL}
 * @return the matching entity ids, or {@code null} if index access fails
 */
protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) {
    try {
        final Term keyTerm = new Term(VALUE_LABEL, blockingKey);
        final List<Integer> entityIds = new ArrayList<>();
        if (iReader.docFreq(keyTerm) > 0) {
            final PostingsEnum postings = MultiFields.getTermDocsEnum(iReader, VALUE_LABEL, keyTerm.bytes());
            // walk the postings list and translate each doc id to its entity id
            for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                entityIds.add(docIds[doc]);
            }
        }
        return entityIds;
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        return null;
    }
}

From source file:br.ufmt.periscope.indexer.resources.search.FastJoinTermEnum.java

/**
 * Creates a filtered terms enumeration that starts iterating at {@code name}.
 *
 * @param tenum the underlying terms enumeration to wrap
 * @param name  the term whose bytes are used as the initial seek position
 * @param ts    the fuzzy similarity used by this enumeration
 */
public FastJoinTermEnum(TermsEnum tenum, Term name, FuzzyTokenSimilarity ts) {
    super(tenum);
    this.name = name.bytes();
    this.ts = ts;
    // position the wrapped enumeration at the stored term bytes
    setInitialSeekTerm(this.name);
}

From source file:com.ifactory.press.db.solr.HitCount.java

License:Apache License

/**
 * Parses the hitcount() function: sums the term frequencies of every term in
 * the request's main query, optionally restricted to the field names given as
 * arguments.
 *
 * @param fp the function query parser supplying arguments and request params
 * @return a value source summing the per-term frequency sources
 * @throws SyntaxError if the nested query cannot be parsed
 */
@Override
public ValueSource parse(FunctionQParser fp) throws SyntaxError {
    // hitcount() takes no arguments. If we wanted to pass a query
    // we could call fp.parseNestedQuery()
    final HashSet<String> fieldFilter = new HashSet<String>();
    while (fp.hasMoreArguments()) {
        fieldFilter.add(fp.parseArg());
    }

    final Query mainQuery = fp.subQuery(fp.getParams().get("q"), "lucene").getQuery();
    final HashSet<Term> queryTerms = new HashSet<Term>();
    try {
        mainQuery.extractTerms(queryTerms);
    } catch (UnsupportedOperationException e) {
        // queries that cannot enumerate their terms fall back to a constant score
        return new DoubleConstValueSource(1);
    }

    // one frequency source per term, restricted to the requested fields (if any)
    final ArrayList<ValueSource> termCounts = new ArrayList<ValueSource>();
    for (Term term : queryTerms) {
        if (fieldFilter.isEmpty() || fieldFilter.contains(term.field())) {
            termCounts.add(new TermFreqValueSource(term.field(), term.text(), term.field(), term.bytes()));
        }
    }
    return new SumFloatFunction(termCounts.toArray(new ValueSource[termCounts.size()]));
}

From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java

License:Open Source License

/**
 * Prepares term statistics (per-term log document frequencies) for highlighting
 * the given query's matches.
 *
 * @param jochreQuery   the query wrapper whose Lucene query is highlighted
 * @param indexSearcher the searcher whose index supplies the statistics
 * @throws RuntimeException wrapping any {@code IOException} from index access
 */
public LuceneQueryHighlighter(JochreQuery jochreQuery, IndexSearcher indexSearcher) {
    try {
        this.indexSearcher = indexSearcher;
        this.jochreQuery = jochreQuery;
        query = rewrite(jochreQuery.getLuceneQuery());
        queryTerms = new TreeSet<Term>();
        query.extractTerms(queryTerms);
        if (LOG.isTraceEnabled())
            queryTermList = new ArrayList<Term>(queryTerms);

        final IndexReader reader = indexSearcher.getIndexReader();
        // add 1 to doc count to ensure even terms in all docs get a very small weight
        docCountLog = Math.log(reader.numDocs() + 1);

        IndexReaderContext readerContext = reader.getContext();
        leaves = readerContext.leaves();

        // since the same terms might be contained in the query multiple times (e.g. once per field)
        // we only consider them once each by using a HashSet
        terms = new HashSet<BytesRef>();
        // running per-bytes doc-frequency totals, initialized to zero here
        Map<BytesRef, Integer> termFreqs = new HashMap<BytesRef, Integer>();
        for (Term term : queryTerms) {
            terms.add(term.bytes());
            termFreqs.put(term.bytes(), 0);
        }

        termLogs = new HashMap<BytesRef, Double>();
        // accumulate docFreq per term bytes: terms with the same text in different
        // fields share a key, so their frequencies are summed
        for (Term term : queryTerms) {
            int freq = termFreqs.get(term.bytes());
            freq += reader.docFreq(term);
            termFreqs.put(term.bytes(), freq);
        }
        // convert accumulated frequencies to log space for weighting
        for (BytesRef term : terms) {
            int freq = termFreqs.get(term);
            termLogs.put(term, Math.log(freq));
        }
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}

From source file:com.lucene.MyPrefixQuery.java

License:Apache License

/**
 * Creates a query matching all terms that start with {@code prefix}.
 *
 * @param prefix the common prefix of all matching terms; must not be null
 * @throws NullPointerException if {@code prefix} is null
 */
public MyPrefixQuery(Term prefix) {
    // The null check must live inside the super(...) call: Java requires super()
    // to be the first statement, so the original post-super check was unreachable —
    // prefix.bytes() would already have thrown a bare NPE without the message.
    super(java.util.Objects.requireNonNull(prefix, "prefix must not be null"),
            toAutomaton(prefix.bytes()), Integer.MAX_VALUE, true);
}

From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java

License:Apache License

/**
 * Produces a single-entry prefix suggestion for the analyzed input, or
 * {@code null} when suggesting is not possible (multi-term input, too-short
 * prefix, or no matching terms).
 *
 * @param name        the suggestion name
 * @param suggestion  the prefix suggestion context (field, size, min length)
 * @param indexReader the reader supplying candidate terms
 * @param spare       scratch buffer for analysis
 * @return the suggestion, or {@code null} if none can be made
 * @throws IOException if index access fails
 */
@Override
protected Suggest.Suggestion<Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>> innerExecute(
        String name, PrefixSuggestionContext suggestion, IndexReader indexReader, CharsRefBuilder spare)
        throws IOException {
    final List<Term> analyzed = queryTerms(suggestion, spare);

    // Accurate phrase suggestions are expensive, so only single-term inputs are handled.
    if (analyzed.size() != 1) {
        return null;
    }

    final Term text = analyzed.get(0);
    // A very short prefix ("a", "") would match most of the dictionary; skip it.
    if (text.bytes().length < suggestion.getMinPrefixLength()) {
        return null;
    }

    final PrefixTermsEnum prefixTermsEnum = terms(indexReader, text);
    if (prefixTermsEnum == null) {
        // the field does not exist, or no term starts with this prefix
        return null;
    }

    final int size = suggestion.getSize();
    return singleEntrySuggestion(name, size, getOptions(text, prefixTermsEnum, indexReader, size));
}

From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java

License:Apache License

/**
 * Builds a suggestion entry containing up to {@code size} options, one per
 * term matching the prefix, weighted by document frequency.
 *
 * @param text            the prefix term (field + analyzed prefix text)
 * @param prefixTermsEnum enumeration over the matching terms
 * @param indexReader     used to look up each candidate's document frequency
 * @param size            the maximum number of options to keep
 * @return the populated suggestion entry
 * @throws IOException if index access fails
 */
private Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> getOptions(Term text,
        PrefixTermsEnum prefixTermsEnum, IndexReader indexReader, final int size) throws IOException {
    final OptionQueue collectionQueue = new OptionQueue(size);

    // Walk every term matching the prefix, keeping only the top `size` options.
    for (BytesRef match = prefixTermsEnum.next(); match != null; match = prefixTermsEnum.next()) {
        // deepCopyOf: the enum is free to reuse its BytesRef, so copy before storing
        final Term candidate = new Term(text.field(), BytesRef.deepCopyOf(match));
        collectionQueue.insertWithOverflow(new Suggest.Suggestion.Entry.Option(
                new StringText(candidate.bytes().utf8ToString()), indexReader.docFreq(candidate)));
    }

    final Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> entry = new Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>(
            new StringText(text.text()), 0, text.bytes().length);
    while (collectionQueue.size() > 0) {
        entry.addOption(collectionQueue.pop());
    }
    return entry;
}