Example usage for org.apache.lucene.util.fst Util get

List of usage examples for org.apache.lucene.util.fst Util get

Introduction

On this page you can find example usages of the org.apache.lucene.util.fst.Util.get method.

Prototype

public static <T> T get(FST<T> fst, BytesRef input) throws IOException 

Source Link

Document

Looks up the output for this input, or returns null if the input is not accepted.

Usage

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * {@inheritDoc}/*www .ja  va  2s  .c o  m*/
 */
@Override
public ClassificationResult<Boolean> assignClass(String text) throws IOException {
    if (textTerms == null) {
        throw new IOException("You must first call Classifier#train");
    }
    Long output = 0l;
    try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String s = charTermAttribute.toString();
            Long d = Util.get(fst, new BytesRef(s));
            if (d != null) {
                output += d;
            }
        }
        tokenStream.end();
    }

    return new ClassificationResult<>(output >= threshold, output.doubleValue());
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * Adjusts the weight of every term found in the stored term vector of one document.
 *
 * <p>For each term, the new weight is the term's current FST output plus
 * {@code modifier * termFreq}, where {@code termFreq} is the value the term-vector enum reports
 * from {@code totalTermFreq()}. A term the FST does not accept has no current output
 * ({@code Util.get} returns {@code null}); it is treated as having weight 0 instead of failing
 * with a {@code NullPointerException} on unboxing.
 *
 * @param leafReader    reader used to fetch the document's term vector
 * @param docId         id of the document whose terms are processed
 * @param assignedClass when {@code null}, no weights are written (the terms are still iterated)
 * @param weights       map receiving the updated weight per term string
 * @param modifier      factor multiplied by the term frequency and added to the previous weight
 * @param updateFST     when {@code true}, {@code updateFST(weights)} is called after the loop
 * @throws IOException if no term vector is stored for {@code textFieldName} in this document,
 *         or if reading terms or the FST fails
 */
private void updateWeights(LeafReader leafReader, int docId, Boolean assignedClass,
        SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
    TermsEnum cte = textTerms.iterator();

    // get the doc term vectors
    Terms terms = leafReader.getTermVector(docId, textFieldName);

    if (terms == null) {
        throw new IOException("term vectors must be stored for field " + textFieldName);
    }

    TermsEnum termsEnum = terms.iterator();

    BytesRef term;

    while ((term = termsEnum.next()) != null) {
        // NOTE(review): the result of this seek is never read below; kept to preserve the
        // original call sequence - confirm whether it is still needed.
        cte.seekExact(term);
        if (assignedClass != null) {
            long termFreqLocal = termsEnum.totalTermFreq();
            // update weights
            Long previousValue = Util.get(fst, term);
            // A term unknown to the FST has no previous weight: start from 0.
            double previousWeight = previousValue == null ? 0d : previousValue;
            String termString = term.utf8ToString();
            weights.put(termString, previousWeight + modifier * termFreqLocal);
        }
    }
    if (updateFST) {
        updateFST(weights);
    }
}

From source file:com.rocana.lucene.codec.v1.RocanaSegmentTermsEnum.java

License:Apache License

/**
 * Debugging aid: prints the current seek state to {@code out} and sanity-checks it.
 *
 * <p>If {@code currentFrame} is {@code staticFrame}, only "no prior seek" is printed.
 * Otherwise one line is printed per frame, starting at ord 0 and stopping once the frame equal
 * to {@code currentFrame} has been handled. When {@code fr.index} is non-null, each frame is
 * also checked against the index, and any inconsistency is reported on {@code out} before a
 * {@code RuntimeException("seek state is broken")} is thrown.
 *
 * @param out stream receiving the diagnostic output
 * @throws IOException if looking up a prefix in the index fails
 */
@SuppressWarnings("unused")
private void printSeekState(PrintStream out) throws IOException {
    if (currentFrame == staticFrame) {
        out.println("  no prior seek");
    } else {
        out.println("  prior seek state:");
        int ord = 0;
        // Frames are labelled "(seek...)" until one whose prefix equals validIndexPrefix has been
        // processed; later frames are labelled "(next...)".
        boolean isSeekFrame = true;
        while (true) {
            RocanaSegmentTermsEnumFrame f = getFrame(ord);
            assert f != null;
            // The first f.prefix bytes of the current term form this frame's prefix.
            final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix);
            // Presumably nextEnt == -1 marks a frame whose block is not loaded yet (the other
            // branch labels its frames "loaded") - confirm against the frame class.
            if (f.nextEnt == -1) {
                // In this branch the inner (f.nextEnt == -1 ? ...) always yields "".
                out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp
                        + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix
                        + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")"))
                        + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code="
                        + ((f.fp << RocanaBlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
                                + (f.hasTerms ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
                                + (f.isFloor ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0))
                        + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd="
                        + f.getTermBlockOrd());
            } else {
                // In this branch the inner (f.nextEnt == -1 ? ...) always yields the
                // " (of <entCount>)" suffix; nextEnt and lastSubFP are printed as well.
                out.println("    frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord
                        + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen="
                        + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt
                        + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms
                        + " isFloor=" + f.isFloor + " code="
                        + ((f.fp << RocanaBlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
                                + (f.hasTerms ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
                                + (f.isFloor ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0))
                        + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto="
                        + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
            }
            // Consistency checks are only possible when a terms index is available.
            if (fr.index != null) {
                assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc;
                // For a seek frame with a non-empty prefix, the arc label must equal the last
                // prefix byte of the term (compared as an unsigned value).
                if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) {
                    out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte="
                            + (char) (term.byteAt(f.prefix - 1) & 0xFF));
                    throw new RuntimeException("seek state is broken");
                }
                // Util.get returns null when the index does not accept the prefix.
                BytesRef output = Util.get(fr.index, prefix);
                if (output == null) {
                    out.println("      broken seek state: prefix is not final in index");
                    throw new RuntimeException("seek state is broken");
                } else if (isSeekFrame && !f.isFloor) {
                    // Decode the leading vLong of the index output and compare it with the code
                    // recomputed from the frame's fp and flags.
                    final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset,
                            output.length);
                    final long codeOrig = reader.readVLong();
                    // NOTE(review): the printed "code=" above combines the parts with '+', this
                    // one with '|'; presumably equivalent because the flags fit in the shifted-out
                    // low bits - confirm against the flag constants.
                    final long code = (f.fp << RocanaBlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS)
                            | (f.hasTerms ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0)
                            | (f.isFloor ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0);
                    if (codeOrig != code) {
                        out.println("      broken seek state: output code=" + codeOrig
                                + " doesn't match frame code=" + code);
                        throw new RuntimeException("seek state is broken");
                    }
                }
            }
            // Stop after the current frame has been printed and checked.
            if (f == currentFrame) {
                break;
            }
            // Every frame after this one is reported as a "(next)" frame.
            if (f.prefix == validIndexPrefix) {
                isSeekFrame = false;
            }
            ord++;
        }
    }
}

From source file:examples.fst.FstTest.java

/**
 * Builds a small FST mapping three words to long outputs, then prints the output looked up for
 * "dog" and the key looked up for output 12.
 *
 * @param args unused
 * @throws IOException if building or querying the FST fails
 */
public static void main(String[] args) throws IOException {
    // Keys to index. The Builder requires them in Unicode sorted order!
    final String[] keys = { "cat", "dog", "dogs" };
    final long[] values = { 5, 7, 12 };

    final Builder<Long> fstBuilder = new Builder<>(INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
    final BytesRefBuilder bytesScratch = new BytesRefBuilder();
    final IntsRefBuilder intsScratch = new IntsRefBuilder();

    int position = 0;
    for (String key : keys) {
        bytesScratch.copyChars(key);
        IntsRef input = Util.toIntsRef(bytesScratch.toBytesRef(), intsScratch);
        fstBuilder.add(input, values[position]);
        position++;
    }
    final FST<Long> fst = fstBuilder.finish();

    // Forward lookup: key -> output.
    Long dogOutput = Util.get(fst, new BytesRef("dog"));
    System.out.println(dogOutput); // 7

    // Reverse lookup: output -> key. Only works because outputs are also in sorted order
    IntsRef keyForTwelve = Util.getByOutput(fst, 12);
    String decodedKey = Util.toBytesRef(keyForTwelve, bytesScratch).utf8ToString();
    System.out.println(decodedKey); // dogs
}