Example usage for org.apache.lucene.util.fst Util toIntsRef

List of usage examples for org.apache.lucene.util.fst Util toIntsRef

Introduction

On this page you can find usage examples for the toIntsRef method of org.apache.lucene.util.fst.Util.

Prototype

public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch) 

Source Link

Document

Takes the unsigned byte values from the BytesRef and converts them into an IntsRef.

Usage

From source file:BuildFST.java

License:Apache License

/**
 * Builds an FST from the command-line arguments and writes it to standard output via
 * {@code Util.toDot}.
 *
 * <p>Each argument is either {@code input} or {@code input/output}, where the output is
 * the text after the last {@code '/'}. If every supplied output parses as a {@code long},
 * the FST is built with {@code PositiveIntOutputs}; as soon as one output fails to parse,
 * all outputs are treated as byte sequences ({@code ByteSequenceOutputs}). Arguments
 * without a {@code '/'} receive the outputs' "no output" value.
 *
 * @param args the {@code input} or {@code input/output} entries; they are sorted with
 *             {@code Arrays.sort} before being added to the FST
 * @throws IOException propagated from the FST builder or the dot writer
 * @throws IllegalArgumentException if all outputs are numeric and at least one is negative
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException {

    // First pass: find out whether every explicit output is a long, and whether any is negative.
    boolean numeric = true;
    boolean negative = false;
    for (String arg : args) {
        int slash = arg.lastIndexOf('/');
        if (slash == -1) {
            continue;
        }
        try {
            negative |= Long.parseLong(arg.substring(slash + 1)) < 0;
        } catch (NumberFormatException ignored) {
            // One non-numeric output switches the whole FST to byte-sequence outputs.
            numeric = false;
            break;
        }
    }

    Outputs outputs;
    if (numeric) {
        if (negative) {
            throw new IllegalArgumentException("can only handle numeric outputs >= 0");
        }
        outputs = PositiveIntOutputs.getSingleton();
    } else {
        outputs = ByteSequenceOutputs.getSingleton();
    }

    // Second pass: split every argument into its input bytes and its typed output.
    Pair<?>[] inputs = new Pair[args.length];
    for (int i = 0; i < args.length; i++) {
        int slash = args[i].lastIndexOf('/');
        String input;
        Object output;
        if (slash == -1) {
            output = outputs.getNoOutput();
            input = args[i];
        } else {
            input = args[i].substring(0, slash);
            String outputString = args[i].substring(slash + 1);
            if (numeric) {
                output = Long.parseLong(outputString);
            } else {
                output = new BytesRef(outputString);
            }
        }
        inputs[i] = new Pair(new BytesRef(input), output);
    }
    Arrays.sort(inputs);

    // A single scratch buffer is reused for every input instead of allocating one per entry.
    IntsRefBuilder scratch = new IntsRefBuilder();
    FST<?> fst;
    if (numeric) {
        Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            b.add(Util.toIntsRef(pair.input, scratch), (Long) pair.output);
        }
        fst = b.finish();
    } else {
        Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair pair : inputs) {
            b.add(Util.toIntsRef(pair.input, scratch), (BytesRef) pair.output);
        }
        fst = b.finish();
    }

    PrintWriter out = new PrintWriter(System.out);
    Util.toDot(fst, out, true, true);
    // The writer is not auto-flushing; flush so nothing stays buffered when main returns.
    out.flush();
}

From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java

License:Apache License

/**
 * Rebuilds the {@code fst} field from the given weights: every key is added as an FST
 * input, and its weight, converted with {@code longValue()}, is added as the output.
 *
 * @param weights the entries to add, visited in the map's iteration order
 * @throws IOException declared by the FST builder calls
 */
private void updateFST(SortedMap<String, Double> weights) throws IOException {
    final IntsRefBuilder keyInts = new IntsRefBuilder();
    final BytesRefBuilder keyBytes = new BytesRefBuilder();
    final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());

    for (Map.Entry<String, Double> weight : weights.entrySet()) {
        // Fractional part of the weight is dropped here.
        final long output = weight.getValue().longValue();
        keyBytes.copyChars(weight.getKey());
        builder.add(Util.toIntsRef(keyBytes.get(), keyInts), output);
    }

    fst = builder.finish();
}

From source file:examples.fst.FstTest.java

/**
 * Builds a three-entry FST ("cat", "dog", "dogs") with numeric outputs, then performs
 * one lookup by input and one lookup by output, printing each result.
 *
 * @param args unused
 * @throws IOException declared by the FST builder and lookup calls
 */
public static void main(String[] args) throws IOException {
    // Keys handed to the Builder must already be in Unicode sorted order.
    final String[] keys = { "cat", "dog", "dogs" };
    final long[] values = { 5, 7, 12 };

    final BytesRefBuilder bytesScratch = new BytesRefBuilder();
    final IntsRefBuilder intsScratch = new IntsRefBuilder();
    final Builder<Long> fstBuilder = new Builder<Long>(INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());

    int idx = 0;
    while (idx < keys.length) {
        bytesScratch.copyChars(keys[idx]);
        fstBuilder.add(Util.toIntsRef(bytesScratch.toBytesRef(), intsScratch), values[idx]);
        idx++;
    }
    final FST<Long> fst = fstBuilder.finish();

    // Look up an output by its input key.
    final Long dogOutput = Util.get(fst, new BytesRef("dog"));
    System.out.println(dogOutput); // 7

    // Reverse lookup: only works because outputs are also in sorted order
    final IntsRef dogsKey = Util.getByOutput(fst, 12);
    System.out.println(Util.toBytesRef(dogsKey, bytesScratch).utf8ToString()); // dogs

}

From source file:org.elasticsearch.index.fielddata.plain.FSTBytesIndexFieldData.java

License:Apache License

/**
 * Loads the field data for the reader of the given context.
 *
 * <p>If the field has no terms, an empty {@code FSTBytesAtomicFieldData} sized by
 * {@code reader.maxDoc()} is returned. Otherwise every term produced by
 * {@code filter(terms, reader)} is added to an FST whose output is the term's ordinal,
 * and every document of that term is registered with the {@code OrdinalsBuilder}. The
 * result wraps the finished FST together with the built ordinals.
 *
 * <p>The estimator is notified of the loaded size only when loading succeeded; the
 * {@code OrdinalsBuilder} is closed in all cases.
 *
 * @param context the reader context whose reader is loaded
 * @return the loaded field data, or an empty instance when the field has no terms
 * @throws Exception propagated from the reader, FST builder or ordinals builder calls
 */
@Override
public FSTBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception {
    AtomicReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    FSTBytesAtomicFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker());
    if (terms == null) {
        // No terms for this field: report and return an empty instance.
        data = FSTBytesAtomicFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.getMemorySizeInBytes());
        return data;
    }
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    org.apache.lucene.util.fst.Builder<Long> fstBuilder = new org.apache.lucene.util.fst.Builder<Long>(
            INPUT_TYPE.BYTE1, outputs);
    // Scratch buffer reused for the int form of every term.
    final IntsRef scratch = new IntsRef();

    // The term count from the index is only passed on when no regex/frequency setting is
    // present; otherwise -1 is passed (presumably because filtering makes the final
    // count unknown - confirm against filter()).
    final long numTerms;
    if (regex == null && frequency == null) {
        numTerms = terms.size();
    } else {
        numTerms = -1;
    }
    final float acceptableTransientOverheadRatio = fieldDataType.getSettings().getAsFloat(
            "acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
    OrdinalsBuilder builder = new OrdinalsBuilder(numTerms, reader.maxDoc(), acceptableTransientOverheadRatio);
    boolean success = false;
    try {

        // We don't store an ord 0 in the FST, since there could be an empty string in there and
        // the FST doesn't support empty strings twice, i.e. the merge fails for long output.
        TermsEnum termsEnum = filter(terms, reader);
        // Passed back into termsEnum.docs(...) on each iteration.
        DocsEnum docsEnum = null;
        for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
            final long termOrd = builder.nextOrdinal();
            assert termOrd > 0;
            // Map the term to its ordinal in the FST.
            fstBuilder.add(Util.toIntsRef(term, scratch), (long) termOrd);
            docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
            // Register every document of this term with the ordinals builder.
            for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                builder.addDoc(docId);
            }
        }

        FST<Long> fst = fstBuilder.finish();

        final Ordinals ordinals = builder.build(fieldDataType.getSettings());

        data = new FSTBytesAtomicFieldData(fst, ordinals);
        success = true;
        return data;
    } finally {
        // Only report the loaded size when data was fully built; always close the builder.
        if (success) {
            estimator.afterLoad(null, data.getMemorySizeInBytes());
        }
        builder.close();
    }
}

From source file:org.meresco.lucene.numerate.FSTdict.java

License:Open Source License

/**
 * Adds one entry to the FST under construction: the bytes of {@code uri} as the input
 * and {@code ord}, widened to a {@code long}, as the output.
 *
 * @param uri the key to add
 * @param ord the ordinal stored for the key
 * @throws IOException declared by the FST builder's {@code add}
 */
public void put(String uri, int ord) throws IOException {
    final long ordinal = ord;
    // Reset the shared scratch before the key is converted into it.
    scratch.clear();
    final BytesRef uriBytes = new BytesRef(uri);
    this.fstbuilder.add(Util.toIntsRef(uriBytes, scratch), ordinal);
}