List of usage examples for org.apache.lucene.util.fst.Util.toIntsRef
public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch)
From source file:BuildFST.java
License:Apache License
/**
 * Builds an FST from the command-line arguments and writes it in dot format to standard output.
 *
 * <p>Each argument is either {@code input} or {@code input/output}, where the output is the text
 * following the last {@code '/'}. When every supplied output parses as a {@code long}, numeric
 * outputs are used; otherwise every output is stored as a byte sequence.
 *
 * @param args the entries to add to the FST
 * @throws IOException propagated from building or printing the FST
 * @throws RuntimeException if the outputs are numeric and at least one of them is negative
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException {
    // Pass 1: decide whether the outputs can all be treated as non-negative longs.
    boolean allNumeric = true;
    boolean sawNegative = false;
    for (String arg : args) {
        final int slash = arg.lastIndexOf('/');
        if (slash == -1) {
            continue; // no explicit output on this argument
        }
        try {
            if (Long.parseLong(arg.substring(slash + 1)) < 0) {
                sawNegative = true;
            }
        } catch (NumberFormatException nfe) {
            // A single non-numeric output switches the whole run to byte-sequence outputs.
            allNumeric = false;
            break;
        }
    }

    Outputs outputs;
    if (!allNumeric) {
        outputs = ByteSequenceOutputs.getSingleton();
    } else {
        if (sawNegative) {
            throw new RuntimeException("can only handle numeric outputs >= 0");
        }
        outputs = PositiveIntOutputs.getSingleton();
    }

    // Pass 2: split every argument into its input bytes and its output value.
    Pair<?>[] inputs = new Pair[args.length];
    for (int idx = 0; idx < args.length; idx++) {
        final String arg = args[idx];
        final int slash = arg.lastIndexOf('/');
        final String input;
        final Object output;
        if (slash != -1) {
            input = arg.substring(0, slash);
            final String outputText = arg.substring(slash + 1);
            if (allNumeric) {
                output = Long.valueOf(outputText);
            } else {
                output = new BytesRef(outputText);
            }
        } else {
            output = outputs.getNoOutput();
            input = arg;
        }
        inputs[idx] = new Pair(new BytesRef(input), output);
    }

    // Entries are sorted before being handed to the builder (ordering comes from Pair itself).
    Arrays.sort(inputs);

    FST<?> fst;
    if (allNumeric) {
        Builder<Long> longBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair entry : inputs) {
            longBuilder.add(Util.toIntsRef(entry.input, new IntsRefBuilder()), (Long) entry.output);
        }
        fst = longBuilder.finish();
    } else {
        Builder<BytesRef> bytesBuilder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
        for (Pair entry : inputs) {
            bytesBuilder.add(Util.toIntsRef(entry.input, new IntsRefBuilder()), (BytesRef) entry.output);
        }
        fst = bytesBuilder.finish();
    }

    PrintWriter dotWriter = new PrintWriter(System.out);
    Util.toDot(fst, dotWriter, true, true);
}
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
private void updateFST(SortedMap<String, Double> weights) throws IOException { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String, Double> entry : weights.entrySet()) { scratchBytes.copyChars(entry.getKey()); fstBuilder.add(Util.toIntsRef(scratchBytes.get(), scratchInts), entry.getValue().longValue()); }//from ww w . j ava 2s . com fst = fstBuilder.finish(); }
From source file:examples.fst.FstTest.java
public static void main(String[] args) throws IOException { // Input values (keys). These must be provided to Builder in Unicode sorted order! String inputValues[] = { "cat", "dog", "dogs" }; long outputValues[] = { 5, 7, 12 }; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (int i = 0; i < inputValues.length; i++) { scratchBytes.copyChars(inputValues[i]); builder.add(Util.toIntsRef(scratchBytes.toBytesRef(), scratchInts), outputValues[i]); }/*from ww w. j av a 2 s . co m*/ FST<Long> fst = builder.finish(); Long value = Util.get(fst, new BytesRef("dog")); System.out.println(value); // 7 // Only works because outputs are also in sorted order IntsRef key = Util.getByOutput(fst, 12); System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // dogs }
From source file:org.elasticsearch.index.fielddata.plain.FSTBytesIndexFieldData.java
License:Apache License
@Override public FSTBytesAtomicFieldData loadDirect(AtomicReaderContext context) throws Exception { AtomicReader reader = context.reader(); Terms terms = reader.terms(getFieldNames().indexName()); FSTBytesAtomicFieldData data = null; // TODO: Use an actual estimator to estimate before loading. NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker()); if (terms == null) { data = FSTBytesAtomicFieldData.empty(reader.maxDoc()); estimator.afterLoad(null, data.getMemorySizeInBytes()); return data; }//from ww w. java2 s . co m PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); org.apache.lucene.util.fst.Builder<Long> fstBuilder = new org.apache.lucene.util.fst.Builder<Long>( INPUT_TYPE.BYTE1, outputs); final IntsRef scratch = new IntsRef(); final long numTerms; if (regex == null && frequency == null) { numTerms = terms.size(); } else { numTerms = -1; } final float acceptableTransientOverheadRatio = fieldDataType.getSettings().getAsFloat( "acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO); OrdinalsBuilder builder = new OrdinalsBuilder(numTerms, reader.maxDoc(), acceptableTransientOverheadRatio); boolean success = false; try { // we don't store an ord 0 in the FST since we could have an empty string in there and FST don't support // empty strings twice. ie. them merge fails for long output. 
TermsEnum termsEnum = filter(terms, reader); DocsEnum docsEnum = null; for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) { final long termOrd = builder.nextOrdinal(); assert termOrd > 0; fstBuilder.add(Util.toIntsRef(term, scratch), (long) termOrd); docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE); for (int docId = docsEnum.nextDoc(); docId != DocsEnum.NO_MORE_DOCS; docId = docsEnum.nextDoc()) { builder.addDoc(docId); } } FST<Long> fst = fstBuilder.finish(); final Ordinals ordinals = builder.build(fieldDataType.getSettings()); data = new FSTBytesAtomicFieldData(fst, ordinals); success = true; return data; } finally { if (success) { estimator.afterLoad(null, data.getMemorySizeInBytes()); } builder.close(); } }
From source file:org.meresco.lucene.numerate.FSTdict.java
License:Open Source License
/**
 * Adds a URI to the FST under construction, with the given ordinal as its output.
 *
 * @param uri the key to add
 * @param ord the ordinal stored for the key (widened to {@code long})
 * @throws IOException propagated from the FST builder
 */
public void put(String uri, int ord) throws IOException {
    // Reset the shared scratch buffer before it receives the converted key.
    scratch.clear();
    final BytesRef uriBytes = new BytesRef(uri);
    final long ordinal = ord;
    this.fstbuilder.add(Util.toIntsRef(uriBytes, scratch), ordinal);
}