List of usage examples for org.apache.lucene.util.fst.Util#get
public static <T> T get(FST<T> fst, BytesRef input) throws IOException
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
/** * {@inheritDoc}/*www .ja va 2s .c o m*/ */ @Override public ClassificationResult<Boolean> assignClass(String text) throws IOException { if (textTerms == null) { throw new IOException("You must first call Classifier#train"); } Long output = 0l; try (TokenStream tokenStream = analyzer.tokenStream(textFieldName, text)) { CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { String s = charTermAttribute.toString(); Long d = Util.get(fst, new BytesRef(s)); if (d != null) { output += d; } } tokenStream.end(); } return new ClassificationResult<>(output >= threshold, output.doubleValue()); }
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
private void updateWeights(LeafReader leafReader, int docId, Boolean assignedClass, SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException { TermsEnum cte = textTerms.iterator(); // get the doc term vectors Terms terms = leafReader.getTermVector(docId, textFieldName); if (terms == null) { throw new IOException("term vectors must be stored for field " + textFieldName); }//www . j a v a 2s . c om TermsEnum termsEnum = terms.iterator(); BytesRef term; while ((term = termsEnum.next()) != null) { cte.seekExact(term); if (assignedClass != null) { long termFreqLocal = termsEnum.totalTermFreq(); // update weights Long previousValue = Util.get(fst, term); String termString = term.utf8ToString(); weights.put(termString, previousValue + modifier * termFreqLocal); } } if (updateFST) { updateFST(weights); } }
From source file:com.rocana.lucene.codec.v1.RocanaSegmentTermsEnum.java
License:Apache License
@SuppressWarnings("unused") private void printSeekState(PrintStream out) throws IOException { if (currentFrame == staticFrame) { out.println(" no prior seek"); } else {//from www .jav a2s .c o m out.println(" prior seek state:"); int ord = 0; boolean isSeekFrame = true; while (true) { RocanaSegmentTermsEnumFrame f = getFrame(ord); assert f != null; final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix); if (f.nextEnt == -1) { out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << RocanaBlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); } else { out.println(" frame " + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + " nextEnt=" + f.nextEnt + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp << RocanaBlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) + (f.isFloor ? 
RocanaBlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0)) + " lastSubFP=" + f.lastSubFP + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); } if (fr.index != null) { assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix - 1) & 0xFF)) { out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix - 1) & 0xFF)); throw new RuntimeException("seek state is broken"); } BytesRef output = Util.get(fr.index, prefix); if (output == null) { out.println(" broken seek state: prefix is not final in index"); throw new RuntimeException("seek state is broken"); } else if (isSeekFrame && !f.isFloor) { final ByteArrayDataInput reader = new ByteArrayDataInput(output.bytes, output.offset, output.length); final long codeOrig = reader.readVLong(); final long code = (f.fp << RocanaBlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) | (f.hasTerms ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS : 0) | (f.isFloor ? RocanaBlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR : 0); if (codeOrig != code) { out.println(" broken seek state: output code=" + codeOrig + " doesn't match frame code=" + code); throw new RuntimeException("seek state is broken"); } } } if (f == currentFrame) { break; } if (f.prefix == validIndexPrefix) { isSeekFrame = false; } ord++; } } }
From source file:examples.fst.FstTest.java
public static void main(String[] args) throws IOException { // Input values (keys). These must be provided to Builder in Unicode sorted order! String inputValues[] = { "cat", "dog", "dogs" }; long outputValues[] = { 5, 7, 12 }; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs); BytesRefBuilder scratchBytes = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (int i = 0; i < inputValues.length; i++) { scratchBytes.copyChars(inputValues[i]); builder.add(Util.toIntsRef(scratchBytes.toBytesRef(), scratchInts), outputValues[i]); }// ww w. jav a 2 s . c om FST<Long> fst = builder.finish(); Long value = Util.get(fst, new BytesRef("dog")); System.out.println(value); // 7 // Only works because outputs are also in sorted order IntsRef key = Util.getByOutput(fst, 12); System.out.println(Util.toBytesRef(key, scratchBytes).utf8ToString()); // dogs }