List of usage examples for org.apache.lucene.analysis.tokenattributes PositionLengthAttribute getPositionLength
public int getPositionLength();
From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License:Apache License
private String tokenizerToString(Tokenizer tokenizer) throws Exception { OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class); CharTermAttribute term = (CharTermAttribute) tokenizer.addAttribute(CharTermAttribute.class); TypeAttribute type = (TypeAttribute) tokenizer.addAttribute(TypeAttribute.class); SemanticClassAttribute semanticClass = (SemanticClassAttribute) tokenizer .addAttribute(SemanticClassAttribute.class); PartOfSpeechAttribute pos = (PartOfSpeechAttribute) tokenizer.addAttribute(PartOfSpeechAttribute.class); StringBuilder result = new StringBuilder(); while (tokenizer.incrementToken() == true) { result.append(new String(term.buffer(), 0, term.length())).append(":"); result.append(type.type()).append(":"); result.append(pos.partOfSpeech()).append(":"); result.append(semanticClass.semanticClass()).append(":"); result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":"); result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":"); result.append(String.valueOf(extOffset.startOffset())).append(":"); result.append(String.valueOf(extOffset.endOffset())); result.append(","); }//from w w w. jav a 2 s . c o m tokenizer.end(); return result.toString(); }
From source file:com.qwazr.search.index.TermDefinition.java
License:Apache License
TermDefinition(CharTermAttribute charTermAttr, FlagsAttribute flagsAttr, OffsetAttribute offsetAttr, PositionIncrementAttribute posIncAttr, PositionLengthAttribute posLengthAttr, TypeAttribute typeAttr, KeywordAttribute keywordAttr) {/*from ww w. j a v a 2s . c om*/ char_term = charTermAttr == null ? null : charTermAttr.toString(); if (offsetAttr != null) { start_offset = offsetAttr.startOffset(); end_offset = offsetAttr.endOffset(); } else { start_offset = null; end_offset = null; } flags = flagsAttr == null ? null : flagsAttr.getFlags(); position_increment = posIncAttr == null ? null : posIncAttr.getPositionIncrement(); position_length = posLengthAttr == null ? null : posLengthAttr.getPositionLength(); type = typeAttr == null ? null : typeAttr.type(); is_keyword = keywordAttr == null ? null : keywordAttr.isKeyword(); }
From source file:com.shaie.SynonymFilterExample.java
License:Apache License
@SuppressWarnings("resource") public static void main(String[] args) throws Exception { final Tokenizer tok = new WhitespaceTokenizer(); tok.setReader(new StringReader("dark sea green sea green")); final SynonymMap.Builder builder = new SynonymMap.Builder(true); addSynonym("dark sea green", "color", builder); addSynonym("green", "color", builder); addSynonym("dark sea", "color", builder); addSynonym("sea green", "color", builder); final SynonymMap synMap = builder.build(); final TokenStream ts = new SynonymGraphFilter(tok, synMap, true); final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class); ts.reset();// w ww .j a v a 2 s .c om int pos = -1; while (ts.incrementToken()) { pos += posIncrAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength()); } ts.end(); ts.close(); }
From source file:org.bitbucket.eunjeon.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License:Apache License
private String tokenizerToString(Tokenizer tokenizer) throws Exception { OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class); CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class); TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class); SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class); PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class); StringBuilder result = new StringBuilder(); tokenizer.reset();/*from ww w .ja v a2 s. c o m*/ while (tokenizer.incrementToken() == true) { result.append(new String(term.buffer(), 0, term.length())).append(":"); result.append(type.type()).append(":"); result.append(pos.partOfSpeech()).append(":"); result.append(semanticClass.semanticClass()).append(":"); result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":"); result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":"); result.append(String.valueOf(extOffset.startOffset())).append(":"); result.append(String.valueOf(extOffset.endOffset())); result.append(","); } tokenizer.end(); return result.toString(); }
From source file:uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java
License:Mozilla Public License
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException { assertNotNull(output);/*from ww w . j av a 2s . c o m*/ CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class); CharTermAttribute termAtt = null; if (output.length > 0) { assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class)); termAtt = ts.getAttribute(CharTermAttribute.class); } OffsetAttribute offsetAtt = null; if (startOffsets != null || endOffsets != null || finalOffset != null) { assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class)); offsetAtt = ts.getAttribute(OffsetAttribute.class); } TypeAttribute typeAtt = null; if (types != null) { assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class)); typeAtt = ts.getAttribute(TypeAttribute.class); } PositionIncrementAttribute posIncrAtt = null; if (posIncrements != null || finalPosInc != null) { assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class)); posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class); } PositionLengthAttribute posLengthAtt = null; if (posLengths != null) { assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class)); posLengthAtt = ts.getAttribute(PositionLengthAttribute.class); } KeywordAttribute keywordAtt = null; if (keywordAtts != null) { assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class)); keywordAtt = ts.getAttribute(KeywordAttribute.class); } // Maps position to the start/end offset: final Map<Integer, Integer> posToStartOffset = new HashMap<>(); final Map<Integer, Integer> posToEndOffset = new HashMap<>(); ts.reset(); int pos = -1; int lastStartOffset = 0; for (int i = 0; i < output.length; i++) { // extra safety to enforce, that the state is not preserved and also // assign bogus values ts.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243); if (typeAtt != null) typeAtt.setType("bogusType"); if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657); if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653); if (keywordAtt != null) keywordAtt.setKeyword((i & 1) == 0); checkClearAtt.getAndResetClearCalled(); // reset it, because we // called clearAttribute() // before assertTrue("token " + i + " does not exist", ts.incrementToken()); assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled()); assertEquals("term " + i, output[i], termAtt.toString()); if (startOffsets != null) { assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset()); } if (endOffsets != null) { assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset()); } if (types != null) { assertEquals("type " + i, types[i], typeAtt.type()); } if (posIncrements != null) { assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement()); } if (posLengths != null) { assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength()); } if (keywordAtts != null) { assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword()); } // we can enforce some basic things about a few attributes even if // the caller doesn't check: if (offsetAtt != null) { final int startOffset = offsetAtt.startOffset(); final int endOffset = offsetAtt.endOffset(); if (finalOffset != null) { assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue()); assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(), endOffset <= finalOffset.intValue()); } if (offsetsAreCorrect) { assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset); lastStartOffset = offsetAtt.startOffset(); } if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) { // Validate offset consistency in the graph, ie // all tokens leaving from a certain pos have the // same startOffset, and all tokens arriving to a // certain pos have the same endOffset: final int posInc = posIncrAtt.getPositionIncrement(); pos += posInc; final int posLength = posLengthAtt.getPositionLength(); if (!posToStartOffset.containsKey(pos)) { // First time we've seen a token leaving from this // position: posToStartOffset.put(pos, startOffset); // System.out.println(" + s " + pos + " -> " + // startOffset); } else { // We've seen a token leaving from this position // before; verify the startOffset is the same: // System.out.println(" + vs " + pos + " -> " + // startOffset); assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToStartOffset.get(pos).intValue(), startOffset); } final int endPos = pos + posLength; if (!posToEndOffset.containsKey(endPos)) { // First time we've seen a token arriving to this // position: posToEndOffset.put(endPos, endOffset); // System.out.println(" + e " + endPos + " -> " + // endOffset); } else { // We've seen a token arriving to this position // before; verify the endOffset is the same: // System.out.println(" + ve " + endPos + " -> " + // endOffset); assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt, posToEndOffset.get(endPos).intValue(), endOffset); } } } if (posIncrAtt != null) { if (i == 0) { assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1); } else { assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0); } } if (posLengthAtt != null) { assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1); } } if (ts.incrementToken()) { fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt.toString()); } // repeat our extra safety checks for end() ts.clearAttributes(); if (termAtt != null) termAtt.setEmpty().append("bogusTerm"); if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243); if (typeAtt != null) typeAtt.setType("bogusType"); if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657); if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653); checkClearAtt.getAndResetClearCalled(); // reset it, because we called // clearAttribute() before ts.end(); assertTrue("super.end()/clearAttributes() was not called correctly in end()", checkClearAtt.getAndResetClearCalled()); if (finalOffset != null) { assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset()); } if (offsetAtt != null) { assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0); } if (finalPosInc != null) { assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement()); } ts.close(); }