List of usage examples for org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement
public void setPositionIncrement(int positionIncrement);
From source file: analysis.AnalyzerUtils.java
License: Apache License
/**
 * Sets the position increment on the given attribute source.
 *
 * @param source  attribute source to update (the attribute is added if absent)
 * @param posIncr position increment value to store
 */
public static void setPositionIncrement(AttributeSource source, int posIncr) {
    source.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(posIncr);
}
From source file: com.rubenlaguna.en4j.searchlucene.AnalyzerUtils.java
License: Open Source License
/**
 * Sets the position increment on the given attribute source.
 *
 * @param source  attribute source to update (the attribute is added if absent)
 * @param posIncr position increment value to store
 */
public static void setPositionIncrement(AttributeSource source, int posIncr) {
    // addAttribute(Class) is generic and already returns PositionIncrementAttribute;
    // the original's explicit cast was redundant and has been removed.
    PositionIncrementAttribute attr = source.addAttribute(PositionIncrementAttribute.class);
    attr.setPositionIncrement(posIncr);
}
From source file: com.underthehood.weblogs.lucene.AutoPhrasingTokenFilter.java
License: Apache License
private void emit(char[] token) { //System.out.println("emit: " + new String(token)); if (replaceWhitespaceWith != null) { token = replaceWhiteSpace(token); }/*from www. j a v a2 s . co m*/ CharTermAttribute termAttr = getTermAttribute(); termAttr.setEmpty(); termAttr.append(new StringBuilder().append(token)); OffsetAttribute offAttr = getOffsetAttribute(); if (offAttr != null && offAttr.endOffset() >= token.length) { int start = offAttr.endOffset() - token.length; offAttr.setOffset(start, offAttr.endOffset()); } PositionIncrementAttribute pia = getPositionIncrementAttribute(); if (pia != null) { pia.setPositionIncrement(++positionIncr); } lastEmitted = token; }
From source file: nl.inl.blacklab.filter.AbstractSynonymFilter.java
License: Apache License
/**
 * Convenience helper: stores the given position increment on the source's
 * {@link PositionIncrementAttribute}, adding the attribute when missing.
 *
 * @param source  attribute source to modify
 * @param posIncr increment to record
 */
static void setPositionIncrement(AttributeSource source, int posIncr) {
    final PositionIncrementAttribute posAttr = source.addAttribute(PositionIncrementAttribute.class);
    posAttr.setPositionIncrement(posIncr);
}
From source file: nl.inl.blacklab.filter.StubTokenStream.java
License: Apache License
/**
 * Creates a stub token stream that will serve the supplied terms.
 * The position increment is fixed at 1, i.e. consecutive positions with
 * no holes and no stacked tokens.
 *
 * @param terms terms this stream will produce, in order
 */
public StubTokenStream(String[] terms) {
    this.terms = terms;
    ta = addAttribute(CharTermAttribute.class);
    addAttribute(PositionIncrementAttribute.class).setPositionIncrement(1);
}
From source file: org.apache.solr.analysis.SlowSynonymFilter.java
License: Apache License
/**
 * Advances to the next output token, expanding synonym matches.
 *
 * Tokens that match no synonym entry pass through on a fast path. When a
 * (possibly multi-token) match is found, the replacement tokens are generated
 * and interleaved with the original tokens (when includeOrig is set),
 * carefully adjusting position increments so both streams line up; generated
 * tokens are then replayed from {@code replacement} on subsequent calls.
 *
 * @return true if a token was produced, false at end of stream
 * @throws IOException if the underlying stream fails
 */
@Override
public boolean incrementToken() throws IOException {
    while (true) {
        // if there are any generated tokens, return them... don't try any
        // matches against them, as we specifically don't want recursion.
        if (replacement != null && replacement.hasNext()) {
            copy(this, replacement.next());
            return true;
        }
        // common case fast-path of first token not matching anything
        AttributeSource firstTok = nextTok();
        if (firstTok == null)
            return false;
        CharTermAttribute termAtt = firstTok.addAttribute(CharTermAttribute.class);
        SlowSynonymMap result = map.submap != null ? map.submap.get(termAtt.buffer(), 0, termAtt.length()) : null;
        if (result == null) {
            copy(this, firstTok);
            return true;
        }
        // fast-path failed, clone ourselves if needed
        if (firstTok == this)
            firstTok = cloneAttributes();
        // OK, we matched a token, so find the longest match.
        matched = new LinkedList<AttributeSource>();
        result = match(result);
        if (result == null) {
            // no match, simply return the first token read.
            copy(this, firstTok);
            return true;
        }
        // reuse, or create new one each time?
        ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>(
                result.synonyms.length + matched.size() + 1);
        //
        // there was a match... let's generate the new tokens, merging
        // in the matched tokens (position increments need adjusting)
        //
        AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
        boolean includeOrig = result.includeOrig();
        AttributeSource origTok = includeOrig ? firstTok : null;
        PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute(PositionIncrementAttribute.class);
        int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
        int repPos = 0; // curr position in replacement token stream
        int pos = 0; // current position in merged token stream
        for (int i = 0; i < result.synonyms.length; i++) {
            Token repTok = result.synonyms[i];
            AttributeSource newTok = firstTok.cloneAttributes();
            CharTermAttribute newTermAtt = newTok.addAttribute(CharTermAttribute.class);
            OffsetAttribute newOffsetAtt = newTok.addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute newPosIncAtt = newTok.addAttribute(PositionIncrementAttribute.class);
            // the generated token spans from the match start to the last matched token's end
            OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
            newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
            newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
            repPos += repTok.getPositionIncrement();
            if (i == 0)
                repPos = origPos; // make position of first token equal to original
            // if necessary, insert original tokens and adjust position increment
            while (origTok != null && origPos <= repPos) {
                PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPosInc.setPositionIncrement(origPos - pos);
                generated.add(origTok);
                pos += origPosInc.getPositionIncrement();
                origTok = matched.isEmpty() ? null : matched.removeFirst();
                if (origTok != null) {
                    origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                    origPos += origPosInc.getPositionIncrement();
                }
            }
            newPosIncAtt.setPositionIncrement(repPos - pos);
            generated.add(newTok);
            pos += newPosIncAtt.getPositionIncrement();
        }
        // finish up any leftover original tokens
        while (origTok != null) {
            PositionIncrementAttribute origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
            origPosInc.setPositionIncrement(origPos - pos);
            generated.add(origTok);
            pos += origPosInc.getPositionIncrement();
            origTok = matched.isEmpty() ? null : matched.removeFirst();
            if (origTok != null) {
                origPosInc = origTok.addAttribute(PositionIncrementAttribute.class);
                origPos += origPosInc.getPositionIncrement();
            }
        }
        // what if we replaced a longer sequence with a shorter one?
        // a/0 b/5 => foo/0
        // should I re-create the gap on the next buffered token?
        replacement = generated.iterator();
        // Now return to the top of the loop to read and return the first
        // generated token.. The reason this is done is that we may have generated
        // nothing at all, and may need to continue with more matching logic.
    }
}
From source file: org.apache.solr.schema.JsonPreAnalyzedParser.java
License: Apache License
/**
 * Parses a JSON pre-analyzed field value from the reader.
 *
 * The JSON document must carry a matching VERSION key, may carry a plain
 * string value and/or a Base64 binary value (but not both), and may carry a
 * list of token maps. Each token map's recognized keys (term text, offsets,
 * position increment, payload, flags, type) are written into the attributes
 * of {@code parent}, whose state is then captured into the result.
 *
 * @param reader source of the JSON text (read fully into memory)
 * @param parent attribute source used as scratch space; cleared before and
 *               after each token
 * @return the parse result with string/binary value and captured token states
 * @throws IOException on malformed JSON, missing or unknown version, or when
 *                     both string and binary values are present
 */
@SuppressWarnings("unchecked")
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
    ParseResult res = new ParseResult();
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    int cnt;
    while ((cnt = reader.read(buf)) > 0) {
        sb.append(buf, 0, cnt);
    }
    String val = sb.toString();
    // empty string - accept even without version number
    if (val.length() == 0) {
        return res;
    }
    Object o = ObjectBuilder.fromJSON(val);
    if (!(o instanceof Map)) {
        throw new IOException("Invalid JSON type " + o.getClass().getName() + ", expected Map");
    }
    Map<String, Object> map = (Map<String, Object>) o;
    // check version
    String version = (String) map.get(VERSION_KEY);
    if (version == null) {
        throw new IOException("Missing VERSION key");
    }
    if (!VERSION.equals(version)) {
        throw new IOException("Unknown VERSION '" + version + "', expected " + VERSION);
    }
    if (map.containsKey(STRING_KEY) && map.containsKey(BINARY_KEY)) {
        throw new IOException("Field cannot have both stringValue and binaryValue");
    }
    res.str = (String) map.get(STRING_KEY);
    String bin = (String) map.get(BINARY_KEY);
    if (bin != null) {
        byte[] data = Base64.base64ToByteArray(bin);
        res.bin = data;
    }
    List<Object> tokens = (List<Object>) map.get(TOKENS_KEY);
    if (tokens == null) {
        // no token stream part - string/binary value only
        return res;
    }
    int tokenStart = 0;
    int tokenEnd = 0;
    parent.clearAttributes();
    for (Object ot : tokens) {
        tokenStart = tokenEnd + 1; // automatic increment by 1 separator
        Map<String, Object> tok = (Map<String, Object>) ot;
        boolean hasOffsetStart = false;
        boolean hasOffsetEnd = false;
        int len = -1; // term length, used to derive the end offset when absent
        for (Entry<String, Object> e : tok.entrySet()) {
            String key = e.getKey();
            if (key.equals(TOKEN_KEY)) {
                // term text
                CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
                String str = String.valueOf(e.getValue());
                catt.append(str);
                len = str.length();
            } else if (key.equals(OFFSET_START_KEY)) {
                // explicit start offset; accepts a number or a numeric string,
                // invalid values are logged and skipped
                Object obj = e.getValue();
                hasOffsetStart = true;
                if (obj instanceof Number) {
                    tokenStart = ((Number) obj).intValue();
                } else {
                    try {
                        tokenStart = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + OFFSET_START_KEY + " attribute, skipped: '" + obj + "'");
                        hasOffsetStart = false;
                    }
                }
            } else if (key.equals(OFFSET_END_KEY)) {
                // explicit end offset; same leniency as the start offset
                hasOffsetEnd = true;
                Object obj = e.getValue();
                if (obj instanceof Number) {
                    tokenEnd = ((Number) obj).intValue();
                } else {
                    try {
                        tokenEnd = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + OFFSET_END_KEY + " attribute, skipped: '" + obj + "'");
                        hasOffsetEnd = false;
                    }
                }
            } else if (key.equals(POSINCR_KEY)) {
                // position increment, defaulting to 1 on invalid input
                Object obj = e.getValue();
                int posIncr = 1;
                if (obj instanceof Number) {
                    posIncr = ((Number) obj).intValue();
                } else {
                    try {
                        posIncr = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + POSINCR_KEY + " attribute, skipped: '" + obj + "'");
                    }
                }
                PositionIncrementAttribute patt = parent.addAttribute(PositionIncrementAttribute.class);
                patt.setPositionIncrement(posIncr);
            } else if (key.equals(PAYLOAD_KEY)) {
                // payload is Base64-encoded bytes; empty/undecodable payloads are dropped
                String str = String.valueOf(e.getValue());
                if (str.length() > 0) {
                    byte[] data = Base64.base64ToByteArray(str);
                    PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
                    if (data != null && data.length > 0) {
                        p.setPayload(new BytesRef(data));
                    }
                }
            } else if (key.equals(FLAGS_KEY)) {
                // flags are hex-encoded
                try {
                    int f = Integer.parseInt(String.valueOf(e.getValue()), 16);
                    FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
                    flags.setFlags(f);
                } catch (NumberFormatException nfe) {
                    LOG.warn("Invalid " + FLAGS_KEY + " attribute, skipped: '" + e.getValue() + "'");
                }
            } else if (key.equals(TYPE_KEY)) {
                TypeAttribute tattr = parent.addAttribute(TypeAttribute.class);
                tattr.setType(String.valueOf(e.getValue()));
            } else {
                LOG.warn("Unknown attribute, skipped: " + e.getKey() + "=" + e.getValue());
            }
        }
        // handle offset attr
        // no explicit end offset: derive it from the start and the term length
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        if (!hasOffsetEnd && len > -1) {
            tokenEnd = tokenStart + len;
        }
        offset.setOffset(tokenStart, tokenEnd);
        // NOTE(review): this advances tokenStart only when no explicit start was
        // given, but tokenStart is recomputed at the top of the loop anyway —
        // looks redundant; confirm against upstream Solr before changing.
        if (!hasOffsetStart) {
            tokenStart = tokenEnd + 1;
        }
        // capture state and add to result
        State state = parent.captureState();
        res.states.add(state.clone());
        // reset for reuse
        parent.clearAttributes();
    }
    return res;
}
From source file: org.apache.solr.schema.SimplePreAnalyzedParser.java
License: Apache License
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) { a.clearAttributes();/* ww w. j ava2 s .c o m*/ CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class); char[] tokChars = state.token.toString().toCharArray(); termAtt.copyBuffer(tokChars, 0, tokChars.length); int tokenStart = tokenEnd - state.token.length(); for (Entry<String, String> e : state.attr.entrySet()) { String k = e.getKey(); if (k.equals("i")) { // position increment int incr = Integer.parseInt(e.getValue()); PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class); posIncr.setPositionIncrement(incr); } else if (k.equals("s")) { tokenStart = Integer.parseInt(e.getValue()); } else if (k.equals("e")) { tokenEnd = Integer.parseInt(e.getValue()); } else if (k.equals("y")) { TypeAttribute type = a.addAttribute(TypeAttribute.class); type.setType(e.getValue()); } else if (k.equals("f")) { FlagsAttribute flags = a.addAttribute(FlagsAttribute.class); int f = Integer.parseInt(e.getValue(), 16); flags.setFlags(f); } else if (k.equals("p")) { PayloadAttribute p = a.addAttribute(PayloadAttribute.class); byte[] data = hexToBytes(e.getValue()); if (data != null && data.length > 0) { p.setPayload(new BytesRef(data)); } } else { // unknown attribute } } // handle offset attr OffsetAttribute offset = a.addAttribute(OffsetAttribute.class); offset.setOffset(tokenStart, tokenEnd); State resState = a.captureState(); a.clearAttributes(); return resState; }
From source file: org.gridkit.coherence.search.lucene.CapturedTokenStream.java
License: Apache License
/**
 * Consumes the given token stream and appends captured copies of its tokens.
 *
 * @param ts          stream to drain; must expose a PositionIncrementAttribute
 * @param positionGap extra increment applied to the first consumed token only
 *                    (0 means keep the stream's own increment)
 * @param offsetShift amount added to both start and end offsets of every
 *                    token (0 disables offset rewriting)
 * @throws IOException if the underlying stream fails
 */
public void append(TokenStream ts, int positionGap, int offsetShift) throws IOException {
    final PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = (offsetShift != 0) ? ts.getAttribute(OffsetAttribute.class) : null;
    ts.reset();
    while (ts.incrementToken()) {
        if (positionGap != 0) {
            // apply the gap once, to the first token only
            posIncAtt.setPositionIncrement(positionGap);
            positionGap = 0;
        }
        if (offsetAtt != null) {
            offsetAtt.setOffset(offsetShift + offsetAtt.startOffset(), offsetShift + offsetAtt.endOffset());
        }
        tokens.add(ts.captureState());
        lastPos += posIncAtt.getPositionIncrement();
    }
}
From source file: uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java
License: Mozilla Public License
/**
 * Exhaustively checks a TokenStream against expected terms, offsets, types,
 * position increments/lengths, and keyword flags.
 *
 * Any expectation array may be null to skip that check. Beyond the explicit
 * expectations it also enforces TokenStream-contract invariants: attributes
 * are cleared on each increment, offsets never go backwards, tokens at the
 * same graph position share start/end offsets, the first position increment
 * is >= 1 and later ones >= 0, and end()/close() behave correctly.
 *
 * @param ts                stream under test; reset, fully consumed, ended and closed here
 * @param output            expected term texts, one per token (required)
 * @param startOffsets      expected start offsets, or null to skip
 * @param endOffsets        expected end offsets, or null to skip
 * @param types             expected token types, or null to skip
 * @param posIncrements     expected position increments, or null to skip
 * @param posLengths        expected position lengths, or null to skip
 * @param finalOffset       expected end offset after end(), or null to skip
 * @param finalPosInc       expected position increment after end(), or null to skip
 * @param keywordAtts       expected keyword flags, or null to skip
 * @param offsetsAreCorrect when true, also validates offset monotonicity and
 *                          graph-position offset consistency
 * @throws IOException if the stream fails
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
    // Attributes are looked up only when the corresponding expectation is given.
    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }
    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }
    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }
    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }
    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }
    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }
    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();
    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce, that the state is not preserved and also
        // assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);
        // reset it, because we called clearAttribute() before
        checkClearAtt.getAndResetClearCalled();
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());
        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }
        // we can enforce some basic things about a few attributes even if
        // the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset="
                        + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }
            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset="
                        + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }
            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;
                final int posLength = posLengthAtt.getPositionLength();
                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this
                    // position:
                    posToStartOffset.put(pos, startOffset);
                } else {
                    // We've seen a token leaving from this position
                    // before; verify the startOffset is the same:
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }
                final int endPos = pos + posLength;
                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this
                    // position:
                    posToEndOffset.put(endPos, endOffset);
                } else {
                    // We've seen a token arriving to this position
                    // before; verify the endOffset is the same:
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }
    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token="
                + termAtt.toString());
    }
    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);
    // reset it, because we called clearAttribute() before
    checkClearAtt.getAndResetClearCalled();
    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());
    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }
    ts.close();
}