Usage examples for org.apache.lucene.analysis.tokenattributes.PayloadAttribute#getPayload()
public BytesRef getPayload();
From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class); TypeAttribute type = stream.addAttribute(TypeAttribute.class); PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class); int position = 0; while (stream.incrementToken()) { int increment = posIncr.getPositionIncrement(); if (increment > 0) { position = position + increment; System.out.println(); System.out.print(position + ":"); }// w w w . ja va 2s . c om Payload pl = payload.getPayload(); if (pl != null) { System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + ":" + new String(pl.getData()) + "] "); } else { System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] "); } } System.out.println(); }
From source file:com.github.le11.nls.lucene.UIMAPayloadsAnalyzerTest.java
License:Apache License
@Test public void baseUIMAPayloadsAnalyzerStreamTest() { try {// w w w.j a va 2s . co m TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood")); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PayloadAttribute payloadAttribute = ts.addAttribute(PayloadAttribute.class); while (ts.incrementToken()) { assertNotNull(termAtt); assertNotNull(payloadAttribute); System.out.println("token '" + termAtt.toString() + "' has payload " + new String(payloadAttribute.getPayload().getData())); } } catch (Exception e) { e.printStackTrace(); fail(e.getLocalizedMessage()); } }
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
public static void main(String[] args) throws Exception { String text = "quick brown fox ate the blue red chicken"; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(text)); TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer); TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);/*from w ww.j av a 2s. com*/ System.out.println("Text tokens:\n"); // consume all the tokens from the original stream. this also populates the // Sink (colors) with its color-matching tokens teeSink.reset(); CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class); PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class); int termsPos = -1; while (teeSink.incrementToken()) { termsPos += termPosAtt.getPositionIncrement(); System.out.println("term=" + termAtt + ", pos=" + termsPos); } teeSink.end(); tokenizer.end(); System.out.println("\nAnnotation tokens:\n"); // now consume the color annotation tokens from the colors stream CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class); PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class); ByteArrayDataInput in = new ByteArrayDataInput(); colors.reset(); while (colors.incrementToken()) { BytesRef bytes = payloadAtt.getPayload(); in.reset(bytes.bytes, bytes.offset, bytes.length); System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt()); } colors.end(); colors.close(); teeSink.close(); tokenizer.close(); }
From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java
License:Apache License
/**
 * Drains {@code ts} and asserts that its tokens match the expected {@code infos}:
 * term text, absolute position (accumulated from position increments), and — when
 * {@code info.len != -1} — the annotation length encoded as a VInt payload.
 * Also asserts the stream is exhausted after the last expected token.
 *
 * @param ts    the token stream under test; reset and fully consumed here
 * @param infos the expected tokens, in stream order
 * @throws IOException if the stream fails while being consumed
 */
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            assertThat(payload).isNotNull();
            // Honor the BytesRef slice: payload bytes live at [offset, offset+length)
            // of the backing array, so reading from index 0 is wrong when offset != 0.
            in.reset(payload.bytes, payload.offset, payload.length);
            len = in.readVInt();
        } else {
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
}
From source file:com.tuplejump.stargate.lucene.json.dewey.DeweyTokenizer.java
License:Apache License
public static void main(String[] args) throws IOException { String json = "{\n" + " \"id\": 0,\n" + " \"guid\": \"8416dc9e-6904-4787-93eb-b8038f543a04\",\n" + " \"isActive\": false,\n" + " \"balance\": \"$2,858.04\",\n" + " \"picture\": \"http://placehold.it/32x32\",\n" + " \"age\": 25,\n" + " \"eyeColor\": \"brown\",\n" + " \"name\": \"Audra Lynn\",\n" + " \"gender\": \"female\",\n" + " \"company\": \"PROSELY\",\n" + " \"email\": \"audralynn@prosely.com\",\n" + " \"phone\": \"+1 (911) 417-3322\",\n" + " \"address\": \"602 Blake Avenue, Madaket, Maine, 2123\",\n" + " \"about\": \"Enim dolor aliquip est voluptate sit nostrud ut. Dolore sint excepteur nulla et consequat velit cillum veniam quis ex. Consectetur reprehenderit magna minim excepteur magna laboris est sunt.\\r\\n\",\n" + " \"registered\": \"2014-02-05T16:55:14 -06:-30\",\n" + " \"latitude\": 55.275051,\n" + " \"longitude\": 139.3922,\n" + " \"tags\": [\n" + " \"sit\",\n" + " \"laboris\",\n" + " \"do\",\n" + " \"ad\",\n" + " \"et\",\n" + " \"reprehenderit\",\n" + " \"aliqua\"\n" + " ],\n" + " \"friends\": [\n" + " {\n" + " \"id\": 0,\n" + " \"name\": \"Fry Richmond\"\n" + " },\n" + " {\n" + " \"id\": 1,\n" + " \"name\": \"Olson Knight\"\n" + " },\n" + " {\n" + " \"id\": 2,\n" + " \"name\": \"Jimenez Dominguez\"\n" + " }\n" + " ],\n" + " \"greeting\": \"Hello, Audra Lynn! You have 9 unread messages.\",\n" + " \"favoriteFruit\": \"banana\"\n" + " }"; DeweyTokenizer tokenizer = new DeweyTokenizer(new StringReader(json)); JsonTypeFilter payloadFilter = new JsonTypeFilter(Version.LUCENE_48, tokenizer); DeweyFieldTokenizer fieldTokenizer = new DeweyFieldTokenizer(Version.LUCENE_48, payloadFilter); CharTermAttribute charTermAtt = fieldTokenizer.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = fieldTokenizer.getAttribute(TypeAttribute.class); PayloadAttribute payloadAtt = fieldTokenizer.getAttribute(PayloadAttribute.class); fieldTokenizer.reset();//from www . 
j a va 2 s.co m while (fieldTokenizer.incrementToken()) { System.out.print(charTermAtt.toString() + "\t"); System.out.print(typeAtt.type() + "\t"); BytesRef payload = payloadAtt.getPayload(); if (payload != null) System.out.print(DeweyTokenizer.decodeLevel(payload.bytes) + "\n"); } }
From source file:dependencies.ReviewDependencyAnalyzer.java
License:Open Source License
public ArrayList<ArrayList<Token>> getSentences(Reader reader) { try {//from ww w .java 2 s. c o m // Send reader data through the analyzer TokenStream tokstr = reusableTokenStream("", reader); TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class); TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class); FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class); PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class); // Split the tokenstream returned by the analyzer into sentences. Convert each sentence // into a linked list of tokens ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>(); ArrayList<Token> current_sentence = new ArrayList<Token>(); while (tokstr.incrementToken()) { Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(), new ReviewTermPayload(tok_payload.getPayload())); current_sentence.add(current_token); // End of sentence reached. Add current sentence to the sentence list if (current_token.isDelim(true)) { if (current_sentence.size() > 1) { sentence_list.add(current_sentence); } current_sentence = new ArrayList<Token>(); } } // At the end of the token stream, if there is an incomplete sentence, add it to the // sentence list. // This case could occur when the last sentence of a given passage does not end with a // period or other sentence delimiter. if (!current_sentence.isEmpty()) { sentence_list.add(current_sentence); } return sentence_list; } catch (IOException e) { AppLogger.error.log(Level.SEVERE, "Error reading data from reader. Analyzing text for typed dependencies could not be completed"); return null; } }
From source file:edu.upenn.library.solrplugins.tokentype.TokenTypeJoinFilterTest.java
License:Apache License
/**
 * Exercises a split-then-join pipeline: "even" tokens are forked and re-joined
 * into a single "joined" token (components separated by '!'), while "odd" tokens
 * pass through as an orig/fork pair. The output therefore cycles through three
 * token kinds: joined, odd_orig, odd_fork. 10 input words yield 15 tokens.
 */
public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";
    TokenTypeSplitFilter ttsf = new TokenTypeSplitFilter(new Blah(whitespaceMockTokenizer(test)),
            Collections.singleton("even"), Collections.EMPTY_SET, "even_fork", "even_orig");
    TokenTypeSplitFilter ttsfOdd = new TokenTypeSplitFilter(ttsf, Collections.singleton("odd"),
            Collections.EMPTY_SET, "odd_fork", "odd_orig");
    TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(ttsfOdd, new String[] { "even_orig", "even_fork" },
            "joined", null, "!", false, true);
    int count = 0;
    TypeAttribute typeAtt = ttjf.getAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ttjf.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = ttjf.getAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class);
    String lastTerm = null;
    int lastStartOffset = -1;
    int lastEndOffset = -1;
    ttjf.reset();
    while (ttjf.incrementToken()) {
        String term = termAtt.toString();
        String type = typeAtt.type();
        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();
        int posIncr = posIncrAtt.getPositionIncrement();
        BytesRef payload = payloadAtt.getPayload();
        switch (count % 3) {
        case 0:
            // Joined even token: advances position, term is "orig!fork" with equal halves.
            assertEquals("joined", type);
            assertEquals(1, posIncr);
            assertEquals(lastEndOffset + 1, startOffset);
            String[] split = term.split("!");
            assertEquals(split[0], split[1]);
            assertNull(payload);
            break;
        case 1:
            // Odd original token: advances position past the previous word.
            assertEquals("odd_orig", type);
            assertEquals(1, posIncr);
            assertEquals(lastEndOffset + 1, startOffset);
            assertNull(payload);
            break;
        case 2:
            // Odd fork: same term, position, and offsets as its original.
            assertEquals("odd_fork", type);
            assertEquals(lastTerm, term);
            assertEquals(0, posIncr);
            assertEquals(lastStartOffset, startOffset);
            assertEquals(lastEndOffset, endOffset);
            assertNull(payload);
            break;
        }
        lastTerm = term;
        lastStartOffset = startOffset;
        lastEndOffset = endOffset;
        count++;
    }
    // 10 words -> 5 joined + 5 * (odd_orig + odd_fork) = 15 tokens.
    assertEquals(15, count);
}
From source file:edu.upenn.library.solrplugins.tokentype.TokenTypeJoinFilterTest.java
License:Apache License
public void testOutputComponentTypes() throws IOException { String test = "The quick red fox jumped over the lazy brown dogs"; TokenTypeSplitFilter ttsf = new TokenTypeSplitFilter(new Blah(whitespaceMockTokenizer(test)), Collections.singleton("even"), Collections.EMPTY_SET, "even_fork", "even_orig"); TokenTypeSplitFilter ttsfOdd = new TokenTypeSplitFilter(ttsf, Collections.singleton("odd"), Collections.EMPTY_SET, "odd_fork", "odd_orig"); TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(ttsfOdd, new String[] { "even_orig", "even_fork" }, "joined", null, "!", true, true); int count = 0; TypeAttribute typeAtt = ttjf.getAttribute(TypeAttribute.class); OffsetAttribute offsetAtt = ttjf.getAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAtt = ttjf.getAttribute(PositionIncrementAttribute.class); CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class); PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class); String lastTerm = null;// w ww . j a v a 2s .c om int lastStartOffset = -1; int lastEndOffset = -1; ttjf.reset(); while (ttjf.incrementToken()) { String term = termAtt.toString(); String type = typeAtt.type(); int startOffset = offsetAtt.startOffset(); int endOffset = offsetAtt.endOffset(); int posIncr = posIncrAtt.getPositionIncrement(); BytesRef payload = payloadAtt.getPayload(); switch (count % 5) { case 0: assertEquals("even_orig", type); assertEquals(1, posIncr); assertEquals(lastEndOffset + 1, startOffset); assertNull(payload); break; case 1: assertEquals("even_fork", type); assertEquals(lastTerm, term); assertEquals(0, posIncr); assertEquals(lastStartOffset, startOffset); assertEquals(lastEndOffset, endOffset); assertNull(payload); break; case 2: assertEquals("joined", type); assertEquals(0, posIncr); assertEquals(lastStartOffset, startOffset); String[] split = term.split("!"); assertEquals(split[0], split[1]); assertNull(payload); break; case 3: assertEquals("odd_orig", type); assertEquals(1, posIncr); 
assertEquals(lastEndOffset + 1, startOffset); assertNull(payload); break; case 4: assertEquals("odd_fork", type); assertEquals(lastTerm, term); assertEquals(0, posIncr); assertEquals(lastStartOffset, startOffset); assertEquals(lastEndOffset, endOffset); assertNull(payload); break; } lastTerm = term; lastStartOffset = startOffset; lastEndOffset = endOffset; count++; } assertTrue(count + " does not equal: " + 25, count == 25); }
From source file:edu.upenn.library.solrplugins.tokentype.TokenTypeJoinFilterTest.java
License:Apache License
public void testVariableTokenPresence() throws IOException { String test = "The Quick Red Fox Jumped Over The Lazy Brown Dogs"; TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(new Blah2(whitespaceMockTokenizer(test)), new String[] { "raw", "lower", "upper" }, "joined", null, "!", false, false); CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class); PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class); ttjf.reset();// w w w.j av a2s .c o m int i = -1; String[] split = test.split(" "); StringBuilder sb = new StringBuilder(); while (ttjf.incrementToken()) { String term = termAtt.toString(); BytesRef payload = payloadAtt.getPayload(); switch (++i) { case 0: assertEquals(split[i], term); assertNull(payload); break; case 1: sb.setLength(0); sb.append(split[i]).append('!').append(split[i].toUpperCase()); assertEquals(sb.toString(), term); assertNull(payload); break; case 2: sb.setLength(0); sb.append(split[i]).append('!').append(split[i].toLowerCase()).append('!') .append(split[i].toUpperCase()); assertEquals(sb.toString(), term); assertNull(payload); break; } } }
From source file:edu.upenn.library.solrplugins.tokentype.TokenTypeJoinFilterTest.java
License:Apache License
/**
 * Verifies that a payload attached to the first group of component tokens is
 * carried onto the resulting joined token, while a payload-less second group
 * produces a joined token with a null payload.
 */
public void testTypeForPayload1() throws IOException {
    TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(new TokenArrayTokenizer(tokensWithPayloads),
            new String[] { "normalized", "filing", "prefix" }, "joined", "normalized", "!", false, false);
    CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ttjf.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class);
    ttjf.reset();

    // First joined token inherits the payload from its component group.
    assertTrue(ttjf.incrementToken());
    String firstTerm = termAtt.toString();
    assertEquals("unconsoled!Unconsoled!The ", firstTerm);
    assertEquals("joined", typeAtt.type());
    assertEquals("payload1", payloadAtt.getPayload().utf8ToString());

    // Second joined token has no payload on any component.
    assertTrue(ttjf.incrementToken());
    String secondTerm = termAtt.toString();
    assertEquals("room with a view!Room With A View!A ", secondTerm);
    assertEquals("joined", typeAtt.type());
    assertNull(payloadAtt.getPayload());

    // Exactly two joined tokens are produced.
    assertFalse(ttjf.incrementToken());
}