Example usage for org.apache.lucene.analysis.tokenattributes PayloadAttribute getPayload

List of usage examples for org.apache.lucene.analysis.tokenattributes PayloadAttribute getPayload

Introduction

On this page you can find an example usage for org.apache.lucene.analysis.tokenattributes PayloadAttribute getPayload.

Prototype

public BytesRef getPayload();

Source Link

Document

Returns this Token's payload.

Usage

From source file:at.ac.univie.mminf.luceneSKOS.util.AnalyzerUtils.java

License: Apache License

/**
 * Tokenizes {@code text} with the given analyzer and prints one line per position,
 * listing each token as [term:startOffset->endOffset:type(:payload)].
 *
 * @param analyzer the analyzer that produces the token stream
 * @param text     the text to tokenize
 * @throws IOException if the underlying token stream fails
 */
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));

    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    // TokenStream contract: reset() must be called before the first incrementToken().
    stream.reset();

    int position = 0;
    while (stream.incrementToken()) {

        int increment = posIncr.getPositionIncrement();
        // A positive increment starts a new position; zero-increment tokens
        // (e.g. synonyms) are printed on the same line as the previous token.
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ":");
        }

        // NOTE(review): this uses the pre-Lucene-4.0 Payload class; newer APIs
        // return a BytesRef here — confirm which Lucene version this targets.
        Payload pl = payload.getPayload();

        if (pl != null) {
            // NOTE(review): new String(byte[]) uses the platform default charset;
            // consider specifying a charset if payloads are encoded text.
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + ":" + new String(pl.getData()) + "] ");

        } else {
            System.out.print("[" + term.toString() + ":" + offset.startOffset() + "->" + offset.endOffset()
                    + ":" + type.type() + "] ");

        }

    }
    System.out.println();
    // Complete the stream lifecycle and release resources.
    stream.end();
    stream.close();
}

From source file:com.github.le11.nls.lucene.UIMAPayloadsAnalyzerTest.java

License:Apache License

@Test
public void baseUIMAPayloadsAnalyzerStreamTest() {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAttribute = ts.addAttribute(PayloadAttribute.class);
        // TokenStream contract: reset() must be called before the first incrementToken().
        ts.reset();
        while (ts.incrementToken()) {
            assertNotNull(termAtt);
            assertNotNull(payloadAttribute);
            // Assert the payload itself is present so a missing payload fails the
            // test cleanly instead of surfacing as a NullPointerException below.
            assertNotNull(payloadAttribute.getPayload());
            System.out.println("token '" + termAtt.toString() + "' has payload "
                    + new String(payloadAttribute.getPayload().getData()));
        }
        // Complete the stream lifecycle and release resources.
        ts.end();
        ts.close();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getLocalizedMessage());
    }
}

From source file:com.shaie.annots.AnnotatingTokenStreamExample.java

License:Apache License

/**
 * Demonstrates splitting a token stream with a TeeSinkTokenFilter: the primary
 * stream prints plain word tokens, while a sink filtered by ColorsSinkFilter
 * replays only the color tokens, whose payloads encode (start, length) as vints.
 */
public static void main(String[] args) throws Exception {
    final String text = "quick brown fox ate the blue red chicken";
    final Tokenizer wordTokenizer = new WhitespaceTokenizer();
    wordTokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter tee = new TeeSinkTokenFilter(wordTokenizer);
    final TokenStream colorStream = new AnnotatingTokenFilter(tee.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);

    System.out.println("Text tokens:\n");

    // Drain the primary stream first; as a side effect this feeds the sink
    // (colorStream) with every token the ColorsSinkFilter matched.
    tee.reset();
    final CharTermAttribute wordTerm = tee.getAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute wordPosIncr = tee.getAttribute(PositionIncrementAttribute.class);
    int wordPos = -1;
    while (tee.incrementToken()) {
        wordPos += wordPosIncr.getPositionIncrement();
        System.out.println("term=" + wordTerm + ", pos=" + wordPos);
    }
    tee.end();
    wordTokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // Now replay the captured color annotations from the sink stream.
    final CharTermAttribute colorTerm = colorStream.getAttribute(CharTermAttribute.class);
    final PayloadAttribute colorPayload = colorStream.getAttribute(PayloadAttribute.class);
    final ByteArrayDataInput payloadReader = new ByteArrayDataInput();
    colorStream.reset();
    while (colorStream.incrementToken()) {
        final BytesRef data = colorPayload.getPayload();
        // Respect the BytesRef slice when decoding the two vints.
        payloadReader.reset(data.bytes, data.offset, data.length);
        System.out.println("term=" + colorTerm + ", start=" + payloadReader.readVInt() + ", length="
                + payloadReader.readVInt());
    }
    colorStream.end();
    colorStream.close();

    tee.close();
    wordTokenizer.close();
}

From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License:Apache License

/**
 * Asserts that {@code ts} emits exactly the given tokens, in order: for each
 * expected TokenInfo the term, absolute position, and (when {@code info.len != -1})
 * the vint-encoded length stored in the token payload must match.
 *
 * @param ts    the stream under test (will be reset and fully consumed)
 * @param infos the expected tokens, in stream order
 * @throws IOException if the stream fails
 */
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            assertThat(payload).isNotNull();
            // A payload is only valid within [offset, offset+length) of its
            // backing array; reading from index 0 is wrong when the producer
            // reuses a larger buffer.
            in.reset(payload.bytes, payload.offset, payload.length);
            len = in.readVInt();
        } else {
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
    // Complete the stream lifecycle now that it is exhausted.
    ts.end();
}

From source file:com.tuplejump.stargate.lucene.json.dewey.DeweyTokenizer.java

License:Apache License

/**
 * Demo driver: runs a sample JSON document through the Dewey tokenizer chain
 * and prints each emitted term, its type, and (when present) the decoded
 * Dewey level from the token payload.
 */
public static void main(String[] args) throws IOException {
    String json = "{\n" + "    \"id\": 0,\n" + "    \"guid\": \"8416dc9e-6904-4787-93eb-b8038f543a04\",\n"
            + "    \"isActive\": false,\n" + "    \"balance\": \"$2,858.04\",\n"
            + "    \"picture\": \"http://placehold.it/32x32\",\n" + "    \"age\": 25,\n"
            + "    \"eyeColor\": \"brown\",\n" + "    \"name\": \"Audra Lynn\",\n"
            + "    \"gender\": \"female\",\n" + "    \"company\": \"PROSELY\",\n"
            + "    \"email\": \"audralynn@prosely.com\",\n" + "    \"phone\": \"+1 (911) 417-3322\",\n"
            + "    \"address\": \"602 Blake Avenue, Madaket, Maine, 2123\",\n"
            + "    \"about\": \"Enim dolor aliquip est voluptate sit nostrud ut. Dolore sint excepteur nulla et consequat velit cillum veniam quis ex. Consectetur reprehenderit magna minim excepteur magna laboris est sunt.\\r\\n\",\n"
            + "    \"registered\": \"2014-02-05T16:55:14 -06:-30\",\n" + "    \"latitude\": 55.275051,\n"
            + "    \"longitude\": 139.3922,\n" + "    \"tags\": [\n" + "      \"sit\",\n"
            + "      \"laboris\",\n" + "      \"do\",\n" + "      \"ad\",\n" + "      \"et\",\n"
            + "      \"reprehenderit\",\n" + "      \"aliqua\"\n" + "    ],\n" + "    \"friends\": [\n"
            + "      {\n" + "        \"id\": 0,\n" + "        \"name\": \"Fry Richmond\"\n" + "      },\n"
            + "      {\n" + "        \"id\": 1,\n" + "        \"name\": \"Olson Knight\"\n" + "      },\n"
            + "      {\n" + "        \"id\": 2,\n" + "        \"name\": \"Jimenez Dominguez\"\n" + "      }\n"
            + "    ],\n" + "    \"greeting\": \"Hello, Audra Lynn! You have 9 unread messages.\",\n"
            + "    \"favoriteFruit\": \"banana\"\n" + "  }";

    // Tokenizer chain: raw JSON -> type-tagging filter -> field tokenizer.
    DeweyTokenizer tokenizer = new DeweyTokenizer(new StringReader(json));
    JsonTypeFilter payloadFilter = new JsonTypeFilter(Version.LUCENE_48, tokenizer);
    DeweyFieldTokenizer fieldTokenizer = new DeweyFieldTokenizer(Version.LUCENE_48, payloadFilter);

    CharTermAttribute charTermAtt = fieldTokenizer.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = fieldTokenizer.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = fieldTokenizer.getAttribute(PayloadAttribute.class);

    fieldTokenizer.reset();
    while (fieldTokenizer.incrementToken()) {
        System.out.print(charTermAtt.toString() + "\t");
        System.out.print(typeAtt.type() + "\t");
        BytesRef payload = payloadAtt.getPayload();
        if (payload != null)
            // NOTE(review): decodeLevel reads the raw backing array and ignores
            // payload.offset — confirm DeweyTokenizer always emits offset 0.
            System.out.print(DeweyTokenizer.decodeLevel(payload.bytes) + "\n");
    }
    // Complete the stream lifecycle and release resources.
    fieldTokenizer.end();
    fieldTokenizer.close();

}

From source file:dependencies.ReviewDependencyAnalyzer.java

License:Open Source License

/**
 * Analyzes the reader's text and groups the resulting tokens into sentences.
 * Sentences are split on delimiter tokens; a trailing sentence without a
 * closing delimiter is kept.
 *
 * @param reader source of the text to analyze
 * @return a list of sentences, each a list of tokens, or {@code null} if
 *         reading from the analyzer stream fails
 */
public ArrayList<ArrayList<Token>> getSentences(Reader reader) {

    try {
        // Run the reader's contents through this analyzer's token stream.
        TokenStream stream = reusableTokenStream("", reader);
        TermAttribute termAttr = stream.getAttribute(TermAttribute.class);
        TypeAttribute typeAttr = stream.getAttribute(TypeAttribute.class);
        FlagsAttribute flagsAttr = stream.getAttribute(FlagsAttribute.class);
        PayloadAttribute payloadAttr = stream.getAttribute(PayloadAttribute.class);

        // Accumulate tokens into the current sentence; completed sentences go
        // into the result list.
        ArrayList<ArrayList<Token>> sentences = new ArrayList<ArrayList<Token>>();
        ArrayList<Token> sentence = new ArrayList<Token>();

        while (stream.incrementToken()) {
            Token token = new Token(termAttr.term(), typeAttr.type(), flagsAttr.getFlags(),
                    new ReviewTermPayload(payloadAttr.getPayload()));
            sentence.add(token);

            // A delimiter token closes the current sentence. Sentences that
            // contain only the delimiter itself are discarded.
            if (token.isDelim(true)) {
                if (sentence.size() > 1) {
                    sentences.add(sentence);
                }
                sentence = new ArrayList<Token>();
            }
        }

        // Keep an unterminated trailing sentence, e.g. when the passage does
        // not end with a period or other delimiter.
        if (!sentence.isEmpty()) {
            sentences.add(sentence);
        }

        return sentences;
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE,
                "Error reading data from reader. Analyzing text for typed dependencies could not be completed");
        return null;
    }
}

From source file:edu.upenn.library.solrplugins.tokentype.TokenTypeJoinFilterTest.java

License:Apache License

/**
 * Splits even- and odd-typed tokens into fork/orig pairs, joins the even pair
 * back into a single "joined" token, and verifies the resulting three-token
 * cycle (joined, odd_orig, odd_fork) across the whole stream.
 */
public void test() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";

    TokenTypeSplitFilter evenSplit = new TokenTypeSplitFilter(new Blah(whitespaceMockTokenizer(test)),
            Collections.singleton("even"), Collections.EMPTY_SET, "even_fork", "even_orig");
    TokenTypeSplitFilter oddSplit = new TokenTypeSplitFilter(evenSplit, Collections.singleton("odd"),
            Collections.EMPTY_SET, "odd_fork", "odd_orig");
    TokenTypeJoinFilter joined = new TokenTypeJoinFilter(oddSplit, new String[] { "even_orig", "even_fork" },
            "joined", null, "!", false, true);
    int count = 0;
    TypeAttribute typeAtt = joined.getAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = joined.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = joined.getAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = joined.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = joined.getAttribute(PayloadAttribute.class);
    String prevTerm = null;
    int prevStart = -1;
    int prevEnd = -1;
    joined.reset();
    while (joined.incrementToken()) {
        String term = termAtt.toString();
        String type = typeAtt.type();
        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();
        int posIncr = posIncrAtt.getPositionIncrement();
        BytesRef payload = payloadAtt.getPayload();
        int slot = count % 3;
        if (slot == 0) {
            // Joined token: "orig!fork" where both halves carry the same text.
            assertEquals("joined", type);
            assertEquals(1, posIncr);
            assertEquals(prevEnd + 1, startOffset);
            String[] halves = term.split("!");
            assertEquals(halves[0], halves[1]);
            assertNull(payload);
        } else if (slot == 1) {
            // Odd original advances to the next position.
            assertEquals("odd_orig", type);
            assertEquals(1, posIncr);
            assertEquals(prevEnd + 1, startOffset);
            assertNull(payload);
        } else {
            // Odd fork is stacked at the same position/offsets as its original.
            assertEquals("odd_fork", type);
            assertEquals(prevTerm, term);
            assertEquals(0, posIncr);
            assertEquals(prevStart, startOffset);
            assertEquals(prevEnd, endOffset);
            assertNull(payload);
        }
        prevTerm = term;
        prevStart = startOffset;
        prevEnd = endOffset;
        count++;
    }
    assertTrue(count + " does not equal: " + 15, count == 15);

}

From source file:edu.upenn.library.solrplugins.tokentype.TokenTypeJoinFilterTest.java

License:Apache License

/**
 * Same split/join pipeline as {@code test()}, but with component-type output
 * enabled on the join filter, so each cycle emits five tokens:
 * even_orig, even_fork, joined, odd_orig, odd_fork.
 */
public void testOutputComponentTypes() throws IOException {
    String test = "The quick red fox jumped over the lazy brown dogs";

    TokenTypeSplitFilter evenSplit = new TokenTypeSplitFilter(new Blah(whitespaceMockTokenizer(test)),
            Collections.singleton("even"), Collections.EMPTY_SET, "even_fork", "even_orig");
    TokenTypeSplitFilter oddSplit = new TokenTypeSplitFilter(evenSplit, Collections.singleton("odd"),
            Collections.EMPTY_SET, "odd_fork", "odd_orig");
    TokenTypeJoinFilter joined = new TokenTypeJoinFilter(oddSplit, new String[] { "even_orig", "even_fork" },
            "joined", null, "!", true, true);
    int count = 0;
    TypeAttribute typeAtt = joined.getAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = joined.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt = joined.getAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = joined.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = joined.getAttribute(PayloadAttribute.class);
    String prevTerm = null;
    int prevStart = -1;
    int prevEnd = -1;
    joined.reset();
    while (joined.incrementToken()) {
        String term = termAtt.toString();
        String type = typeAtt.type();
        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();
        int posIncr = posIncrAtt.getPositionIncrement();
        BytesRef payload = payloadAtt.getPayload();
        int slot = count % 5;
        if (slot == 0) {
            // Even original advances to the next position.
            assertEquals("even_orig", type);
            assertEquals(1, posIncr);
            assertEquals(prevEnd + 1, startOffset);
            assertNull(payload);
        } else if (slot == 1) {
            // Even fork is stacked on its original (same term, position, offsets).
            assertEquals("even_fork", type);
            assertEquals(prevTerm, term);
            assertEquals(0, posIncr);
            assertEquals(prevStart, startOffset);
            assertEquals(prevEnd, endOffset);
            assertNull(payload);
        } else if (slot == 2) {
            // Joined token stacked at the same position; both halves identical.
            assertEquals("joined", type);
            assertEquals(0, posIncr);
            assertEquals(prevStart, startOffset);
            String[] halves = term.split("!");
            assertEquals(halves[0], halves[1]);
            assertNull(payload);
        } else if (slot == 3) {
            // Odd original advances to the next position.
            assertEquals("odd_orig", type);
            assertEquals(1, posIncr);
            assertEquals(prevEnd + 1, startOffset);
            assertNull(payload);
        } else {
            // Odd fork stacked on its original.
            assertEquals("odd_fork", type);
            assertEquals(prevTerm, term);
            assertEquals(0, posIncr);
            assertEquals(prevStart, startOffset);
            assertEquals(prevEnd, endOffset);
            assertNull(payload);
        }
        prevTerm = term;
        prevStart = startOffset;
        prevEnd = endOffset;
        count++;
    }
    assertTrue(count + " does not equal: " + 25, count == 25);

}

From source file:edu.upenn.library.solrplugins.tokentype.TokenTypeJoinFilterTest.java

License:Apache License

/**
 * Verifies that the join filter concatenates however many component tokens
 * are present at each position: one ("raw"), two ("raw!UPPER"), or three
 * ("raw!lower!UPPER"), separated by '!'.
 */
public void testVariableTokenPresence() throws IOException {
    String test = "The Quick Red Fox Jumped Over The Lazy Brown Dogs";
    TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(new Blah2(whitespaceMockTokenizer(test)),
            new String[] { "raw", "lower", "upper" }, "joined", null, "!", false, false);
    CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class);
    ttjf.reset();
    int tokenNo = -1;
    String[] words = test.split(" ");
    StringBuilder expected = new StringBuilder();
    while (ttjf.incrementToken()) {
        tokenNo++;
        String term = termAtt.toString();
        BytesRef payload = payloadAtt.getPayload();
        if (tokenNo == 0) {
            // Single component: the raw word passes through unchanged.
            assertEquals(words[tokenNo], term);
            assertNull(payload);
        } else if (tokenNo == 1) {
            // Two components joined: raw!UPPER.
            expected.setLength(0);
            expected.append(words[tokenNo]).append('!').append(words[tokenNo].toUpperCase());
            assertEquals(expected.toString(), term);
            assertNull(payload);
        } else if (tokenNo == 2) {
            // Three components joined: raw!lower!UPPER.
            expected.setLength(0);
            expected.append(words[tokenNo]).append('!').append(words[tokenNo].toLowerCase()).append('!')
                    .append(words[tokenNo].toUpperCase());
            assertEquals(expected.toString(), term);
            assertNull(payload);
        }
        // NOTE(review): tokens past index 2 are consumed without assertions,
        // mirroring the original test — confirm whether they should be checked.
    }
}

From source file:edu.upenn.library.solrplugins.tokentype.TokenTypeJoinFilterTest.java

License:Apache License

/** verify that payload gets picked up for 1st group of tokens */
public void testTypeForPayload1() throws IOException {
    TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(new TokenArrayTokenizer(tokensWithPayloads),
            new String[] { "normalized", "filing", "prefix" }, "joined", "normalized", "!", false, false);
    CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ttjf.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class);
    ttjf.reset();/*from  w ww .j a  v  a  2 s  .c  o  m*/

    assertTrue(ttjf.incrementToken());

    assertEquals("unconsoled!Unconsoled!The ", termAtt.toString());
    assertEquals("joined", typeAtt.type());
    assertEquals("payload1", payloadAtt.getPayload().utf8ToString());

    assertTrue(ttjf.incrementToken());

    assertEquals("room with a view!Room With A View!A ", termAtt.toString());
    assertEquals("joined", typeAtt.type());
    assertNull(payloadAtt.getPayload());

    assertFalse(ttjf.incrementToken());
}