Example usage for org.apache.lucene.analysis Token Token

Introduction

This page collects example usages of the org.apache.lucene.analysis.Token no-argument constructor, gathered from open-source projects.

Prototype

public Token() 

Document

Constructs a Token with null text.
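
A minimal sketch of how this constructor is typically used: create an empty Token, then populate it. It assumes the pre-2.9 Lucene Token API that the examples below rely on (setTermBuffer, the offset setters, setType); the literal values are illustrative only.

    Token token = new Token();                 // term text starts out null
    char[] text = "example".toCharArray();
    token.setTermBuffer(text, 0, text.length); // set the term text
    token.setStartOffset(0);                   // offsets into the original input
    token.setEndOffset(text.length);
    token.setType("word");                     // token type label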

Usage

From source file:analysis.StandardTokenizer.java

License:Apache License

@SuppressWarnings("deprecation")
public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    int posIncr = 1;
    Token result = reusableToken;
    if (tokenList.size() > 0)
        return tokenList.remove();
    while (true) {
        int tokenType = scanner.getNextToken();

        if (tokenType == StandardTokenizerImpl.YYEOF) {
            return null;
        }

        if (scanner.yylength() <= maxTokenLength) {
            reusableToken.clear();
            reusableToken.setPositionIncrement(posIncr);
            scanner.getText(reusableToken);
            final int start = scanner.yychar();
            reusableToken.setStartOffset(start);
            reusableToken.setEndOffset(start + reusableToken.termLength());
            // This 'if' should be removed in the next release. For now, it converts
            // invalid acronyms to HOST. When removed, only the 'else' part should remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
                if (replaceInvalidAcronym) {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    reusableToken.setTermLength(reusableToken.termLength() - 1); // remove extra '.'
                    tokenType = StandardTokenizerImpl.HOST;
                } else {
                    reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                    tokenType = StandardTokenizerImpl.ACRONYM;
                }
            } else {
                reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }

            if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM
                    || tokenType == StandardTokenizerImpl.ALPHANUM) {

                Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term()));
                Token tk = null;
                int st = reusableToken.startOffset();
                final Token token = new Token();
                while ((tk = lt.next(token)) != null) {
                    tk.setStartOffset(tk.startOffset() + st);
                    tk.setEndOffset(tk.endOffset() + st);
                    tk.setType(reusableToken.type());
                    tokenList.add((Token) tk.clone());
                }
            }
            if (tokenList.size() > 0)
                result = tokenList.remove();

            return result;
        } else {
            // When we skip a too-long term, we still increment the position increment.
            posIncr++;
        }
    }
}
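
For context, the deprecated next(Token) API implemented above is normally driven with a single reusable instance, roughly as follows (a sketch only; the tokenizer variable and the printing are assumptions, not part of the original source):

    Token reusable = new Token();
    for (Token tok = tokenizer.next(reusable); tok != null; tok = tokenizer.next(reusable)) {
        // tok may be the reusable instance or a different Token (here, one taken from tokenList)
        System.out.println(tok.term() + " [" + tok.startOffset() + "-" + tok.endOffset() + "]");
    }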

From source file:au.edu.unimelb.csse.analyser.JsonSentenceParserTest.java

License:Apache License

public void testLong() {
    String jsonString = "{\"n\":\"S\", \"i\":\"0_32_0_65\", \"c\":[{\"n\":\"NP\", \"i\":\"0_2_1_64\", \"c\":[{\"n\":\"NP\", \"i\":\"0_1_2_4\", \"c\":[{\"n\":\"NNP\", \"i\":\"0_1_3_1\", \"c\":[{\"n\":\"Arafat\", \"i\":\"0_1_4_0\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"1_2_2_4\", \"c\":[{\"n\":\"PRP\", \"i\":\"1_2_3_3\", \"c\":[{\"n\":\"himself\", \"i\":\"1_2_4_2\", \"c\":[]}]}]}]}, {\"n\":\"VP\", \"i\":\"2_30_1_64\", \"c\":[{\"n\":\"VBD\", \"i\":\"2_3_2_61\", \"c\":[{\"n\":\"said\", \"i\":\"2_3_3_5\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"3_30_2_61\", \"c\":[{\"n\":\"S\", \"i\":\"3_30_3_60\", \"c\":[{\"n\":\"NP\", \"i\":\"3_5_4_59\", \"c\":[{\"n\":\"DT\", \"i\":\"3_4_5_8\", \"c\":[{\"n\":\"the\", \"i\":\"3_4_6_6\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"4_5_5_8\", \"c\":[{\"n\":\"award\", \"i\":\"4_5_6_7\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"5_30_4_59\", \"c\":[{\"n\":\"VBD\", \"i\":\"5_6_5_58\", \"c\":[{\"n\":\"was\", \"i\":\"5_6_6_9\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"6_7_5_58\", \"c\":[{\"n\":\"not\", \"i\":\"6_7_6_10\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"7_30_5_58\", \"c\":[{\"n\":\"VBN\", \"i\":\"7_8_6_57\", \"c\":[{\"n\":\"granted\", \"i\":\"7_8_7_11\", \"c\":[]}]}, {\"n\":\"PP\", \"i\":\"8_30_6_57\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_7_56\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_8_12\", \"c\":[]}]}, {\"n\":\"TO\", \"i\":\"9_10_7_56\", \"c\":[{\"n\":\"to\", \"i\":\"9_10_8_13\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"10_30_7_56\", \"c\":[{\"n\":\"NP\", \"i\":\"10_11_8_55\", \"c\":[{\"n\":\"NN\", \"i\":\"10_11_9_15\", \"c\":[{\"n\":\"crown\", \"i\":\"10_11_10_14\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"11_26_8_55\", \"c\":[{\"n\":\"DT\", \"i\":\"11_12_9_44\", \"c\":[{\"n\":\"an\", \"i\":\"11_12_10_16\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"12_13_9_44\", \"c\":[{\"n\":\"endeavor\", \"i\":\"12_13_10_17\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"13_26_9_44\", \"c\":[{\"n\":\"IN\", \"i\":\"13_14_10_43\", \"c\":[{\"n\":\"that\", \"i\":\"13_14_11_18\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"14_26_10_43\", \"c\":[{\"n\":\"NP\", \"i\":\"14_15_11_42\", \"c\":[{\"n\":\"PRP\", \"i\":\"14_15_12_20\", \"c\":[{\"n\":\"we\", \"i\":\"14_15_13_19\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"15_26_11_42\", \"c\":[{\"n\":\"VP\", \"i\":\"15_17_12_41\", \"c\":[{\"n\":\"VBP\", \"i\":\"15_16_13_24\", \"c\":[{\"n\":\"have\", \"i\":\"15_16_14_21\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"16_17_13_24\", \"c\":[{\"n\":\"VBN\", \"i\":\"16_17_14_23\", \"c\":[{\"n\":\"completed\", \"i\":\"16_17_15_22\", \"c\":[]}]}]}]}, {\"n\":\"CC\", \"i\":\"17_18_12_41\", \"c\":[{\"n\":\"but\", \"i\":\"17_18_13_25\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"18_19_12_41\", \"c\":[{\"n\":\"rather\", \"i\":\"18_19_13_26\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"19_26_12_41\", \"c\":[{\"n\":\"TO\", \"i\":\"19_20_13_40\", \"c\":[{\"n\":\"to\", \"i\":\"19_20_14_27\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"20_26_13_40\", \"c\":[{\"n\":\"VB\", \"i\":\"20_21_14_39\", \"c\":[{\"n\":\"encourage\", \"i\":\"20_21_15_28\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"21_26_14_39\", \"c\":[{\"n\":\"NP\", \"i\":\"21_22_15_38\", \"c\":[{\"n\":\"PRP\", \"i\":\"21_22_16_30\", \"c\":[{\"n\":\"us\", \"i\":\"21_22_17_29\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"22_26_15_38\", \"c\":[{\"n\":\"TO\", \"i\":\"22_23_16_37\", \"c\":[{\"n\":\"to\", \"i\":\"22_23_17_31\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"23_26_16_37\", \"c\":[{\"n\":\"VB\", \"i\":\"23_24_17_36\", \"c\":[{\"n\":\"continue\", \"i\":\"23_24_18_32\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"24_26_17_36\", \"c\":[{\"n\":\"DT\", 
\"i\":\"24_25_18_35\", \"c\":[{\"n\":\"a\", \"i\":\"24_25_19_33\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"25_26_18_35\", \"c\":[{\"n\":\"road\", \"i\":\"25_26_19_34\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\"SBAR\", \"i\":\"26_30_8_55\", \"c\":[{\"n\":\"WHNP\", \"i\":\"26_27_9_54\", \"c\":[{\"n\":\"WDT\", \"i\":\"26_27_10_46\", \"c\":[{\"n\":\"which\", \"i\":\"26_27_11_45\", \"c\":[]}]}]}, {\"n\":\"S\", \"i\":\"27_30_9_54\", \"c\":[{\"n\":\"NP\", \"i\":\"27_28_10_53\", \"c\":[{\"n\":\"PRP\", \"i\":\"27_28_11_48\", \"c\":[{\"n\":\"we\", \"i\":\"27_28_12_47\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"28_30_10_53\", \"c\":[{\"n\":\"VBP\", \"i\":\"28_29_11_52\", \"c\":[{\"n\":\"have\", \"i\":\"28_29_12_49\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"29_30_11_52\", \"c\":[{\"n\":\"VBN\", \"i\":\"29_30_12_51\", \"c\":[{\"n\":\"started\", \"i\":\"29_30_13_50\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\".\", \"i\":\"30_31_1_64\", \"c\":[{\"n\":\".\", \"i\":\"30_31_2_62\", \"c\":[]}]}, {\"n\":\"''\", \"i\":\"31_32_1_64\", \"c\":[{\"n\":\"''\", \"i\":\"31_32_2_63\", \"c\":[]}]}]}";
    JsonSentenceParser parser = new JsonSentenceParser(false);
    parser.parse(jsonString);
    Token token = new Token();
    parser.next(token);
    assertNotNull(token);
    assertEquals("S", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(32, token.getPayload().byteAt(1));
    assertEquals(0, token.getPayload().byteAt(2));
    assertEquals(65, token.getPayload().byteAt(3));

    parser.next(token);
    assertEquals("NP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(2, token.getPayload().byteAt(1));
    assertEquals(1, token.getPayload().byteAt(2));
    assertEquals(64, token.getPayload().byteAt(3));

    parser.next(token);
    assertEquals("NP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(1, token.getPayload().byteAt(1));
    assertEquals(2, token.getPayload().byteAt(2));
    assertEquals(4, token.getPayload().byteAt(3));

    parser.next(token);
    assertEquals("NNP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(1, token.getPayload().byteAt(1));
    assertEquals(3, token.getPayload().byteAt(2));
    assertEquals(1, token.getPayload().byteAt(3));
}

From source file:au.edu.unimelb.csse.analyser.JsonSentenceParserTest.java

License:Apache License

public void testSentenceContainingEscapedDoubleQuotes() {
    String jsonString = "{\"n\":\"S\\\"\", \"i\":\"0_32_0_65\", \"c\":[{\"n\":\"NP\", \"i\":\"0_2_1_64\", \"c\":[{\"n\":\"NP\", \"i\":\"0_1_2_4\", \"c\":[{\"n\":\"NNP\", \"i\":\"0_1_3_1\", \"c\":[{\"n\":\"Arafat\", \"i\":\"0_1_4_0\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"1_2_2_4\", \"c\":[{\"n\":\"PRP\", \"i\":\"1_2_3_3\", \"c\":[{\"n\":\"himself\", \"i\":\"1_2_4_2\", \"c\":[]}]}]}]}, {\"n\":\"VP\", \"i\":\"2_30_1_64\", \"c\":[{\"n\":\"VBD\", \"i\":\"2_3_2_61\", \"c\":[{\"n\":\"said\", \"i\":\"2_3_3_5\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"3_30_2_61\", \"c\":[{\"n\":\"S\", \"i\":\"3_30_3_60\", \"c\":[{\"n\":\"NP\", \"i\":\"3_5_4_59\", \"c\":[{\"n\":\"DT\", \"i\":\"3_4_5_8\", \"c\":[{\"n\":\"the\", \"i\":\"3_4_6_6\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"4_5_5_8\", \"c\":[{\"n\":\"award\", \"i\":\"4_5_6_7\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"5_30_4_59\", \"c\":[{\"n\":\"VBD\", \"i\":\"5_6_5_58\", \"c\":[{\"n\":\"was\", \"i\":\"5_6_6_9\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"6_7_5_58\", \"c\":[{\"n\":\"not\", \"i\":\"6_7_6_10\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"7_30_5_58\", \"c\":[{\"n\":\"VBN\", \"i\":\"7_8_6_57\", \"c\":[{\"n\":\"granted\", \"i\":\"7_8_7_11\", \"c\":[]}]}, {\"n\":\"PP\", \"i\":\"8_30_6_57\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_7_56\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_8_12\", \"c\":[]}]}, {\"n\":\"TO\", \"i\":\"9_10_7_56\", \"c\":[{\"n\":\"to\", \"i\":\"9_10_8_13\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"10_30_7_56\", \"c\":[{\"n\":\"NP\", \"i\":\"10_11_8_55\", \"c\":[{\"n\":\"NN\", \"i\":\"10_11_9_15\", \"c\":[{\"n\":\"crown\", \"i\":\"10_11_10_14\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"11_26_8_55\", \"c\":[{\"n\":\"DT\", \"i\":\"11_12_9_44\", \"c\":[{\"n\":\"an\", \"i\":\"11_12_10_16\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"12_13_9_44\", \"c\":[{\"n\":\"endeavor\", \"i\":\"12_13_10_17\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"13_26_9_44\", \"c\":[{\"n\":\"IN\", \"i\":\"13_14_10_43\", \"c\":[{\"n\":\"that\", \"i\":\"13_14_11_18\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"14_26_10_43\", \"c\":[{\"n\":\"NP\", \"i\":\"14_15_11_42\", \"c\":[{\"n\":\"PRP\", \"i\":\"14_15_12_20\", \"c\":[{\"n\":\"we\", \"i\":\"14_15_13_19\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"15_26_11_42\", \"c\":[{\"n\":\"VP\", \"i\":\"15_17_12_41\", \"c\":[{\"n\":\"VBP\", \"i\":\"15_16_13_24\", \"c\":[{\"n\":\"have\", \"i\":\"15_16_14_21\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"16_17_13_24\", \"c\":[{\"n\":\"VBN\", \"i\":\"16_17_14_23\", \"c\":[{\"n\":\"completed\", \"i\":\"16_17_15_22\", \"c\":[]}]}]}]}, {\"n\":\"CC\", \"i\":\"17_18_12_41\", \"c\":[{\"n\":\"but\", \"i\":\"17_18_13_25\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"18_19_12_41\", \"c\":[{\"n\":\"rather\", \"i\":\"18_19_13_26\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"19_26_12_41\", \"c\":[{\"n\":\"TO\", \"i\":\"19_20_13_40\", \"c\":[{\"n\":\"to\", \"i\":\"19_20_14_27\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"20_26_13_40\", \"c\":[{\"n\":\"VB\", \"i\":\"20_21_14_39\", \"c\":[{\"n\":\"encourage\", \"i\":\"20_21_15_28\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"21_26_14_39\", \"c\":[{\"n\":\"NP\", \"i\":\"21_22_15_38\", \"c\":[{\"n\":\"PRP\", \"i\":\"21_22_16_30\", \"c\":[{\"n\":\"us\", \"i\":\"21_22_17_29\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"22_26_15_38\", \"c\":[{\"n\":\"TO\", \"i\":\"22_23_16_37\", \"c\":[{\"n\":\"to\", \"i\":\"22_23_17_31\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"23_26_16_37\", \"c\":[{\"n\":\"VB\", \"i\":\"23_24_17_36\", \"c\":[{\"n\":\"continue\", \"i\":\"23_24_18_32\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"24_26_17_36\", \"c\":[{\"n\":\"DT\", 
\"i\":\"24_25_18_35\", \"c\":[{\"n\":\"a\", \"i\":\"24_25_19_33\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"25_26_18_35\", \"c\":[{\"n\":\"road\", \"i\":\"25_26_19_34\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\"SBAR\", \"i\":\"26_30_8_55\", \"c\":[{\"n\":\"WHNP\", \"i\":\"26_27_9_54\", \"c\":[{\"n\":\"WDT\", \"i\":\"26_27_10_46\", \"c\":[{\"n\":\"which\", \"i\":\"26_27_11_45\", \"c\":[]}]}]}, {\"n\":\"S\", \"i\":\"27_30_9_54\", \"c\":[{\"n\":\"NP\", \"i\":\"27_28_10_53\", \"c\":[{\"n\":\"PRP\", \"i\":\"27_28_11_48\", \"c\":[{\"n\":\"we\", \"i\":\"27_28_12_47\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"28_30_10_53\", \"c\":[{\"n\":\"VBP\", \"i\":\"28_29_11_52\", \"c\":[{\"n\":\"have\", \"i\":\"28_29_12_49\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"29_30_11_52\", \"c\":[{\"n\":\"VBN\", \"i\":\"29_30_12_51\", \"c\":[{\"n\":\"started\", \"i\":\"29_30_13_50\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\".\", \"i\":\"30_31_1_64\", \"c\":[{\"n\":\".\", \"i\":\"30_31_2_62\", \"c\":[]}]}, {\"n\":\"''\", \"i\":\"31_32_1_64\", \"c\":[{\"n\":\"''\", \"i\":\"31_32_2_63\", \"c\":[]}]}]}";
    JsonSentenceParser parser = new JsonSentenceParser(false);
    parser.parse(jsonString);
    Token token = new Token();
    parser.next(token);
    assertNotNull(token);
    assertEquals("S\"", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(32, token.getPayload().byteAt(1));
    assertEquals(0, token.getPayload().byteAt(2));
    assertEquals(65, token.getPayload().byteAt(3));
}

From source file:au.edu.unimelb.csse.analyser.String2NodesParserTest.java

License:Apache License

public void test10SentsFromWiki() throws ParseException, IOException {
    String2NodesParser parser = new String2NodesParser();
    // We won't ever have this case, because an earlier step removes spaces.
    // The newer version of String2NodesParser handles spaces more elegantly.
    // String sent1 =
    // "( NP ( NP ( NNP David ) (NNP Arthur) (NNP Wales)) (, ,) (NP (ADJP (JJ a.k.a.) (SBAR (S (NP (NP (NP (NNP David) (NNP Wales))) (CC or) (NP (NP (NNP David) (NNP Art) (NNP Wales)) (PRN (-LRB- -LRB-) (VP (VBN born) (NP (NP (CD 6) (NNP February) (CD 1964)) (, ,) (NP (NP (NNP Sydney))))) (-RRB- -RRB-)))) (VP (VBZ is) (NP (NP (DT an) (JJ Australian) (NN entrepreneur) (CC and) (NN artist))) (ADJP (JJS best) (VBN known) (PP (IN for) (S (VP (VBG creating) (NP (NP (JJ satirical) (NN cult) (NN figure))))))))))) (NNP Guru) (NNP Adrian)) (. .))";
    String sent1 = "(NP(NP(NNP David)(NNP Arthur)(NNP Wales))(, ,)(NP(ADJP(JJ a.k.a.)(SBAR(S(NP(NP(NP(NNP David)(NNP Wales)))(CC or)(NP(NP(NNP David)(NNP Art)(NNP Wales))(PRN(-LRB- -LRB-)(VP(VBN born)(NP(NP(CD 6)(NNP February)(CD 1964))(, ,)(NP(NP(NNP Sydney)))))(-RRB- -RRB-))))(VP(VBZ is)(NP(NP(DT an)(JJ Australian)(NN entrepreneur)(CC and)(NN artist)))(ADJP(JJS best)(VBN known)(PP(IN for)(S(VP(VBG creating)(NP(NP(JJ satirical)(NN cult)(NN figure)))))))))))(NNP Guru)(NNP Adrian))(. .))";
    Node parsed = parser.parse(sent1);
    assertNotNull(parsed);

    assertNode(parsed, "NP", 4, 0, 35, 0, 61);

    Node first = parsed.children.get(0);
    assertNode(first, "NP", 3, 0, 3, 1, 60);

    assertNode(first.children.get(0), "NNP", 1, 0, 1, 2, 3);
    assertNode(first.children.get(0).children.get(0), "David", 0, 0, 1, 3, 0);

    assertNode(first.children.get(1), "NNP", 1, 1, 2, 2, 3);
    assertNode(first.children.get(1).children.get(0), "Arthur", 0, 1, 2, 3, 1);

    assertNode(first.children.get(2), "NNP", 1, 2, 3, 2, 3);
    assertNode(first.children.get(2).children.get(0), "Wales", 0, 2, 3, 3, 2);

    assertNode(parsed.children.get(1), ",", 1, 3, 4, 1, 60);

    assertEquals(96, parsed.totalNumberOfNodes());

    Token token = new Token();
    for (int i = 0; i < 96; i++) {
        assertNotNull(parser.next(token));
    }

    assertNull(parser.next(token));

    String sent2 = "(S(PP(IN During)(NP(DT the)(CD 1980s)))(NP(PRP he))(VP(VBD was)(NP(NP(DT a)(JJ frequent)(NN contributor))(PP(TO to)(NP(JJ Australian)(NN radio)(NN station)(NNP Triple)(NNP Jay))))(, ,)(S(VP(VBG providing)(NP(NP(NP(NN commentary))(PP(IN on)(NP(NP(JJ pop-cultural)(NNS issues))(, ,)(PP(VBG including)(NP(NP(DT a)(JJ live)(NN report))(PP(IN from)(NP(NNP Berlin)))(SBAR(IN as)(S(NP(DT the)(NNP Berlin)(NNP Wall))(VP(VBD fell)))))))))(, ,)(CC and)(NP(NP(DT a)(JJ comic)(NN strip))(VP(VBG featuring)(NP(NNP Guru)(NNP Adrian))(PP(IN for)(NP(NP(NP(DT the)(NN station)(POS 's))(NN fanzine))(, ,)(NP(NNP Alan))))))))))(. .))";

    parsed = parser.parse(sent2);

    assertEquals(131, parsed.totalNumberOfNodes());

    for (int i = 0; i < 131; i++) {
        assertNotNull(parser.next(token));
    }

    assertNull(parser.next(token));

    String sent3 = "(S(NP(NNP Wales))(VP(VP(VBD moved)(PP(TO to)(NP(NNP New)(NNP York)))(S(VP(TO to)(VP(VB become)(NP(NP(DT a)(NN painter))(PP(IN in)(NP(CD 1989))))))))(CC and)(VP(VBD spent)(NP(DT the)(CD 90s)(NN showing))(PP(IN at)(NP(NP(JJ various)(NNP Manhattan))(CC and)(NP(NP(JJ Australian)(NNS galleries))(, ,)(PP(VBG including)(NP(NNP Roslyn)(NNP Oxley)(CD 9)(CC and)(NNP Sherman)(NNPS Galleries))))))))(. .))";

    parsed = parser.parse(sent3);

    assertEquals(84, parsed.totalNumberOfNodes());

    for (int i = 0; i < 84; i++) {
        assertNotNull(parser.next(token));
    }

    assertNull(parser.next(token));

    String sent4 = "NP(NP(CD Eight)(NNS campuses))(PP(IN outside)(NP(DT the)(NNP Guadalajara)(NNP Metropolitan)(NNP Area)))(PP(IN within)(NP(NP(DT the)(NNP State))(PP(IN of)(NP(NNP Jalisco)))))(. .)";

    parsed = parser.parse(sent4);
}

From source file:au.edu.unimelb.csse.analyser.String2NodesParserTest.java

License:Apache License

public void testReturnsNodesInOrder() throws ParseException, IOException {
    String2NodesParser tokenizer = new String2NodesParser();
    Node n = tokenizer.parse("(A(B(C D)(E F))(G H))");
    assertEquals("A", n.label());
    Token token = new Token();
    Token ret;
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("A", token.term());
    assertPayload(token, 3, 0, 0, 5);
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("B", token.term());
    assertPayload(token, 2, 0, 1, 4);
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("C", token.term());
    assertPayload(token, 1, 0, 2, 2);
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("D", token.term());
    assertPayload(token, 1, 0, 3, 0);
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("E", token.term());
    assertPayload(token, 2, 1, 2, 2);
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("F", token.term());
    assertPayload(token, 2, 1, 3, 1);
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("G", token.term());
    assertPayload(token, 3, 2, 1, 4);
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("H", token.term());
    assertPayload(token, 3, 2, 2, 3);
    ret = tokenizer.next(token);
    assertNull(ret);

    tokenizer.parse("(K(L M)(N O))");
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("K", token.term());
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("L", token.term());
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("M", token.term());
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("N", token.term());
    ret = tokenizer.next(token);
    assertNotNull(ret);
    assertEquals("O", token.term());
    ret = tokenizer.next(token);
    assertNull(ret);
}

From source file:com.flaptor.hounder.classifier.util.DocumentParser.java

License:Apache License

/**
 * Transforms the document into an array of tokens and counts the number of
 * occurrences of each token.
 * @param doc the document represented as a string
 * @param maxTuple if maxTuple > 1, tuples of 1..maxTuple tokens are returned.
 *  E.g. if the document is "t1 t2 t3 t4" and maxTuple=2, the returned map
 *  will contain values for t1, t2, t3, t4, t1_t2, t2_t3, and t3_t4.
 *  If maxTuple < 1 then maxTuple=1.
 * @return a map that binds each token to the count of its occurrences within
 *  the document
 * @see TupleTokenizer#TupleTokenizer(TokenStream, int)
 * The map should be '<String,int>', but an int can't be inserted into a Map
 * and Integer is immutable. So this awful hack uses an int[] to be able to
 * store an int and change its value easily during the calculation.
 */
public static Map<String, int[]> parse(String doc, int maxTuple) {

    // TODO: Use Integer instead int[].
    Map<String, int[]> tokenCount = new HashMap<String, int[]>();

    // TODO: Decouple from lucene, allow the analyzer to be configurable.
    // TODO: Verify that it is necessary to create a new analyzer instance each time.
    Analyzer analyzer = new StandardAnalyzer();
    Reader docReader = new StringReader(doc);
    TokenStream tokenStream = analyzer.tokenStream(null, docReader);

    try {
        if (1 < maxTuple) {
            tokenStream = new TupleTokenizer(tokenStream, maxTuple);
        }
        Token token = new Token();
        while ((token = tokenStream.next(token)) != null) {
            String term = TokenUtil.termText(token);
            int[] count = tokenCount.get(term);
            if (count == null) {
                count = new int[] { 1 }; // first occurrence of this term
                tokenCount.put(term, count);
            } else {
                count[0]++;
            }
        }
    } catch (IOException e) {
        System.err.println("parse: couldn't parse document " + e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            System.err.println("close: " + e);
        }
    }

    return tokenCount;
}
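
A short usage sketch for the method above (the document text and tuple size are illustrative only; with the TupleTokenizer wrapper, maxTuple = 2 adds bigram keys such as "t1_t2"):

    Map<String, int[]> counts = DocumentParser.parse("t1 t2 t1 t3", 2);
    int occurrences = counts.get("t1")[0]; // 2: "t1" appears twice in the document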

From source file:com.flaptor.hounder.classifier.util.TupleTokenizer.java

License:Apache License

/**
 * @param ts the TokenStream to wrap
 * @param maxTuples if maxTuples > 1, tuples of 1..maxTuples tokens are returned.
 *  E.g. if the document is "t1 t2 t3 t4" and maxTuples=2, the returned stream
 *  will be "t1 t2 t3 t4 t1_t2 t2_t3 t3_t4".
 * @throws IOException
 */
public TupleTokenizer(TokenStream ts, int maxTuples) throws IOException {

    MAX_INCREMENT = maxTuples;
    Token tk = new Token();
    while ((tk = ts.next(tk)) != null) {
        tokens.add((Token) tk.clone());
    }
}

From source file:com.flaptor.hounder.classifier.util.TupleTokenizer.java

License:Apache License

private Token mergeTokens(Token t1, Token t2) {
    if (null == t1) {
        return t2;
    }
    Token res = new Token();
    char[] text = (TokenUtil.termText(t1) + "_" + TokenUtil.termText(t2)).toCharArray();
    res.reinit(text, 0, text.length, t1.startOffset(), t2.endOffset());
    return res;
}
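
For example (illustrative values): merging a token "foo" with offsets [0,3) and a token "bar" with offsets [4,7) yields a token with term text "foo_bar", start offset 0, and end offset 7.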

From source file:com.flaptor.hounder.classifier.util.TupleTokenizerTest.java

License:Apache License

@TestInfo(testType = TestInfo.TestType.UNIT)
public void testNext() throws IOException {
    TupleTokenizer tt = new TupleTokenizer(new MockTokenStrem(), 3);
    Token t = new Token();
    assertEquals("t1", TokenUtil.termText(tt.next(t)));
    assertEquals("t2", TokenUtil.termText(tt.next(t)));
    assertEquals("t3", TokenUtil.termText(tt.next(t)));
    assertEquals("t4", TokenUtil.termText(tt.next(t)));
    assertEquals("t5", TokenUtil.termText(tt.next(t)));
    assertEquals("t6", TokenUtil.termText(tt.next(t)));
    assertEquals("t7", TokenUtil.termText(tt.next(t)));
    assertEquals("t8", TokenUtil.termText(tt.next(t)));
    assertEquals("t9", TokenUtil.termText(tt.next(t)));
    assertEquals("t10", TokenUtil.termText(tt.next(t)));

    assertEquals("t1_t2", TokenUtil.termText(tt.next(t)));
    assertEquals("t2_t3", TokenUtil.termText(tt.next(t)));
    assertEquals("t3_t4", TokenUtil.termText(tt.next(t)));
    assertEquals("t4_t5", TokenUtil.termText(tt.next(t)));
    assertEquals("t5_t6", TokenUtil.termText(tt.next(t)));
    assertEquals("t6_t7", TokenUtil.termText(tt.next(t)));
    assertEquals("t7_t8", TokenUtil.termText(tt.next(t)));
    assertEquals("t8_t9", TokenUtil.termText(tt.next(t)));
    assertEquals("t9_t10", TokenUtil.termText(tt.next(t)));

    assertEquals("t1_t2_t3", TokenUtil.termText(tt.next(t)));
    assertEquals("t2_t3_t4", TokenUtil.termText(tt.next(t)));
    assertEquals("t3_t4_t5", TokenUtil.termText(tt.next(t)));
    assertEquals("t4_t5_t6", TokenUtil.termText(tt.next(t)));
    assertEquals("t5_t6_t7", TokenUtil.termText(tt.next(t)));
    assertEquals("t6_t7_t8", TokenUtil.termText(tt.next(t)));
    assertEquals("t7_t8_t9", TokenUtil.termText(tt.next(t)));
    assertEquals("t8_t9_t10", TokenUtil.termText(tt.next(t)));

    assertNull(tt.next(t));
}

From source file:com.flaptor.hounder.searcher.query.AQuerySuggestor.java

License:Apache License

private List<AQuery> suggestLinear(AQuery query) {
    List<AQuery> queries = new ArrayList<AQuery>();
    if (null == query) {
        logger.debug("Can't make a suggestion for a null query");
    } else if (!(query instanceof LazyParsedQuery)) {
        // TODO FIXME
        logger.debug("can not make suggestions for queries of type " + query.getClass());
    } else {
        String originalString = ((LazyParsedQuery) query).getQueryString();
        StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(originalString));
        List<String> tokens = new ArrayList<String>();
        try {
            Token token = new Token();
            while (true) {
                token = tokenizer.next(token);
                if (null == token) {
                    break;
                }
                tokens.add(TokenUtil.termText((Token) token.clone()));
            }

            // for every word, suggest something
            for (int i = 0; i < tokens.size(); i++) {
                StringBuffer sb = new StringBuffer();
                //                    sb.append("\"");
                for (int j = 0; j < i; j++) {
                    sb.append(tokens.get(j));
                    sb.append(" ");
                }
                String[] suggestions = suggestor.suggestWords(tokens.get(i));
                for (String suggestion : suggestions) {
                    // generate final sb
                    StringBuffer sbf = new StringBuffer(sb);
                    sbf.append(suggestion);
                    sbf.append(" ");
                    for (int k = i + 1; k < tokens.size(); k++) {
                        sbf.append(tokens.get(k));
                        if (k + 1 < tokens.size()) {
                            sbf.append(" ");
                        }
                    }
                    //                        sbf.append("\"");
                    queries.add(new LazyParsedQuery(sbf.toString()));
                }
            }

        } catch (IOException e) {
            logger.error("Error while suggesting query", e);
            return new ArrayList<AQuery>();
        }
    }
    return queries;
}
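
For instance (an illustrative case, not from the original source): given the query "huose cat" and a suggester that returns "house" for the first token, the loop above emits the candidate query "house cat", leaving the remaining tokens in place.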