List of usage examples for the org.apache.lucene.analysis.Token class
public Token()
From source file:analysis.StandardTokenizer.java
License:Apache License
@SuppressWarnings("deprecation") public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; int posIncr = 1; Token result = reusableToken;//from www . j av a2 s.c om if (tokenList.size() > 0) return tokenList.remove(); while (true) { int tokenType = scanner.getNextToken(); if (tokenType == StandardTokenizerImpl.YYEOF) { return null; } if (scanner.yylength() <= maxTokenLength) { reusableToken.clear(); reusableToken.setPositionIncrement(posIncr); scanner.getText(reusableToken); final int start = scanner.yychar(); reusableToken.setStartOffset(start); reusableToken.setEndOffset(start + reusableToken.termLength()); // This 'if' should be removed in the next release. For now, it // converts // invalid acronyms to HOST. When removed, only the 'else' part // should // remain. if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { if (replaceInvalidAcronym) { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); reusableToken.setTermLength(reusableToken.termLength() - 1); // remove // extra // '.' 
tokenType = StandardTokenizerImpl.HOST; } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); tokenType = StandardTokenizerImpl.ACRONYM; } } else { reusableToken.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); } if (tokenType == StandardTokenizerImpl.HOST || tokenType == StandardTokenizerImpl.NUM || tokenType == StandardTokenizerImpl.ALPHANUM) { Tokenizer lt = new LetterDigitBreakTokenizer(new StringReader(reusableToken.term())); Token tk = null; int st = reusableToken.startOffset(); final Token token = new Token(); while ((tk = lt.next(token)) != null) { tk.setStartOffset(tk.startOffset() + st); tk.setEndOffset(tk.endOffset() + st); tk.setType(reusableToken.type()); tokenList.add((Token) tk.clone()); } } if (tokenList.size() > 0) result = tokenList.remove(); return result; } else // When we skip a too-long term, we still increment the // position increment posIncr++; } }
From source file:au.edu.unimelb.csse.analyser.JsonSentenceParserTest.java
License:Apache License
/**
 * Parses a large JSON-encoded parse tree and verifies that the first four tokens
 * are emitted in document order with their 4-byte position payloads intact.
 * JSON schema: "n" = node label (token term), "i" = four '_'-separated position
 * numbers stored as the payload bytes, "c" = child nodes.
 */
public void testLong() {
    String jsonString = "{\"n\":\"S\", \"i\":\"0_32_0_65\", \"c\":[{\"n\":\"NP\", \"i\":\"0_2_1_64\", \"c\":[{\"n\":\"NP\", \"i\":\"0_1_2_4\", \"c\":[{\"n\":\"NNP\", \"i\":\"0_1_3_1\", \"c\":[{\"n\":\"Arafat\", \"i\":\"0_1_4_0\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"1_2_2_4\", \"c\":[{\"n\":\"PRP\", \"i\":\"1_2_3_3\", \"c\":[{\"n\":\"himself\", \"i\":\"1_2_4_2\", \"c\":[]}]}]}]}, {\"n\":\"VP\", \"i\":\"2_30_1_64\", \"c\":[{\"n\":\"VBD\", \"i\":\"2_3_2_61\", \"c\":[{\"n\":\"said\", \"i\":\"2_3_3_5\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"3_30_2_61\", \"c\":[{\"n\":\"S\", \"i\":\"3_30_3_60\", \"c\":[{\"n\":\"NP\", \"i\":\"3_5_4_59\", \"c\":[{\"n\":\"DT\", \"i\":\"3_4_5_8\", \"c\":[{\"n\":\"the\", \"i\":\"3_4_6_6\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"4_5_5_8\", \"c\":[{\"n\":\"award\", \"i\":\"4_5_6_7\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"5_30_4_59\", \"c\":[{\"n\":\"VBD\", \"i\":\"5_6_5_58\", \"c\":[{\"n\":\"was\", \"i\":\"5_6_6_9\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"6_7_5_58\", \"c\":[{\"n\":\"not\", \"i\":\"6_7_6_10\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"7_30_5_58\", \"c\":[{\"n\":\"VBN\", \"i\":\"7_8_6_57\", \"c\":[{\"n\":\"granted\", \"i\":\"7_8_7_11\", \"c\":[]}]}, {\"n\":\"PP\", \"i\":\"8_30_6_57\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_7_56\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_8_12\", \"c\":[]}]}, {\"n\":\"TO\", \"i\":\"9_10_7_56\", \"c\":[{\"n\":\"to\", \"i\":\"9_10_8_13\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"10_30_7_56\", \"c\":[{\"n\":\"NP\", \"i\":\"10_11_8_55\", \"c\":[{\"n\":\"NN\", \"i\":\"10_11_9_15\", \"c\":[{\"n\":\"crown\", \"i\":\"10_11_10_14\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"11_26_8_55\", \"c\":[{\"n\":\"DT\", \"i\":\"11_12_9_44\", \"c\":[{\"n\":\"an\", \"i\":\"11_12_10_16\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"12_13_9_44\", \"c\":[{\"n\":\"endeavor\", \"i\":\"12_13_10_17\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"13_26_9_44\", \"c\":[{\"n\":\"IN\", \"i\":\"13_14_10_43\", \"c\":[{\"n\":\"that\", \"i\":\"13_14_11_18\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"14_26_10_43\", \"c\":[{\"n\":\"NP\", \"i\":\"14_15_11_42\", \"c\":[{\"n\":\"PRP\", \"i\":\"14_15_12_20\", \"c\":[{\"n\":\"we\", \"i\":\"14_15_13_19\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"15_26_11_42\", \"c\":[{\"n\":\"VP\", \"i\":\"15_17_12_41\", \"c\":[{\"n\":\"VBP\", \"i\":\"15_16_13_24\", \"c\":[{\"n\":\"have\", \"i\":\"15_16_14_21\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"16_17_13_24\", \"c\":[{\"n\":\"VBN\", \"i\":\"16_17_14_23\", \"c\":[{\"n\":\"completed\", \"i\":\"16_17_15_22\", \"c\":[]}]}]}]}, {\"n\":\"CC\", \"i\":\"17_18_12_41\", \"c\":[{\"n\":\"but\", \"i\":\"17_18_13_25\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"18_19_12_41\", \"c\":[{\"n\":\"rather\", \"i\":\"18_19_13_26\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"19_26_12_41\", \"c\":[{\"n\":\"TO\", \"i\":\"19_20_13_40\", \"c\":[{\"n\":\"to\", \"i\":\"19_20_14_27\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"20_26_13_40\", \"c\":[{\"n\":\"VB\", \"i\":\"20_21_14_39\", \"c\":[{\"n\":\"encourage\", \"i\":\"20_21_15_28\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"21_26_14_39\", \"c\":[{\"n\":\"NP\", \"i\":\"21_22_15_38\", \"c\":[{\"n\":\"PRP\", \"i\":\"21_22_16_30\", \"c\":[{\"n\":\"us\", \"i\":\"21_22_17_29\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"22_26_15_38\", \"c\":[{\"n\":\"TO\", \"i\":\"22_23_16_37\", \"c\":[{\"n\":\"to\", \"i\":\"22_23_17_31\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"23_26_16_37\", \"c\":[{\"n\":\"VB\", \"i\":\"23_24_17_36\", \"c\":[{\"n\":\"continue\", \"i\":\"23_24_18_32\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"24_26_17_36\", \"c\":[{\"n\":\"DT\", \"i\":\"24_25_18_35\", \"c\":[{\"n\":\"a\", \"i\":\"24_25_19_33\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"25_26_18_35\", \"c\":[{\"n\":\"road\", \"i\":\"25_26_19_34\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\"SBAR\", \"i\":\"26_30_8_55\", \"c\":[{\"n\":\"WHNP\", \"i\":\"26_27_9_54\", \"c\":[{\"n\":\"WDT\", \"i\":\"26_27_10_46\", \"c\":[{\"n\":\"which\", \"i\":\"26_27_11_45\", \"c\":[]}]}]}, {\"n\":\"S\", \"i\":\"27_30_9_54\", \"c\":[{\"n\":\"NP\", \"i\":\"27_28_10_53\", \"c\":[{\"n\":\"PRP\", \"i\":\"27_28_11_48\", \"c\":[{\"n\":\"we\", \"i\":\"27_28_12_47\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"28_30_10_53\", \"c\":[{\"n\":\"VBP\", \"i\":\"28_29_11_52\", \"c\":[{\"n\":\"have\", \"i\":\"28_29_12_49\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"29_30_11_52\", \"c\":[{\"n\":\"VBN\", \"i\":\"29_30_12_51\", \"c\":[{\"n\":\"started\", \"i\":\"29_30_13_50\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\".\", \"i\":\"30_31_1_64\", \"c\":[{\"n\":\".\", \"i\":\"30_31_2_62\", \"c\":[]}]}, {\"n\":\"''\", \"i\":\"31_32_1_64\", \"c\":[{\"n\":\"''\", \"i\":\"31_32_2_63\", \"c\":[]}]}]}";
    JsonSentenceParser parser = new JsonSentenceParser(false);
    parser.parse(jsonString);
    Token token = new Token();
    parser.next(token);
    // NOTE(review): this asserts the local reference, which is always non-null;
    // asserting the return value of parser.next() would be the stronger check.
    assertNotNull(token);
    // Root node: S with payload 0_32_0_65.
    assertEquals("S", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(32, token.getPayload().byteAt(1));
    assertEquals(0, token.getPayload().byteAt(2));
    assertEquals(65, token.getPayload().byteAt(3));
    // First child: NP with payload 0_2_1_64.
    parser.next(token);
    assertEquals("NP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(2, token.getPayload().byteAt(1));
    assertEquals(1, token.getPayload().byteAt(2));
    assertEquals(64, token.getPayload().byteAt(3));
    // Grandchild: NP with payload 0_1_2_4.
    parser.next(token);
    assertEquals("NP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(1, token.getPayload().byteAt(1));
    assertEquals(2, token.getPayload().byteAt(2));
    assertEquals(4, token.getPayload().byteAt(3));
    // Great-grandchild: NNP with payload 0_1_3_1.
    parser.next(token);
    assertEquals("NNP", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(1, token.getPayload().byteAt(1));
    assertEquals(3, token.getPayload().byteAt(2));
    assertEquals(1, token.getPayload().byteAt(3));
}
From source file:au.edu.unimelb.csse.analyser.JsonSentenceParserTest.java
License:Apache License
/**
 * Verifies that an escaped double quote inside a node label (JSON {@code \"}
 * encoded as {@code \\\"} in the Java literal) survives parsing: the root label
 * must come back as the two-character term {@code S"}.
 */
public void testSentenceContainingEscapedDoubleQuotes() {
    String jsonString = "{\"n\":\"S\\\"\", \"i\":\"0_32_0_65\", \"c\":[{\"n\":\"NP\", \"i\":\"0_2_1_64\", \"c\":[{\"n\":\"NP\", \"i\":\"0_1_2_4\", \"c\":[{\"n\":\"NNP\", \"i\":\"0_1_3_1\", \"c\":[{\"n\":\"Arafat\", \"i\":\"0_1_4_0\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"1_2_2_4\", \"c\":[{\"n\":\"PRP\", \"i\":\"1_2_3_3\", \"c\":[{\"n\":\"himself\", \"i\":\"1_2_4_2\", \"c\":[]}]}]}]}, {\"n\":\"VP\", \"i\":\"2_30_1_64\", \"c\":[{\"n\":\"VBD\", \"i\":\"2_3_2_61\", \"c\":[{\"n\":\"said\", \"i\":\"2_3_3_5\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"3_30_2_61\", \"c\":[{\"n\":\"S\", \"i\":\"3_30_3_60\", \"c\":[{\"n\":\"NP\", \"i\":\"3_5_4_59\", \"c\":[{\"n\":\"DT\", \"i\":\"3_4_5_8\", \"c\":[{\"n\":\"the\", \"i\":\"3_4_6_6\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"4_5_5_8\", \"c\":[{\"n\":\"award\", \"i\":\"4_5_6_7\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"5_30_4_59\", \"c\":[{\"n\":\"VBD\", \"i\":\"5_6_5_58\", \"c\":[{\"n\":\"was\", \"i\":\"5_6_6_9\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"6_7_5_58\", \"c\":[{\"n\":\"not\", \"i\":\"6_7_6_10\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"7_30_5_58\", \"c\":[{\"n\":\"VBN\", \"i\":\"7_8_6_57\", \"c\":[{\"n\":\"granted\", \"i\":\"7_8_7_11\", \"c\":[]}]}, {\"n\":\"PP\", \"i\":\"8_30_6_57\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_7_56\", \"c\":[{\"n\":\"``\", \"i\":\"8_9_8_12\", \"c\":[]}]}, {\"n\":\"TO\", \"i\":\"9_10_7_56\", \"c\":[{\"n\":\"to\", \"i\":\"9_10_8_13\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"10_30_7_56\", \"c\":[{\"n\":\"NP\", \"i\":\"10_11_8_55\", \"c\":[{\"n\":\"NN\", \"i\":\"10_11_9_15\", \"c\":[{\"n\":\"crown\", \"i\":\"10_11_10_14\", \"c\":[]}]}]}, {\"n\":\"NP\", \"i\":\"11_26_8_55\", \"c\":[{\"n\":\"DT\", \"i\":\"11_12_9_44\", \"c\":[{\"n\":\"an\", \"i\":\"11_12_10_16\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"12_13_9_44\", \"c\":[{\"n\":\"endeavor\", \"i\":\"12_13_10_17\", \"c\":[]}]}, {\"n\":\"SBAR\", \"i\":\"13_26_9_44\", \"c\":[{\"n\":\"IN\", \"i\":\"13_14_10_43\", \"c\":[{\"n\":\"that\", \"i\":\"13_14_11_18\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"14_26_10_43\", \"c\":[{\"n\":\"NP\", \"i\":\"14_15_11_42\", \"c\":[{\"n\":\"PRP\", \"i\":\"14_15_12_20\", \"c\":[{\"n\":\"we\", \"i\":\"14_15_13_19\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"15_26_11_42\", \"c\":[{\"n\":\"VP\", \"i\":\"15_17_12_41\", \"c\":[{\"n\":\"VBP\", \"i\":\"15_16_13_24\", \"c\":[{\"n\":\"have\", \"i\":\"15_16_14_21\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"16_17_13_24\", \"c\":[{\"n\":\"VBN\", \"i\":\"16_17_14_23\", \"c\":[{\"n\":\"completed\", \"i\":\"16_17_15_22\", \"c\":[]}]}]}]}, {\"n\":\"CC\", \"i\":\"17_18_12_41\", \"c\":[{\"n\":\"but\", \"i\":\"17_18_13_25\", \"c\":[]}]}, {\"n\":\"RB\", \"i\":\"18_19_12_41\", \"c\":[{\"n\":\"rather\", \"i\":\"18_19_13_26\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"19_26_12_41\", \"c\":[{\"n\":\"TO\", \"i\":\"19_20_13_40\", \"c\":[{\"n\":\"to\", \"i\":\"19_20_14_27\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"20_26_13_40\", \"c\":[{\"n\":\"VB\", \"i\":\"20_21_14_39\", \"c\":[{\"n\":\"encourage\", \"i\":\"20_21_15_28\", \"c\":[]}]}, {\"n\":\"S\", \"i\":\"21_26_14_39\", \"c\":[{\"n\":\"NP\", \"i\":\"21_22_15_38\", \"c\":[{\"n\":\"PRP\", \"i\":\"21_22_16_30\", \"c\":[{\"n\":\"us\", \"i\":\"21_22_17_29\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"22_26_15_38\", \"c\":[{\"n\":\"TO\", \"i\":\"22_23_16_37\", \"c\":[{\"n\":\"to\", \"i\":\"22_23_17_31\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"23_26_16_37\", \"c\":[{\"n\":\"VB\", \"i\":\"23_24_17_36\", \"c\":[{\"n\":\"continue\", \"i\":\"23_24_18_32\", \"c\":[]}]}, {\"n\":\"NP\", \"i\":\"24_26_17_36\", \"c\":[{\"n\":\"DT\", \"i\":\"24_25_18_35\", \"c\":[{\"n\":\"a\", \"i\":\"24_25_19_33\", \"c\":[]}]}, {\"n\":\"NN\", \"i\":\"25_26_18_35\", \"c\":[{\"n\":\"road\", \"i\":\"25_26_19_34\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\"SBAR\", \"i\":\"26_30_8_55\", \"c\":[{\"n\":\"WHNP\", \"i\":\"26_27_9_54\", \"c\":[{\"n\":\"WDT\", \"i\":\"26_27_10_46\", \"c\":[{\"n\":\"which\", \"i\":\"26_27_11_45\", \"c\":[]}]}]}, {\"n\":\"S\", \"i\":\"27_30_9_54\", \"c\":[{\"n\":\"NP\", \"i\":\"27_28_10_53\", \"c\":[{\"n\":\"PRP\", \"i\":\"27_28_11_48\", \"c\":[{\"n\":\"we\", \"i\":\"27_28_12_47\", \"c\":[]}]}]}, {\"n\":\"VP\", \"i\":\"28_30_10_53\", \"c\":[{\"n\":\"VBP\", \"i\":\"28_29_11_52\", \"c\":[{\"n\":\"have\", \"i\":\"28_29_12_49\", \"c\":[]}]}, {\"n\":\"VP\", \"i\":\"29_30_11_52\", \"c\":[{\"n\":\"VBN\", \"i\":\"29_30_12_51\", \"c\":[{\"n\":\"started\", \"i\":\"29_30_13_50\", \"c\":[]}]}]}]}]}]}]}]}]}]}]}]}]}, {\"n\":\".\", \"i\":\"30_31_1_64\", \"c\":[{\"n\":\".\", \"i\":\"30_31_2_62\", \"c\":[]}]}, {\"n\":\"''\", \"i\":\"31_32_1_64\", \"c\":[{\"n\":\"''\", \"i\":\"31_32_2_63\", \"c\":[]}]}]}";
    JsonSentenceParser parser = new JsonSentenceParser(false);
    parser.parse(jsonString);
    Token token = new Token();
    parser.next(token);
    // NOTE(review): this asserts the local reference, which is always non-null;
    // asserting the return value of parser.next() would be the stronger check.
    assertNotNull(token);
    // Root label must round-trip the embedded quote: S" with payload 0_32_0_65.
    assertEquals("S\"", token.term());
    assertEquals(0, token.getPayload().byteAt(0));
    assertEquals(32, token.getPayload().byteAt(1));
    assertEquals(0, token.getPayload().byteAt(2));
    assertEquals(65, token.getPayload().byteAt(3));
}
From source file:au.edu.unimelb.csse.analyser.String2NodesParserTest.java
License:Apache License
/**
 * Feeds several Penn-Treebank-style bracketed sentences through
 * String2NodesParser and checks (a) selected node labels/positions on the first
 * tree and (b) that next() yields exactly totalNumberOfNodes() tokens per tree
 * before returning null.
 */
public void test10SentsFromWiki() throws ParseException, IOException {
    String2NodesParser parser = new String2NodesParser();
    //we wont have this case ever because the earlier step removes spaces..
    //the newer version of string2nodesparser handles spaces more elegantly
    // String sent1 =
    // "( NP ( NP ( NNP David ) (NNP Arthur) (NNP Wales)) (, ,) (NP (ADJP (JJ a.k.a.) (SBAR (S (NP (NP (NP (NNP David) (NNP Wales))) (CC or) (NP (NP (NNP David) (NNP Art) (NNP Wales)) (PRN (-LRB- -LRB-) (VP (VBN born) (NP (NP (CD 6) (NNP February) (CD 1964)) (, ,) (NP (NP (NNP Sydney))))) (-RRB- -RRB-)))) (VP (VBZ is) (NP (NP (DT an) (JJ Australian) (NN entrepreneur) (CC and) (NN artist))) (ADJP (JJS best) (VBN known) (PP (IN for) (S (VP (VBG creating) (NP (NP (JJ satirical) (NN cult) (NN figure))))))))))) (NNP Guru) (NNP Adrian)) (. .))";
    String sent1 = "(NP(NP(NNP David)(NNP Arthur)(NNP Wales))(, ,)(NP(ADJP(JJ a.k.a.)(SBAR(S(NP(NP(NP(NNP David)(NNP Wales)))(CC or)(NP(NP(NNP David)(NNP Art)(NNP Wales))(PRN(-LRB- -LRB-)(VP(VBN born)(NP(NP(CD 6)(NNP February)(CD 1964))(, ,)(NP(NP(NNP Sydney)))))(-RRB- -RRB-))))(VP(VBZ is)(NP(NP(DT an)(JJ Australian)(NN entrepreneur)(CC and)(NN artist)))(ADJP(JJS best)(VBN known)(PP(IN for)(S(VP(VBG creating)(NP(NP(JJ satirical)(NN cult)(NN figure)))))))))))(NNP Guru)(NNP Adrian))(. .))";
    Node parsed = parser.parse(sent1);
    assertNotNull(parsed);
    // Spot-check the root and the first NP subtree (label, child count, positions).
    assertNode(parsed, "NP", 4, 0, 35, 0, 61);
    Node first = parsed.children.get(0);
    assertNode(first, "NP", 3, 0, 3, 1, 60);
    assertNode(first.children.get(0), "NNP", 1, 0, 1, 2, 3);
    assertNode(first.children.get(0).children.get(0), "David", 0, 0, 1, 3, 0);
    assertNode(first.children.get(1), "NNP", 1, 1, 2, 2, 3);
    assertNode(first.children.get(1).children.get(0), "Arthur", 0, 1, 2, 3, 1);
    assertNode(first.children.get(2), "NNP", 1, 2, 3, 2, 3);
    assertNode(first.children.get(2).children.get(0), "Wales", 0, 2, 3, 3, 2);
    assertNode(parsed.children.get(1), ",", 1, 3, 4, 1, 60);
    assertEquals(96, parsed.totalNumberOfNodes());
    // The token stream must yield exactly one token per node, then null.
    Token token = new Token();
    for (int i = 0; i < 96; i++) {
        assertNotNull(parser.next(token));
    }
    assertNull(parser.next(token));
    String sent2 = "(S(PP(IN During)(NP(DT the)(CD 1980s)))(NP(PRP he))(VP(VBD was)(NP(NP(DT a)(JJ frequent)(NN contributor))(PP(TO to)(NP(JJ Australian)(NN radio)(NN station)(NNP Triple)(NNP Jay))))(, ,)(S(VP(VBG providing)(NP(NP(NP(NN commentary))(PP(IN on)(NP(NP(JJ pop-cultural)(NNS issues))(, ,)(PP(VBG including)(NP(NP(DT a)(JJ live)(NN report))(PP(IN from)(NP(NNP Berlin)))(SBAR(IN as)(S(NP(DT the)(NNP Berlin)(NNP Wall))(VP(VBD fell)))))))))(, ,)(CC and)(NP(NP(DT a)(JJ comic)(NN strip))(VP(VBG featuring)(NP(NNP Guru)(NNP Adrian))(PP(IN for)(NP(NP(NP(DT the)(NN station)(POS 's))(NN fanzine))(, ,)(NP(NNP Alan))))))))))(. .))";
    parsed = parser.parse(sent2);
    assertEquals(131, parsed.totalNumberOfNodes());
    for (int i = 0; i < 131; i++) {
        assertNotNull(parser.next(token));
    }
    assertNull(parser.next(token));
    String sent3 = "(S(NP(NNP Wales))(VP(VP(VBD moved)(PP(TO to)(NP(NNP New)(NNP York)))(S(VP(TO to)(VP(VB become)(NP(NP(DT a)(NN painter))(PP(IN in)(NP(CD 1989))))))))(CC and)(VP(VBD spent)(NP(DT the)(CD 90s)(NN showing))(PP(IN at)(NP(NP(JJ various)(NNP Manhattan))(CC and)(NP(NP(JJ Australian)(NNS galleries))(, ,)(PP(VBG including)(NP(NNP Roslyn)(NNP Oxley)(CD 9)(CC and)(NNP Sherman)(NNPS Galleries))))))))(. .))";
    parsed = parser.parse(sent3);
    assertEquals(84, parsed.totalNumberOfNodes());
    for (int i = 0; i < 84; i++) {
        assertNotNull(parser.next(token));
    }
    assertNull(parser.next(token));
    // NOTE(review): sent4 lacks the leading '(' of the other sentences and has no
    // assertions after parsing — presumably exercising lenient input; confirm intent.
    String sent4 = "NP(NP(CD Eight)(NNS campuses))(PP(IN outside)(NP(DT the)(NNP Guadalajara)(NNP Metropolitan)(NNP Area)))(PP(IN within)(NP(NP(DT the)(NNP State))(PP(IN of)(NP(NNP Jalisco)))))(. .)";
    parsed = parser.parse(sent4);
}
From source file:au.edu.unimelb.csse.analyser.String2NodesParserTest.java
License:Apache License
public void testReturnsNodesInOrder() throws ParseException, IOException { String2NodesParser tokenizer = new String2NodesParser(); Node n = tokenizer.parse("(A(B(C D)(E F))(G H))"); assertEquals("A", n.label()); Token token = new Token(); Token ret;//from w w w. j a v a 2 s. co m ret = tokenizer.next(token); assertNotNull(ret); assertEquals("A", token.term()); assertPayload(token, 3, 0, 0, 5); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("B", token.term()); assertPayload(token, 2, 0, 1, 4); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("C", token.term()); assertPayload(token, 1, 0, 2, 2); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("D", token.term()); assertPayload(token, 1, 0, 3, 0); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("E", token.term()); assertPayload(token, 2, 1, 2, 2); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("F", token.term()); assertPayload(token, 2, 1, 3, 1); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("G", token.term()); assertPayload(token, 3, 2, 1, 4); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("H", token.term()); assertPayload(token, 3, 2, 2, 3); ret = tokenizer.next(token); assertNull(ret); tokenizer.parse("(K(L M)(N O))"); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("K", token.term()); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("L", token.term()); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("M", token.term()); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("N", token.term()); ret = tokenizer.next(token); assertNotNull(ret); assertEquals("O", token.term()); ret = tokenizer.next(token); assertNull(ret); }
From source file:com.flaptor.hounder.classifier.util.DocumentParser.java
License:Apache License
/** * Transforms the document in an array of tokens and counts the number of * ocurrencies of each token.//from w ww . j av a 2 s . c o m * @param doc the document represented as a string * @param maxTuple If maxTuple>1 then tuples of 1..maxTuples will be return. * Ie if the document is "t1 t2 t3 t4" and maxTuple=2, then the returned * map will contain values for each fo the following: t1, t2, t1_t2, t2_t3 * If maxTuple <1 then maxTuple=1. * @return a map that binds each token with the count of ocurrencies within * the document * @see {@link TupleTokenizer}{@link #parse(String, int)} * The map should be '<String,int>'. But int can't be inserted to a Map, and * Integer is unmodifiable. So this awful hack uses an int[] to be able to * add an int and change it's value easily during the calculation. */ public static Map<String, int[]> parse(String doc, int maxTuple) { // TODO: Use Integer instead int[]. Map<String, int[]> tokenCount = new HashMap<String, int[]>(); // TODO: Decouple from lucene, allow the analyzer to be configurable. // TODO: Verifiy that it is necessary to create a new analyzer instance each time. Analyzer analyzer = new StandardAnalyzer(); Reader docReader = new StringReader(doc); TokenStream tokenStream = analyzer.tokenStream(null, docReader); try { if (1 < maxTuple) { tokenStream = new TupleTokenizer(tokenStream, maxTuple); } Token token = new Token(); while ((token = tokenStream.next(token)) != null) { String term = TokenUtil.termText(token); int[] count = tokenCount.get(term); if (count == null) { count = new int[] { 0 }; tokenCount.put(term, count); } else { count[0]++; } } } catch (IOException e) { System.err.println("parse: couldn't parse document " + e); } finally { try { tokenStream.close(); } catch (IOException e) { System.err.println("close: " + e); } } return tokenCount; }
From source file:com.flaptor.hounder.classifier.util.TupleTokenizer.java
License:Apache License
/**
 * Wraps a TokenStream, eagerly draining it so tuples can later be emitted.
 *
 * @param ts the TokenStream to wrap
 * @param maxTuples if maxTuples &gt; 1 then tuples of 1..maxTuples tokens will
 *        be returned; i.e. for the document "t1 t2 t3 t4" and maxTuples=2 the
 *        output is "t1 t2 t3 t4 t1_t2 t2_t3 t3_t4"
 * @throws IOException if reading from the wrapped stream fails
 */
public TupleTokenizer(TokenStream ts, int maxTuples) throws IOException {
    MAX_INCREMENT = maxTuples;
    // Drain the wrapped stream up front, keeping a private clone of each token
    // (the stream reuses the buffer instance between calls).
    Token buffer = new Token();
    for (Token produced = ts.next(buffer); produced != null; produced = ts.next(buffer)) {
        tokens.add((Token) produced.clone());
    }
}
From source file:com.flaptor.hounder.classifier.util.TupleTokenizer.java
License:Apache License
/**
 * Joins two tokens into one whose text is {@code t1 + "_" + t2}, spanning from
 * t1's start offset to t2's end offset. When {@code t1} is null, {@code t2} is
 * returned unchanged (the seed case when building up a tuple).
 */
private Token mergeTokens(Token t1, Token t2) {
    if (null == t1) {
        return t2;
    }
    char[] joined = (TokenUtil.termText(t1) + "_" + TokenUtil.termText(t2)).toCharArray();
    Token merged = new Token();
    merged.reinit(joined, 0, joined.length, t1.startOffset(), t2.endOffset());
    return merged;
}
From source file:com.flaptor.hounder.classifier.util.TupleTokenizerTest.java
License:Apache License
/**
 * Verifies the emission order for maxTuples=3 over a ten-token stream:
 * all singles first, then all pairs, then all triples, then null.
 */
@TestInfo(testType = TestInfo.TestType.UNIT) public void testNext() throws IOException {
    TupleTokenizer tt = new TupleTokenizer(new MockTokenStrem(), 3);
    Token t = new Token();
    String[] expected = {
        // 1-tuples
        "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "t10",
        // 2-tuples
        "t1_t2", "t2_t3", "t3_t4", "t4_t5", "t5_t6", "t6_t7", "t7_t8", "t8_t9", "t9_t10",
        // 3-tuples
        "t1_t2_t3", "t2_t3_t4", "t3_t4_t5", "t4_t5_t6", "t5_t6_t7", "t6_t7_t8", "t7_t8_t9", "t8_t9_t10"
    };
    for (String term : expected) {
        assertEquals(term, TokenUtil.termText(tt.next(t)));
    }
    // Stream is exhausted after the last 3-tuple.
    assertNull(tt.next(t));
}
From source file:com.flaptor.hounder.searcher.query.AQuerySuggestor.java
License:Apache License
private List<AQuery> suggestLinear(AQuery query) { List<AQuery> queries = new ArrayList<AQuery>(); if (null == query) { logger.debug("Can't make a suggestion for a null query"); } else if (!(query instanceof LazyParsedQuery)) { // TODO FIXME logger.debug("can not make suggestions for queries of type " + query.getClass()); } else {/* w w w. jav a 2 s .co m*/ String originalString = ((LazyParsedQuery) query).getQueryString(); StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(originalString)); List<String> tokens = new ArrayList<String>(); try { Token token = new Token(); while (true) { token = tokenizer.next(token); if (null == token) { break; } tokens.add(TokenUtil.termText((Token) token.clone())); } // for every word, suggest something for (int i = 0; i < tokens.size(); i++) { StringBuffer sb = new StringBuffer(); // sb.append("\""); for (int j = 0; j < i; j++) { sb.append(tokens.get(j)); sb.append(" "); } String[] suggestions = suggestor.suggestWords(tokens.get(i)); for (String suggestion : suggestions) { // generate final sb StringBuffer sbf = new StringBuffer(sb); sbf.append(suggestion); sbf.append(" "); for (int k = i + 1; k < tokens.size(); k++) { sbf.append(tokens.get(k)); if (k + 1 < tokens.size()) { sbf.append(" "); } } // sbf.append("\""); queries.add(new LazyParsedQuery(sbf.toString())); } } } catch (IOException e) { logger.error("Error while suggesting query", e); return new ArrayList<AQuery>(); } } return queries; }