List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; value; the method returns the instance of the requested attribute contained in this stream, and throws IllegalArgumentException if the attribute is not present.
From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java
License:Open Source License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages, final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode, final int[] expectedPos) throws Exception { final TokenStream t = a.tokenStream("", new StringReader(input)); t.reset();//from w w w. java 2s. c o m assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = null; if (expectedTypes != null) { assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class)); typeAtt = t.getAttribute(TypeAttribute.class); } PositionIncrementAttribute posIncrAtt = null; if (expectedPosIncrs != null) { assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class)); posIncrAtt = t.getAttribute(PositionIncrementAttribute.class); } NodeAttribute nodeAtt = null; if (expectedNode != null) { assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class)); nodeAtt = t.getAttribute(NodeAttribute.class); } PositionAttribute posAtt = null; if (expectedPos != null) { assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class)); posAtt = t.getAttribute(PositionAttribute.class); } for (int i = 0; i < expectedImages.length; i++) { assertTrue("token " + i + " exists", t.incrementToken()); assertEquals("i=" + i, expectedImages[i], termAtt.toString()); if (expectedTypes != null) { assertEquals(expectedTypes[i], typeAtt.type()); } if (expectedPosIncrs != null) { assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement()); } if (expectedNode != null) { assertEquals(expectedNode[i], nodeAtt.node()); } if (expectedPos != null) { assertEquals(expectedPos[i], posAtt.position()); } } assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken()); t.end(); t.close(); }
From source file:com.sindicetech.siren.analysis.TestConciseJsonAnalyzer.java
License:Open Source License
@Test public void testNumeric() throws Exception { _a.registerDatatype(XSDDatatype.XSD_LONG.toCharArray(), new LongNumericAnalyzer(64)); final TokenStream t = _a.tokenStream("", new StringReader("{ \"a\" : 12 }")); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); t.reset();/* w w w . jav a2 s .com*/ assertTrue(t.incrementToken()); assertTrue(termAtt.toString().startsWith("a:")); t.end(); t.close(); }
From source file:com.sindicetech.siren.analysis.TestMockSirenAnalyzer.java
License:Open Source License
@Test public void testMockSirenAnalyzer() throws IOException { final MockSirenDocument doc = doc(token("aaa", node(1)), token("aaa", node(1, 0)), token("aaa", node(1))); final MockSirenAnalyzer analyzer = new MockSirenAnalyzer(); final TokenStream ts = analyzer.tokenStream("", new MockSirenReader(doc)); assertTrue(ts.incrementToken());/* www. j av a 2 s . c o m*/ assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString()); assertEquals(node(1), ts.getAttribute(NodeAttribute.class).node()); assertEquals(0, ts.getAttribute(PositionAttribute.class).position()); assertTrue(ts.incrementToken()); assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString()); assertEquals(node(1), ts.getAttribute(NodeAttribute.class).node()); assertEquals(1, ts.getAttribute(PositionAttribute.class).position()); assertTrue(ts.incrementToken()); assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString()); assertEquals(node(1, 0), ts.getAttribute(NodeAttribute.class).node()); assertEquals(0, ts.getAttribute(PositionAttribute.class).position()); }
From source file:com.sindicetech.siren.solr.analysis.BaseSirenStreamTestCase.java
License:Open Source License
public void assertTokenStreamContents(final TokenStream stream, final String[] expectedImages) throws Exception { assertTrue("has TermAttribute", stream.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); stream.reset();//from ww w.j a v a2 s . c o m for (int i = 0; i < expectedImages.length; i++) { stream.clearAttributes(); assertTrue("token " + i + " does not exists", stream.incrementToken()); assertEquals(expectedImages[i], termAtt.toString()); } assertFalse("end of stream", stream.incrementToken()); stream.end(); stream.close(); }
From source file:com.stratio.cassandra.index.query.Condition.java
License:Apache License
protected String analyze(String field, String value, ColumnMapper<?> columnMapper) { TokenStream source = null; try {// w w w. ja va2 s . com Analyzer analyzer = columnMapper.analyzer(); source = analyzer.tokenStream(field, value); source.reset(); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); BytesRef bytes = termAtt.getBytesRef(); if (!source.incrementToken()) { return null; } termAtt.fillBytesRef(); if (source.incrementToken()) { throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value); } source.end(); return BytesRef.deepCopyOf(bytes).utf8ToString(); } catch (IOException e) { throw new RuntimeException("Error analyzing multiTerm term: " + value, e); } finally { IOUtils.closeWhileHandlingException(source); } }
From source file:com.stratio.cassandra.lucene.schema.analysis.SnowballAnalyzerBuilderTest.java
License:Apache License
private List<String> analyze(String value, Analyzer analyzer) { List<String> result = new ArrayList<>(); TokenStream stream = null; try {/* w w w . j a va2 s . c om*/ stream = analyzer.tokenStream(null, value); stream.reset(); while (stream.incrementToken()) { String analyzedValue = stream.getAttribute(CharTermAttribute.class).toString(); result.add(analyzedValue); } } catch (Exception e) { throw new RuntimeException(e); } finally { IOUtils.closeWhileHandlingException(stream); } return result; }
From source file:com.tamingtext.classifier.bayes.BayesUpdateRequestProcessor.java
License:Apache License
public String[] tokenizeField(String fieldName, SolrInputField field) throws IOException { if (field == null) return new String[0]; if (!(field.getValue() instanceof String)) return new String[0]; //<start id="mahout.bayes.tokenize"/> String input = (String) field.getValue(); ArrayList<String> tokenList = new ArrayList<String>(); TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input)); while (ts.incrementToken()) { tokenList.add(ts.getAttribute(CharTermAttribute.class).toString()); }/* ww w . j a va 2 s. c om*/ String[] tokens = tokenList.toArray(new String[tokenList.size()]); //<end id="mahout.bayes.tokenize"/> return tokens; }
From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java
License:Apache License
public static void main(String[] args) { log.info("Command-line arguments: " + Arrays.toString(args)); DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder.withLongName("input").withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("Input file").withShortName("i").create(); Option modelOpt = obuilder.withLongName("model").withRequired(true) .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create()) .withDescription("Model to use when classifying data").withShortName("m").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create();//from ww w. ja va 2 s . c o m Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File inputFile = new File(cmdLine.getValue(inputOpt).toString()); if (!inputFile.isFile()) { throw new IllegalArgumentException(inputFile + " does not exist or is not a file"); } File modelDir = new File(cmdLine.getValue(modelOpt).toString()); if (!modelDir.isDirectory()) { throw new IllegalArgumentException(modelDir + " does not exist or is not a directory"); } BayesParameters p = new BayesParameters(); p.set("basePath", modelDir.getCanonicalPath()); Datastore ds = new InMemoryBayesDatastore(p); Algorithm a = new BayesAlgorithm(); ClassifierContext ctx = new ClassifierContext(a, ds); ctx.initialize(); //TODO: make the analyzer configurable StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); TokenStream ts = analyzer.tokenStream(null, new InputStreamReader(new FileInputStream(inputFile), "UTF-8")); ArrayList<String> tokens = new 
ArrayList<String>(1000); while (ts.incrementToken()) { tokens.add(ts.getAttribute(CharTermAttribute.class).toString()); } String[] document = tokens.toArray(new String[tokens.size()]); ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5); for (ClassifierResult r : cr) { System.err.println(r.getLabel() + "\t" + r.getScore()); } } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (IOException e) { log.error("IOException", e); } catch (InvalidDatastoreException e) { log.error("InvalidDataStoreException", e); } finally { } }
From source file:com.tamingtext.qa.QuestionQParser.java
License:Apache License
@Override public Query parse() throws ParseException { //<start id="qqp.parse"/> Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];//<co id="qqp.parseLine"/> /*/* ww w.j a v a2 s . co m*/ <calloutlist> <callout arearefs="qqp.parseLine"><para>Parse the question using the <classname>TreebankParser</classname>. The resulting <classname>Parse</classname> object can then be utilized by the classifier to determine the Answer Type.</para></callout> </calloutlist> */ //<end id="qqp.parse"/> //<start id="qqp.answerType"/> String type = atc.computeAnswerType(parse); String mt = atm.get(type); //<end id="qqp.answerType"/> String field = params.get(QUERY_FIELD); SchemaField sp = req.getSchema().getFieldOrNull(field); if (sp == null) { throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field); } //<start id="qqp.query"/> List<SpanQuery> sql = new ArrayList<SpanQuery>(); if (mt != null) {//<co id="qqp.handleAT"/> String[] parts = mt.split("\\|"); if (parts.length == 1) { sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase()))); } else { for (int pi = 0; pi < parts.length; pi++) { sql.add(new SpanTermQuery(new Term(field, parts[pi]))); } } } try { Analyzer analyzer = sp.getType().getQueryAnalyzer(); TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr)); while (ts.incrementToken()) {//<co id="qqp.addTerms"/> String term = ((CharTermAttribute) ts.getAttribute(CharTermAttribute.class)).toString(); sql.add(new SpanTermQuery(new Term(field, term))); } } catch (IOException e) { throw new ParseException(e.getLocalizedMessage()); } return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]), params.getInt(QAParams.SLOP, 10), true);//<co id="qqp.spanNear"/> /* <calloutlist> <callout arearefs="qqp.handleAT"><para>Add the AnswerType to the query</para></callout> <callout arearefs="qqp.addTerms"><para>Add the original query terms to the query</para></callout> <callout arearefs="qqp.spanNear"><para>Query the index looking for all of the parts 
near each other</para></callout> </calloutlist> */ //<end id="qqp.query"/> }
From source file:com.test.IKAnalyzerDemo.java
License:Apache License
public void test() throws IOException { String text = "java???"; //? /* w w w. j av a 2 s . c o m*/ Analyzer anal = new IKAnalyzer(true); StringReader reader = new StringReader(text); //? TokenStream ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); //???? while (ts.incrementToken()) { System.out.print(term.toString() + "|"); } reader.close(); System.out.println(); }