List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken
public abstract boolean incrementToken() throws IOException;
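Every implementation is consumed the same way: call reset() once, loop on incrementToken() until it returns false, then call end() and close(). A minimal sketch of that pattern (the analyzer, field name, and sample text are placeholders):

TokenStream stream = analyzer.tokenStream("field", new StringReader("some text"));
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // register attributes before reset()
stream.reset();                   // mandatory before the first incrementToken() in Lucene 4+
while (stream.incrementToken()) { // returns false once the stream is exhausted
    System.out.println(term.toString());
}
stream.end();   // records end-of-stream state such as the final offset
stream.close(); // releases resources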
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
    }
    stream.close();
    System.out.println();
}
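Each printed line groups the tokens at a single position: a token whose position increment is 0 (a stacked synonym, for example) prints on the same line as the token before it, and each bracketed entry shows the term text, its start and end character offsets, and its token type.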
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    stream.reset();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.toString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}
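A hypothetical use of this helper (the WhitespaceAnalyzer and Version constant are illustrative, not from the source):

// Whitespace tokenization should split only on spaces, preserving case.
assertAnalyzesTo(new WhitespaceAnalyzer(Version.LUCENE_47), "The quick brown fox",
        new String[] { "The", "quick", "brown", "fox" });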
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    stream.reset();
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    stream.close();
}
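When the analyzer's stop filter preserves position increments, a removed stopword shows up as an increment greater than 1 rather than as a token of its own; that gap is the positional "hole" phrase queries see.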
From source file:com.sxc.lucene.analysis.synonym.SynonymAnalyzerTest.java
License:Apache License
public void testJumps() throws Exception {
    TokenStream stream = synonymAnalyzer.tokenStream("contents", new StringReader("jumps"));
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    int i = 0;
    String[] expected = new String[] { "jumps", "hops", "leaps" };
    while (stream.incrementToken()) {
        assertEquals(expected[i], term.toString());
        int expectedPos;
        if (i == 0) {
            expectedPos = 1;
        } else {
            expectedPos = 0;
        }
        assertEquals(expectedPos, posIncr.getPositionIncrement());
        i++;
    }
    stream.close();
    assertEquals(3, i);
}
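The assertions capture how synonym injection works: the original token "jumps" advances the position by 1, while the injected "hops" and "leaps" carry a position increment of 0 and stack at the same position, so a phrase query can match through any of the three.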
From source file:com.tamingtext.classifier.bayes.BayesUpdateRequestProcessor.java
License:Apache License
public String[] tokenizeField(String fieldName, SolrInputField field) throws IOException {
    if (field == null)
        return new String[0];
    if (!(field.getValue() instanceof String))
        return new String[0];
    String input = (String) field.getValue();
    ArrayList<String> tokenList = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    ts.reset(); // required before the first incrementToken() call in Lucene 4+
    while (ts.incrementToken()) {
        tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.close();
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    return tokens;
}
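The field value is flattened to a plain String[] because Mahout's Bayes classifier consumes a document as a raw token array rather than as Lucene fields.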
From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java
License:Apache License
public static void main(String[] args) {
    log.info("Command-line arguments: " + Arrays.toString(args));
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("Input file").withShortName("i").create();
    Option modelOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create())
            .withDescription("Model to use when classifying data").withShortName("m").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }
        File inputFile = new File(cmdLine.getValue(inputOpt).toString());
        if (!inputFile.isFile()) {
            throw new IllegalArgumentException(inputFile + " does not exist or is not a file");
        }
        File modelDir = new File(cmdLine.getValue(modelOpt).toString());
        if (!modelDir.isDirectory()) {
            throw new IllegalArgumentException(modelDir + " does not exist or is not a directory");
        }
        BayesParameters p = new BayesParameters();
        p.set("basePath", modelDir.getCanonicalPath());
        Datastore ds = new InMemoryBayesDatastore(p);
        Algorithm a = new BayesAlgorithm();
        ClassifierContext ctx = new ClassifierContext(a, ds);
        ctx.initialize();
        // TODO: make the analyzer configurable
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream ts = analyzer.tokenStream(null, new InputStreamReader(new FileInputStream(inputFile), "UTF-8"));
        ts.reset(); // optional in Lucene 3.6, mandatory from Lucene 4 on
        ArrayList<String> tokens = new ArrayList<String>(1000);
        while (ts.incrementToken()) {
            tokens.add(ts.getAttribute(CharTermAttribute.class).toString());
        }
        String[] document = tokens.toArray(new String[tokens.size()]);
        ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5);
        for (ClassifierResult r : cr) {
            System.err.println(r.getLabel() + "\t" + r.getScore());
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (IOException e) {
        log.error("IOException", e);
    } catch (InvalidDatastoreException e) {
        log.error("InvalidDataStoreException", e);
    }
}
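The token array produced by the analyzer becomes the document handed to Mahout's ClassifierContext, which returns the five best-scoring labels for printing.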
From source file:com.tamingtext.qa.QuestionQParser.java
License:Apache License
@Override
public Query parse() throws ParseException {
    // Parse the question with the TreebankParser; the resulting Parse object
    // is what the classifier uses to determine the answer type.
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    String field = params.get(QUERY_FIELD);
    SchemaField sp = req.getSchema().getFieldOrNull(field);
    if (sp == null) {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
    }
    List<SpanQuery> sql = new ArrayList<SpanQuery>();
    if (mt != null) { // add the answer type to the query
        String[] parts = mt.split("\\|");
        if (parts.length == 1) {
            sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
        } else {
            for (int pi = 0; pi < parts.length; pi++) {
                sql.add(new SpanTermQuery(new Term(field, parts[pi])));
            }
        }
    }
    try {
        Analyzer analyzer = sp.getType().getQueryAnalyzer();
        TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
        while (ts.incrementToken()) { // add the original query terms to the query
            String term = ((CharTermAttribute) ts.getAttribute(CharTermAttribute.class)).toString();
            sql.add(new SpanTermQuery(new Term(field, term)));
        }
    } catch (IOException e) {
        throw new ParseException(e.getLocalizedMessage());
    }
    // query the index looking for all of the parts near each other
    return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]), params.getInt(QAParams.SLOP, 10), true);
}
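The resulting SpanNearQuery requires the answer-type terms and the question's own analyzed terms to occur within SLOP positions of each other (10 by default), in order.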
From source file:com.test.IKAnalyzerDemo.java
License:Apache License
public void test() throws IOException {
    String text = "java???"; // sample text to segment
    // build an IK analyzer in smart (coarse-grained) segmentation mode
    Analyzer anal = new IKAnalyzer(true);
    StringReader reader = new StringReader(text);
    // obtain the token stream for the sample text
    TokenStream ts = anal.tokenStream("", reader);
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() in Lucene 4+
    // iterate over the tokens and print each term, separated by "|"
    while (ts.incrementToken()) {
        System.out.print(term.toString() + "|");
    }
    reader.close();
    System.out.println();
}
From source file:com.tuplejump.stargate.lucene.query.Condition.java
License:Apache License
protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
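A hypothetical call (KeywordAnalyzer is illustrative, not from the source): KeywordAnalyzer emits the entire input as one token, so the helper returns the value intact, while a tokenizing analyzer trips the too-many-terms check.

// KeywordAnalyzer yields exactly one token, so this returns "Foo Bar".
String normalized = analyze("name", "Foo Bar", new KeywordAnalyzer());
// With a tokenizing analyzer such as StandardAnalyzer the same call would
// throw IllegalArgumentException, because "Foo Bar" analyzes to two tokens.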
From source file:com.twitter.common.text.util.TokenStreamSerializer.java
License:Apache License
/**
 * Same as above, but serializes a Lucene TokenStream.
 */
public final byte[] serialize(final org.apache.lucene.analysis.TokenStream tokenStream) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    AttributeOutputStream output = new AttributeOutputStream(baos);
    for (AttributeSerializer serializer : attributeSerializers) {
        serializer.initialize(tokenStream, CURRENT_VERSION);
    }
    int numTokens = 0;
    while (tokenStream.incrementToken()) {
        serializeAttributes(output);
        numTokens++;
    }
    output.flush();
    byte[] data = baos.toByteArray();
    baos.close();
    baos = new ByteArrayOutputStream(8 + data.length);
    output = new AttributeOutputStream(baos);
    output.writeVInt(CURRENT_VERSION.ordinal());
    output.writeInt(attributeSerializersFingerprint);
    output.writeVInt(numTokens);
    output.write(data);
    output.flush();
    return baos.toByteArray();
}
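Note the two-pass buffering: the token attributes are serialized into a temporary buffer first because numTokens is only known once the stream is exhausted; the header (format version, serializer fingerprint, token count) is then written ahead of the buffered data so a deserializer can size its read loop up front.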