Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage of the org.apache.lucene.analysis.TokenStream#incrementToken method.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token; it returns true while more tokens are available and false once the stream is exhausted.
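
A consumer typically brackets the incrementToken() loop with reset(), end(), and close(). The sketch below shows that workflow; the helper name printTokens and the field name "contents" are illustrative, and it assumes Lucene 4.x or later (where TokenStream implements Closeable), with org.apache.lucene.analysis and org.apache.lucene.analysis.tokenattributes imported.

public static void printTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream stream = analyzer.tokenStream("contents", new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                   // mandatory before the first incrementToken() call
        while (stream.incrementToken()) { // returns false once the stream is exhausted
            System.out.println(term.toString());
        }
        stream.end();                     // record end-of-stream state (e.g., the final offset)
    }                                     // try-with-resources closes the stream
}

Skipping reset() violates the TokenStream contract; most built-in streams detect this and throw an IllegalStateException on the first incrementToken() call.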

Usage

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", // obtain a TokenStream from the analyzer
            new StringReader(text));
    stream.reset();

    // Register the attributes of interest before consuming the stream.
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr =
            stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    int position = 0;
    while (stream.incrementToken()) { // iterate over all tokens

        int increment = posIncr.getPositionIncrement(); // compute the token's position
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }

        // Print the term text, offsets, and token type.
        System.out.print("[" +
                term + ":" +
                offset.startOffset() + "->" +
                offset.endOffset() + ":" +
                type.type() + "] ");
    }
    }
    stream.close();
    System.out.println();
}

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    stream.reset();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.toString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    stream.reset();
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    stream.close();
}

From source file:com.sxc.lucene.analysis.synonym.SynonymAnalyzerTest.java

License:Apache License

public void testJumps() throws Exception {
    TokenStream stream = synonymAnalyzer.tokenStream("contents", // analyze the single word "jumps"
            new StringReader("jumps"));
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int i = 0;
    String[] expected = new String[] { "jumps", // the synonym filter should inject
            "hops",                             // "hops" and "leaps" alongside
            "leaps" };                          // the original token
    while (stream.incrementToken()) {
        assertEquals(expected[i], term.toString());

        // Only the original token advances the position; injected synonyms
        // occupy the same position, so their increment is 0.
        int expectedPos;
        if (i == 0) {
            expectedPos = 1;
        } else {
            expectedPos = 0;
        }
        assertEquals(expectedPos,
                posIncr.getPositionIncrement());
        i++;
    }
        i++;
    }
    stream.close();
    assertEquals(3, i);
}

From source file:com.tamingtext.classifier.bayes.BayesUpdateRequestProcessor.java

License:Apache License

public String[] tokenizeField(String fieldName, SolrInputField field) throws IOException {
    if (field == null)
        return new String[0];
    if (!(field.getValue() instanceof String))
        return new String[0];
    String input = (String) field.getValue();

    ArrayList<String> tokenList = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    ts.reset(); // required before the first incrementToken() call
    while (ts.incrementToken()) {
        tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.close();
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    return tokens;
}

From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java

License:Apache License

public static void main(String[] args) {
    log.info("Command-line arguments: " + Arrays.toString(args));

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("Input file").withShortName("i").create();

    Option modelOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create())
            .withDescription("Model to use when classifying data").withShortName("m").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File inputFile = new File(cmdLine.getValue(inputOpt).toString());

        if (!inputFile.isFile()) {
            throw new IllegalArgumentException(inputFile + " does not exist or is not a file");
        }

        File modelDir = new File(cmdLine.getValue(modelOpt).toString());

        if (!modelDir.isDirectory()) {
            throw new IllegalArgumentException(modelDir + " does not exist or is not a directory");
        }

        BayesParameters p = new BayesParameters();
        p.set("basePath", modelDir.getCanonicalPath());
        Datastore ds = new InMemoryBayesDatastore(p);
        Algorithm a = new BayesAlgorithm();
        ClassifierContext ctx = new ClassifierContext(a, ds);
        ctx.initialize();

        //TODO: make the analyzer configurable
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream ts = analyzer.tokenStream(null,
                new InputStreamReader(new FileInputStream(inputFile), "UTF-8"));

        ArrayList<String> tokens = new ArrayList<String>(1000);
        ts.reset(); // required before the first incrementToken() call
        while (ts.incrementToken()) {
            tokens.add(ts.getAttribute(CharTermAttribute.class).toString());
        }
        ts.close();
        String[] document = tokens.toArray(new String[tokens.size()]);

        ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5);

        for (ClassifierResult r : cr) {
            System.err.println(r.getLabel() + "\t" + r.getScore());
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (IOException e) {
        log.error("IOException", e);
    } catch (InvalidDatastoreException e) {
        log.error("InvalidDataStoreException", e);
    }
}

From source file:com.tamingtext.qa.QuestionQParser.java

License:Apache License

@Override
public Query parse() throws ParseException {

    // Parse the question using the TreebankParser. The resulting Parse object
    // can then be utilized by the classifier to determine the answer type.
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    String field = params.get(QUERY_FIELD);
    SchemaField sp = req.getSchema().getFieldOrNull(field);
    if (sp == null) {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
    }
    List<SpanQuery> sql = new ArrayList<SpanQuery>();
    if (mt != null) { // add the answer type to the query
        String[] parts = mt.split("\\|");
        if (parts.length == 1) {
            sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
        } else {
            for (int pi = 0; pi < parts.length; pi++) {
                sql.add(new SpanTermQuery(new Term(field, parts[pi])));
            }
        }
    }
    try {
        Analyzer analyzer = sp.getType().getQueryAnalyzer();
        TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
        ts.reset(); // required before the first incrementToken() call
        while (ts.incrementToken()) { // add the original query terms to the query
            String term = ts.getAttribute(CharTermAttribute.class).toString();
            sql.add(new SpanTermQuery(new Term(field, term)));
        }
        ts.close();
    } catch (IOException e) {
        throw new ParseException(e.getLocalizedMessage());
    }
    // Query the index looking for all of the parts near each other.
    return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]), params.getInt(QAParams.SLOP, 10), true);
}

From source file:com.test.IKAnalyzerDemo.java

License:Apache License

public void test() throws IOException {
    String text = "java???"; // sample text to analyze
    // create the IK analyzer in smart-segmentation mode
    Analyzer anal = new IKAnalyzer(true);
    StringReader reader = new StringReader(text);
    // obtain the token stream for the text
    TokenStream ts = anal.tokenStream("", reader);
    ts.reset(); // required before the first incrementToken() call
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    // iterate over the tokens, printing each term
    while (ts.incrementToken()) {
        System.out.print(term.toString() + "|");
    }
    ts.close();
    System.out.println();
}

From source file:com.tuplejump.stargate.lucene.query.Condition.java

License:Apache License

protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file:com.twitter.common.text.util.TokenStreamSerializer.java

License:Apache License

/**
 * Same as above, but serializes a Lucene TokenStream.
 */
public final byte[] serialize(final org.apache.lucene.analysis.TokenStream tokenStream) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    AttributeOutputStream output = new AttributeOutputStream(baos);

    for (AttributeSerializer serializer : attributeSerializers) {
        serializer.initialize(tokenStream, CURRENT_VERSION);
    }

    int numTokens = 0;

    while (tokenStream.incrementToken()) {
        serializeAttributes(output);
        numTokens++;
    }

    output.flush();

    byte[] data = baos.toByteArray();
    baos.close();
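    // Second pass: prepend a header (version, serializer fingerprint, token count)
    // to the serialized attribute data.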
    baos = new ByteArrayOutputStream(8 + data.length);
    output = new AttributeOutputStream(baos);
    output.writeVInt(CURRENT_VERSION.ordinal());
    output.writeInt(attributeSerializersFingerprint);
    output.writeVInt(numTokens);
    output.write(data);
    output.flush();

    return baos.toByteArray();
}