List of usage examples for org.apache.lucene.analysis.TokenStream#incrementToken
public abstract boolean incrementToken() throws IOException;
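Every implementation is consumed the same way: call reset() once, loop on incrementToken() until it returns false, then call end() and close(). A minimal sketch of that pattern (the analyzer, field name, and sample text are placeholders):

TokenStream stream = analyzer.tokenStream("field", new StringReader("some text"));
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // register attributes before reset()
stream.reset();                   // mandatory before the first incrementToken() in Lucene 4+
while (stream.incrementToken()) { // returns false once the stream is exhausted
    System.out.println(term.toString());
}
stream.end();   // records end-of-stream state such as the final offset
stream.close(); // releases resources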
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
    }
    stream.close();
    System.out.println();
}
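Each printed line groups the tokens at a single position: a token whose position increment is 0 (a stacked synonym, for example) prints on the same line as the token before it, and each bracketed entry shows the term text, its start and end character offsets, and its token type.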
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    stream.reset();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.toString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}
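A hypothetical use of this helper (the WhitespaceAnalyzer and Version constant are illustrative, not from the source):

// Whitespace tokenization should split only on spaces, preserving case.
assertAnalyzesTo(new WhitespaceAnalyzer(Version.LUCENE_47), "The quick brown fox",
        new String[] { "The", "quick", "brown", "fox" });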
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    stream.reset();
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    stream.close();
}
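When the analyzer's stop filter preserves position increments, a removed stopword shows up as an increment greater than 1 rather than as a token of its own; that gap is the positional "hole" phrase queries see.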
From source file:com.sxc.lucene.analysis.synonym.SynonymAnalyzerTest.java
License:Apache License
public void testJumps() throws Exception {
    TokenStream stream = synonymAnalyzer.tokenStream("contents", new StringReader("jumps"));
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    int i = 0;
    String[] expected = new String[] { "jumps", "hops", "leaps" };
    while (stream.incrementToken()) {
        assertEquals(expected[i], term.toString());
        int expectedPos;
        if (i == 0) {
            expectedPos = 1;
        } else {
            expectedPos = 0;
        }
        assertEquals(expectedPos, posIncr.getPositionIncrement());
        i++;
    }
    stream.close();
    assertEquals(3, i);
}
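The assertions capture how synonym injection works: the original token "jumps" advances the position by 1, while the injected "hops" and "leaps" carry a position increment of 0 and stack at the same position, so a phrase query can match through any of the three.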
From source file:com.tamingtext.classifier.bayes.BayesUpdateRequestProcessor.java
License:Apache License
public String[] tokenizeField(String fieldName, SolrInputField field) throws IOException {
    if (field == null)
        return new String[0];
    if (!(field.getValue() instanceof String))
        return new String[0];
    String input = (String) field.getValue();
    ArrayList<String> tokenList = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    ts.reset(); // required before the first incrementToken() call in Lucene 4+
    while (ts.incrementToken()) {
        tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.close();
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    return tokens;
}
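The field value is flattened to a plain String[] because Mahout's Bayes classifier consumes a document as a raw token array rather than as Lucene fields.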
From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java
License:Apache License
public static void main(String[] args) {
    log.info("Command-line arguments: " + Arrays.toString(args));
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("Input file").withShortName("i").create();
    Option modelOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create())
            .withDescription("Model to use when classifying data").withShortName("m").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }
        File inputFile = new File(cmdLine.getValue(inputOpt).toString());
        if (!inputFile.isFile()) {
            throw new IllegalArgumentException(inputFile + " does not exist or is not a file");
        }
        File modelDir = new File(cmdLine.getValue(modelOpt).toString());
        if (!modelDir.isDirectory()) {
            throw new IllegalArgumentException(modelDir + " does not exist or is not a directory");
        }
        BayesParameters p = new BayesParameters();
        p.set("basePath", modelDir.getCanonicalPath());
        Datastore ds = new InMemoryBayesDatastore(p);
        Algorithm a = new BayesAlgorithm();
        ClassifierContext ctx = new ClassifierContext(a, ds);
        ctx.initialize();
        // TODO: make the analyzer configurable
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream ts = analyzer.tokenStream(null, new InputStreamReader(new FileInputStream(inputFile), "UTF-8"));
        ts.reset(); // optional in Lucene 3.6, mandatory from Lucene 4 on
        ArrayList<String> tokens = new ArrayList<String>(1000);
        while (ts.incrementToken()) {
            tokens.add(ts.getAttribute(CharTermAttribute.class).toString());
        }
        String[] document = tokens.toArray(new String[tokens.size()]);
        ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5);
        for (ClassifierResult r : cr) {
            System.err.println(r.getLabel() + "\t" + r.getScore());
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (IOException e) {
        log.error("IOException", e);
    } catch (InvalidDatastoreException e) {
        log.error("InvalidDataStoreException", e);
    }
}
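The token array produced by the analyzer becomes the document handed to Mahout's ClassifierContext, which returns the five best-scoring labels for printing.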
From source file:com.tamingtext.qa.QuestionQParser.java
License:Apache License
@Override
public Query parse() throws ParseException {
    // Parse the question with the TreebankParser; the resulting Parse object
    // is what the classifier uses to determine the answer type.
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    String field = params.get(QUERY_FIELD);
    SchemaField sp = req.getSchema().getFieldOrNull(field);
    if (sp == null) {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
    }
    List<SpanQuery> sql = new ArrayList<SpanQuery>();
    if (mt != null) { // add the answer type to the query
        String[] parts = mt.split("\\|");
        if (parts.length == 1) {
            sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
        } else {
            for (int pi = 0; pi < parts.length; pi++) {
                sql.add(new SpanTermQuery(new Term(field, parts[pi])));
            }
        }
    }
    try {
        Analyzer analyzer = sp.getType().getQueryAnalyzer();
        TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
        while (ts.incrementToken()) { // add the original query terms to the query
            String term = ((CharTermAttribute) ts.getAttribute(CharTermAttribute.class)).toString();
            sql.add(new SpanTermQuery(new Term(field, term)));
        }
    } catch (IOException e) {
        throw new ParseException(e.getLocalizedMessage());
    }
    // query the index looking for all of the parts near each other
    return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]), params.getInt(QAParams.SLOP, 10), true);
}
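The resulting SpanNearQuery requires the answer-type terms and the question's own analyzed terms to occur within SLOP positions of each other (10 by default), in order.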
From source file:com.test.IKAnalyzerDemo.java
License:Apache License
public void test() throws IOException {
    String text = "java???"; // sample text to segment
    // build an IK analyzer in smart (coarse-grained) segmentation mode
    Analyzer anal = new IKAnalyzer(true);
    StringReader reader = new StringReader(text);
    // obtain the token stream for the sample text
    TokenStream ts = anal.tokenStream("", reader);
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() in Lucene 4+
    // iterate over the tokens and print each term, separated by "|"
    while (ts.incrementToken()) {
        System.out.print(term.toString() + "|");
    }
    reader.close();
    System.out.println();
}
From source file:com.tuplejump.stargate.lucene.query.Condition.java
License:Apache License
protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}
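A hypothetical call (KeywordAnalyzer is illustrative, not from the source): KeywordAnalyzer emits the entire input as one token, so the helper returns the value intact, while a tokenizing analyzer trips the too-many-terms check.

// KeywordAnalyzer yields exactly one token, so this returns "Foo Bar".
String normalized = analyze("name", "Foo Bar", new KeywordAnalyzer());
// With a tokenizing analyzer such as StandardAnalyzer the same call would
// throw IllegalArgumentException, because "Foo Bar" analyzes to two tokens.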
From source file:com.twitter.common.text.util.TokenStreamSerializer.java
License:Apache License
/**
 * Same as above, but serializes a Lucene TokenStream.
 */
public final byte[] serialize(final org.apache.lucene.analysis.TokenStream tokenStream) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    AttributeOutputStream output = new AttributeOutputStream(baos);
    for (AttributeSerializer serializer : attributeSerializers) {
        serializer.initialize(tokenStream, CURRENT_VERSION);
    }
    int numTokens = 0;
    while (tokenStream.incrementToken()) {
        serializeAttributes(output);
        numTokens++;
    }
    output.flush();
    byte[] data = baos.toByteArray();
    baos.close();
    baos = new ByteArrayOutputStream(8 + data.length);
    output = new AttributeOutputStream(baos);
    output.writeVInt(CURRENT_VERSION.ordinal());
    output.writeInt(attributeSerializersFingerprint);
    output.writeVInt(numTokens);
    output.write(data);
    output.flush();
    return baos.toByteArray();
}
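Note the two-pass buffering: the token attributes are serialized into a temporary buffer first because numTokens is only known once the stream is exhausted; the header (format version, serializer fingerprint, token count) is then written ahead of the buffered data so a deserializer can size its read loop up front.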