Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.getAttribute, collected from open source projects.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed-in Attribute contained in this AttributeSource. The caller must pass in a Class<? extends Attribute> value.
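
Note that getAttribute throws an IllegalArgumentException if this AttributeSource does not contain the requested attribute; use hasAttribute to guard the call, or addAttribute to create the attribute when it is missing. Below is a minimal, self-contained sketch of the typical consumer pattern. It assumes Lucene 5+ (where StandardAnalyzer has a no-argument constructor); the field name and sample text are illustrative only.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "some sample text")) {
            // fetch the attribute once; the stream refills the same instance
            // on every call to incrementToken()
            final CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end(); // records end-of-stream state (e.g. final offset)
        }
    }
}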

Usage

From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java

License:Open Source License

public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode,
        final int[] expectedPos) throws Exception {
    final TokenStream t = a.tokenStream("", new StringReader(input));
    t.reset();

    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    NodeAttribute nodeAtt = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        nodeAtt = t.getAttribute(NodeAttribute.class);
    }

    PositionAttribute posAtt = null;
    if (expectedPos != null) {
        assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class));
        posAtt = t.getAttribute(PositionAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {

        assertTrue("token " + i + " exists", t.incrementToken());

        assertEquals("i=" + i, expectedImages[i], termAtt.toString());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }

        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }

        if (expectedNode != null) {
            assertEquals(expectedNode[i], nodeAtt.node());
        }

        if (expectedPos != null) {
            assertEquals(expectedPos[i], posAtt.position());
        }
    }

    assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken());
    t.end();
    t.close();
}
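
Note the structure of this test: attribute instances are fetched once before iteration and reused, since the stream updates the same instances on each incrementToken() call, and every optional attribute is guarded by hasAttribute so getAttribute cannot throw.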

From source file:com.sindicetech.siren.analysis.TestConciseJsonAnalyzer.java

License:Open Source License

@Test
public void testNumeric() throws Exception {
    _a.registerDatatype(XSDDatatype.XSD_LONG.toCharArray(), new LongNumericAnalyzer(64));
    final TokenStream t = _a.tokenStream("", new StringReader("{ \"a\" : 12 }"));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.reset();
    assertTrue(t.incrementToken());
    assertTrue(termAtt.toString().startsWith("a:"));
    t.end();
    t.close();
}

From source file:com.sindicetech.siren.analysis.TestMockSirenAnalyzer.java

License:Open Source License

@Test
public void testMockSirenAnalyzer() throws IOException {
    final MockSirenDocument doc = doc(token("aaa", node(1)), token("aaa", node(1, 0)), token("aaa", node(1)));
    final MockSirenAnalyzer analyzer = new MockSirenAnalyzer();
    final TokenStream ts = analyzer.tokenStream("", new MockSirenReader(doc));
    ts.reset();

    assertTrue(ts.incrementToken());
    assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString());
    assertEquals(node(1), ts.getAttribute(NodeAttribute.class).node());
    assertEquals(0, ts.getAttribute(PositionAttribute.class).position());

    assertTrue(ts.incrementToken());
    assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString());
    assertEquals(node(1), ts.getAttribute(NodeAttribute.class).node());
    assertEquals(1, ts.getAttribute(PositionAttribute.class).position());

    assertTrue(ts.incrementToken());
    assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString());
    assertEquals(node(1, 0), ts.getAttribute(NodeAttribute.class).node());
    assertEquals(0, ts.getAttribute(PositionAttribute.class).position());

    ts.end();
    ts.close();
}

From source file:com.sindicetech.siren.solr.analysis.BaseSirenStreamTestCase.java

License:Open Source License

public void assertTokenStreamContents(final TokenStream stream, final String[] expectedImages)
        throws Exception {
    assertTrue("has TermAttribute", stream.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    for (int i = 0; i < expectedImages.length; i++) {
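        // clearing attributes before each increment makes the test fail if the stream does not set them itself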
        stream.clearAttributes();
        assertTrue("token " + i + " does not exists", stream.incrementToken());

        assertEquals(expectedImages[i], termAtt.toString());
    }

    assertFalse("end of stream", stream.incrementToken());
    stream.end();
    stream.close();
}

From source file:com.stratio.cassandra.index.query.Condition.java

License:Apache License

protected String analyze(String field, String value, ColumnMapper<?> columnMapper) {
    TokenStream source = null;
    try {
        Analyzer analyzer = columnMapper.analyzer();
        source = analyzer.tokenStream(field, value);
        source.reset();

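        // Lucene 4.x idiom: getBytesRef() returns a reusable BytesRef that fillBytesRef() fills in for the current token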
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file:com.stratio.cassandra.lucene.schema.analysis.SnowballAnalyzerBuilderTest.java

License:Apache License

private List<String> analyze(String value, Analyzer analyzer) {
    List<String> result = new ArrayList<>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(null, value);
        stream.reset();
        while (stream.incrementToken()) {
            String analyzedValue = stream.getAttribute(CharTermAttribute.class).toString();
            result.add(analyzedValue);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return result;
}

From source file:com.tamingtext.classifier.bayes.BayesUpdateRequestProcessor.java

License:Apache License

public String[] tokenizeField(String fieldName, SolrInputField field) throws IOException {
    if (field == null)
        return new String[0];
    if (!(field.getValue() instanceof String))
        return new String[0];
    //<start id="mahout.bayes.tokenize"/>
    String input = (String) field.getValue();

    ArrayList<String> tokenList = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input));
    ts.reset();
    while (ts.incrementToken()) {
        tokenList.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.end();
    ts.close();
    String[] tokens = tokenList.toArray(new String[tokenList.size()]);
    //<end id="mahout.bayes.tokenize"/>
    return tokens;
}

From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java

License:Apache License

public static void main(String[] args) {
    log.info("Command-line arguments: " + Arrays.toString(args));

    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("Input file").withShortName("i").create();

    Option modelOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create())
            .withDescription("Model to use when classifying data").withShortName("m").create();

    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt)
            .create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        File inputFile = new File(cmdLine.getValue(inputOpt).toString());

        if (!inputFile.isFile()) {
            throw new IllegalArgumentException(inputFile + " does not exist or is not a file");
        }

        File modelDir = new File(cmdLine.getValue(modelOpt).toString());

        if (!modelDir.isDirectory()) {
            throw new IllegalArgumentException(modelDir + " does not exist or is not a directory");
        }

        BayesParameters p = new BayesParameters();
        p.set("basePath", modelDir.getCanonicalPath());
        Datastore ds = new InMemoryBayesDatastore(p);
        Algorithm a = new BayesAlgorithm();
        ClassifierContext ctx = new ClassifierContext(a, ds);
        ctx.initialize();

        //TODO: make the analyzer configurable
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream ts = analyzer.tokenStream(null,
                new InputStreamReader(new FileInputStream(inputFile), "UTF-8"));

        ArrayList<String> tokens = new ArrayList<String>(1000);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(ts.getAttribute(CharTermAttribute.class).toString());
        }
        String[] document = tokens.toArray(new String[tokens.size()]);

        ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5);

        for (ClassifierResult r : cr) {
            System.err.println(r.getLabel() + "\t" + r.getScore());
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    } catch (IOException e) {
        log.error("IOException", e);
    } catch (InvalidDatastoreException e) {
        log.error("InvalidDataStoreException", e);
    }
}

From source file:com.tamingtext.qa.QuestionQParser.java

License:Apache License

@Override
public Query parse() throws ParseException {

    //<start id="qqp.parse"/>
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];//<co id="qqp.parseLine"/>
    /*
    <calloutlist>
        <callout arearefs="qqp.parseLine"><para>Parse the question using the <classname>TreebankParser</classname>.  The resulting <classname>Parse</classname> object can then be utilized by the classifier to determine the Answer Type.</para></callout>
    </calloutlist>
    */
    //<end id="qqp.parse"/>
    //<start id="qqp.answerType"/>
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    //<end id="qqp.answerType"/>
    String field = params.get(QUERY_FIELD);
    SchemaField sp = req.getSchema().getFieldOrNull(field);
    if (sp == null) {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
    }
    //<start id="qqp.query"/>
    List<SpanQuery> sql = new ArrayList<SpanQuery>();
    if (mt != null) {//<co id="qqp.handleAT"/>
        String[] parts = mt.split("\\|");
        if (parts.length == 1) {
            sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
        } else {
            for (int pi = 0; pi < parts.length; pi++) {
                sql.add(new SpanTermQuery(new Term(field, parts[pi])));
            }
        }
    }
    try {
        Analyzer analyzer = sp.getType().getQueryAnalyzer();
        TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
        ts.reset();
        while (ts.incrementToken()) {//<co id="qqp.addTerms"/>
            String term = ts.getAttribute(CharTermAttribute.class).toString();
            sql.add(new SpanTermQuery(new Term(field, term)));
        }
    } catch (IOException e) {
        throw new ParseException(e.getLocalizedMessage());
    }
    return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]), params.getInt(QAParams.SLOP, 10), true);//<co id="qqp.spanNear"/>
    /*
    <calloutlist>
        <callout arearefs="qqp.handleAT"><para>Add the AnswerType to the query</para></callout>
        <callout arearefs="qqp.addTerms"><para>Add the original query terms to the query</para></callout>
        <callout arearefs="qqp.spanNear"><para>Query the index looking for all of the parts near each other</para></callout>
    </calloutlist>
    */
    //<end id="qqp.query"/>
}

From source file:com.test.IKAnalyzerDemo.java

License:Apache License

public void test() throws IOException {
    String text = "java???";
    // create the analyzer; true enables IK's smart segmentation mode
    Analyzer anal = new IKAnalyzer(true);
    StringReader reader = new StringReader(text);
    // obtain the token stream for the text
    TokenStream ts = anal.tokenStream("", reader);
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    // iterate over the tokens and print each term
    while (ts.incrementToken()) {
        System.out.print(term.toString() + "|");
    }
    ts.end();
    ts.close();
    reader.close();
    System.out.println();
}