List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; value; the method returns the instance of the requested attribute contained in this stream, and throws IllegalArgumentException if the attribute is not present.
From source file:com.sindicetech.siren.analysis.NodeAnalyzerTestCase.java
License:Open Source License
public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages, final String[] expectedTypes, final int[] expectedPosIncrs, final IntsRef[] expectedNode, final int[] expectedPos) throws Exception { final TokenStream t = a.tokenStream("", new StringReader(input)); t.reset();//from w w w. java 2s. c o m assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = null; if (expectedTypes != null) { assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class)); typeAtt = t.getAttribute(TypeAttribute.class); } PositionIncrementAttribute posIncrAtt = null; if (expectedPosIncrs != null) { assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class)); posIncrAtt = t.getAttribute(PositionIncrementAttribute.class); } NodeAttribute nodeAtt = null; if (expectedNode != null) { assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class)); nodeAtt = t.getAttribute(NodeAttribute.class); } PositionAttribute posAtt = null; if (expectedPos != null) { assertTrue("has PositionAttribute", t.hasAttribute(PositionAttribute.class)); posAtt = t.getAttribute(PositionAttribute.class); } for (int i = 0; i < expectedImages.length; i++) { assertTrue("token " + i + " exists", t.incrementToken()); assertEquals("i=" + i, expectedImages[i], termAtt.toString()); if (expectedTypes != null) { assertEquals(expectedTypes[i], typeAtt.type()); } if (expectedPosIncrs != null) { assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement()); } if (expectedNode != null) { assertEquals(expectedNode[i], nodeAtt.node()); } if (expectedPos != null) { assertEquals(expectedPos[i], posAtt.position()); } } assertFalse("end of stream, received token " + termAtt.toString(), t.incrementToken()); t.end(); t.close(); }
From source file:com.sindicetech.siren.analysis.TestConciseJsonAnalyzer.java
License:Open Source License
@Test public void testNumeric() throws Exception { _a.registerDatatype(XSDDatatype.XSD_LONG.toCharArray(), new LongNumericAnalyzer(64)); final TokenStream t = _a.tokenStream("", new StringReader("{ \"a\" : 12 }")); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); t.reset();/* w w w . jav a2 s .com*/ assertTrue(t.incrementToken()); assertTrue(termAtt.toString().startsWith("a:")); t.end(); t.close(); }
From source file:com.sindicetech.siren.analysis.TestMockSirenAnalyzer.java
License:Open Source License
@Test public void testMockSirenAnalyzer() throws IOException { final MockSirenDocument doc = doc(token("aaa", node(1)), token("aaa", node(1, 0)), token("aaa", node(1))); final MockSirenAnalyzer analyzer = new MockSirenAnalyzer(); final TokenStream ts = analyzer.tokenStream("", new MockSirenReader(doc)); assertTrue(ts.incrementToken());/* www. j av a 2 s . c o m*/ assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString()); assertEquals(node(1), ts.getAttribute(NodeAttribute.class).node()); assertEquals(0, ts.getAttribute(PositionAttribute.class).position()); assertTrue(ts.incrementToken()); assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString()); assertEquals(node(1), ts.getAttribute(NodeAttribute.class).node()); assertEquals(1, ts.getAttribute(PositionAttribute.class).position()); assertTrue(ts.incrementToken()); assertEquals("aaa", ts.getAttribute(CharTermAttribute.class).toString()); assertEquals(node(1, 0), ts.getAttribute(NodeAttribute.class).node()); assertEquals(0, ts.getAttribute(PositionAttribute.class).position()); }
From source file:com.sindicetech.siren.solr.analysis.BaseSirenStreamTestCase.java
License:Open Source License
public void assertTokenStreamContents(final TokenStream stream, final String[] expectedImages) throws Exception { assertTrue("has TermAttribute", stream.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); stream.reset();//from ww w.j a v a2 s . c o m for (int i = 0; i < expectedImages.length; i++) { stream.clearAttributes(); assertTrue("token " + i + " does not exists", stream.incrementToken()); assertEquals(expectedImages[i], termAtt.toString()); } assertFalse("end of stream", stream.incrementToken()); stream.end(); stream.close(); }
From source file:com.stratio.cassandra.index.query.Condition.java
License:Apache License
protected String analyze(String field, String value, ColumnMapper<?> columnMapper) { TokenStream source = null; try {// w w w. ja va2 s . com Analyzer analyzer = columnMapper.analyzer(); source = analyzer.tokenStream(field, value); source.reset(); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); BytesRef bytes = termAtt.getBytesRef(); if (!source.incrementToken()) { return null; } termAtt.fillBytesRef(); if (source.incrementToken()) { throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value); } source.end(); return BytesRef.deepCopyOf(bytes).utf8ToString(); } catch (IOException e) { throw new RuntimeException("Error analyzing multiTerm term: " + value, e); } finally { IOUtils.closeWhileHandlingException(source); } }
From source file:com.stratio.cassandra.lucene.schema.analysis.SnowballAnalyzerBuilderTest.java
License:Apache License
private List<String> analyze(String value, Analyzer analyzer) { List<String> result = new ArrayList<>(); TokenStream stream = null; try {/* w w w . j a va2 s . c om*/ stream = analyzer.tokenStream(null, value); stream.reset(); while (stream.incrementToken()) { String analyzedValue = stream.getAttribute(CharTermAttribute.class).toString(); result.add(analyzedValue); } } catch (Exception e) { throw new RuntimeException(e); } finally { IOUtils.closeWhileHandlingException(stream); } return result; }
From source file:com.tamingtext.classifier.bayes.BayesUpdateRequestProcessor.java
License:Apache License
public String[] tokenizeField(String fieldName, SolrInputField field) throws IOException { if (field == null) return new String[0]; if (!(field.getValue() instanceof String)) return new String[0]; //<start id="mahout.bayes.tokenize"/> String input = (String) field.getValue(); ArrayList<String> tokenList = new ArrayList<String>(); TokenStream ts = analyzer.tokenStream(inputField, new StringReader(input)); while (ts.incrementToken()) { tokenList.add(ts.getAttribute(CharTermAttribute.class).toString()); }/* ww w . j a va 2 s. c om*/ String[] tokens = tokenList.toArray(new String[tokenList.size()]); //<end id="mahout.bayes.tokenize"/> return tokens; }
From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java
License:Apache License
public static void main(String[] args) { log.info("Command-line arguments: " + Arrays.toString(args)); DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder.withLongName("input").withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("Input file").withShortName("i").create(); Option modelOpt = obuilder.withLongName("model").withRequired(true) .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create()) .withDescription("Model to use when classifying data").withShortName("m").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create();//from ww w. ja va 2 s . c o m Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File inputFile = new File(cmdLine.getValue(inputOpt).toString()); if (!inputFile.isFile()) { throw new IllegalArgumentException(inputFile + " does not exist or is not a file"); } File modelDir = new File(cmdLine.getValue(modelOpt).toString()); if (!modelDir.isDirectory()) { throw new IllegalArgumentException(modelDir + " does not exist or is not a directory"); } BayesParameters p = new BayesParameters(); p.set("basePath", modelDir.getCanonicalPath()); Datastore ds = new InMemoryBayesDatastore(p); Algorithm a = new BayesAlgorithm(); ClassifierContext ctx = new ClassifierContext(a, ds); ctx.initialize(); //TODO: make the analyzer configurable StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); TokenStream ts = analyzer.tokenStream(null, new InputStreamReader(new FileInputStream(inputFile), "UTF-8")); ArrayList<String> tokens = new 
ArrayList<String>(1000); while (ts.incrementToken()) { tokens.add(ts.getAttribute(CharTermAttribute.class).toString()); } String[] document = tokens.toArray(new String[tokens.size()]); ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5); for (ClassifierResult r : cr) { System.err.println(r.getLabel() + "\t" + r.getScore()); } } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (IOException e) { log.error("IOException", e); } catch (InvalidDatastoreException e) { log.error("InvalidDataStoreException", e); } finally { } }
From source file:com.tamingtext.qa.QuestionQParser.java
License:Apache License
@Override public Query parse() throws ParseException { //<start id="qqp.parse"/> Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];//<co id="qqp.parseLine"/> /*/* ww w.j a v a2 s . co m*/ <calloutlist> <callout arearefs="qqp.parseLine"><para>Parse the question using the <classname>TreebankParser</classname>. The resulting <classname>Parse</classname> object can then be utilized by the classifier to determine the Answer Type.</para></callout> </calloutlist> */ //<end id="qqp.parse"/> //<start id="qqp.answerType"/> String type = atc.computeAnswerType(parse); String mt = atm.get(type); //<end id="qqp.answerType"/> String field = params.get(QUERY_FIELD); SchemaField sp = req.getSchema().getFieldOrNull(field); if (sp == null) { throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field); } //<start id="qqp.query"/> List<SpanQuery> sql = new ArrayList<SpanQuery>(); if (mt != null) {//<co id="qqp.handleAT"/> String[] parts = mt.split("\\|"); if (parts.length == 1) { sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase()))); } else { for (int pi = 0; pi < parts.length; pi++) { sql.add(new SpanTermQuery(new Term(field, parts[pi]))); } } } try { Analyzer analyzer = sp.getType().getQueryAnalyzer(); TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr)); while (ts.incrementToken()) {//<co id="qqp.addTerms"/> String term = ((CharTermAttribute) ts.getAttribute(CharTermAttribute.class)).toString(); sql.add(new SpanTermQuery(new Term(field, term))); } } catch (IOException e) { throw new ParseException(e.getLocalizedMessage()); } return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]), params.getInt(QAParams.SLOP, 10), true);//<co id="qqp.spanNear"/> /* <calloutlist> <callout arearefs="qqp.handleAT"><para>Add the AnswerType to the query</para></callout> <callout arearefs="qqp.addTerms"><para>Add the original query terms to the query</para></callout> <callout arearefs="qqp.spanNear"><para>Query the index looking for all of the parts 
near each other</para></callout> </calloutlist> */ //<end id="qqp.query"/> }
From source file:com.test.IKAnalyzerDemo.java
License:Apache License
public void test() throws IOException { String text = "java???"; //? /* w w w. j av a 2 s . c o m*/ Analyzer anal = new IKAnalyzer(true); StringReader reader = new StringReader(text); //? TokenStream ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); //???? while (ts.incrementToken()) { System.out.print(term.toString() + "|"); } reader.close(); System.out.println(); }