Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.TokenStream reset() method.

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using #incrementToken().
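As a minimal, hedged sketch of that contract: reset() must be called before the first incrementToken(), and end()/close() should follow consumption. The analyzer, field name, and sample text below are illustrative placeholders (this assumes a recent Lucene version where StandardAnalyzer has a no-argument constructor), not code taken from the examples that follow.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetWorkflowSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();   // any Analyzer works here
        try (TokenStream stream = analyzer.tokenStream("contents", new StringReader("some sample text"))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                           // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());  // consume one token per call
            }
            stream.end();                             // record end-of-stream state, e.g. the final offset
        }                                             // try-with-resources closes the stream
        analyzer.close();
    }
}

The usage examples below follow the same reset / incrementToken / end / close pattern with various analyzers and attributes.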

Usage

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    stream.reset();
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    stream.close();
}

From source file:com.sxc.lucene.analysis.synonym.SynonymAnalyzerTest.java

License:Apache License

public void testJumps() throws Exception {
    TokenStream stream = synonymAnalyzer.tokenStream("contents", // #A
            new StringReader("jumps")); // #A
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int i = 0;
    String[] expected = new String[] { "jumps", // #B
            "hops", // #B
            "leaps" }; // #B
    while (stream.incrementToken()) {
        assertEquals(expected[i], term.toString());

        int expectedPos; // #C
        if (i == 0) { // #C
            expectedPos = 1; // #C
        } else { // #C
            expectedPos = 0; // #C
        } // #C
        assertEquals(expectedPos, // #C
                posIncr.getPositionIncrement()); // #C
        i++;
    }
    stream.close();
    assertEquals(3, i);
}

From source file:com.tuplejump.stargate.lucene.query.Condition.java

License:Apache License

protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken()) {
            return null;
        }
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // Close the reader I/O
    reader.close();
    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = ts.getAttribute(CharTermAttribute.class).toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // Fixed error : close ts:TokenStream
    ts.end();
    ts.close();
    // create vector wordId => weight using tfidf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }
    // With the classifier, we get one score for each label
    // The label with the highest score is the one the email is more likely
    // to
    // be associated to

    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }

    }
    System.out.println(" Class Label: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();

}

From source file:com.weclay.ksearch2.BasicKoreanAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {
    // text to tokenize
    //final String text = "  ? ?";
    //String text = " ,?, ?";
    String text = " ??.   . DB ?  ? ?? , ? ? , , ?,  ? ... ? ?  ? ? ?.";

    BasicKoreanAnalyzer analyzer = new BasicKoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);

    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt + ": " + termAtt.length() + " (" + offsetAtt.startOffset() + ":"
                + offsetAtt.endOffset() + ")");
    }

    stream.end();
    stream.close();
}

From source file:com.wiseowl.WiseOwl.query.WiseOwlQParser.java

License:Apache License

@Override
public Query parse() throws SyntaxError {

    //<start id="qqp.parse"/>
    Parse parse = ParserTool.parseLine(qstr, parser, 1)[0];//<co id="qqp.parseLine"/>
    /*
    <calloutlist>
        <callout arearefs="qqp.parseLine"><para>Parse the question using the <classname>TreebankParser</classname>.  The resulting <classname>Parse</classname> object can then be utilized by the classifier to determine the Answer Type.</para></callout>
    </calloutlist>
    */
    //<end id="qqp.parse"/>
    //<start id="qqp.answerType"/>
    // String type = "P";
    String type = atc.computeAnswerType(parse);
    String mt = atm.get(type);
    if (mt.equals("DESCRIPTION")) {
        BooleanQuery bq;
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        //BooleanQuery bq=new BooleanQuery(false, 0);
        String field = "text";
        SchemaField sf = req.getSchema().getFieldOrNull(field);
        try {
            Analyzer analyzer = sf.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;
            while (ts.incrementToken()) {//<co id="qqp.addTerms"/>
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                //ts.reset();
                //log.warn("terms {} ",term);
                builder.add(new TermQuery(new Term(field, term)), BooleanClause.Occur.SHOULD);
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        bq = builder.build();
        return bq;
        //return new TermQuery(new Term("title", "she"));

    } else {
        //<end id="qqp.answerType"/>
        String field = "text";
        //params.get(QUERY_FIELD);
        //String field="text";
        SchemaField sp = req.getSchema().getFieldOrNull(field);
        if (sp == null) {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Undefined field: " + field);
        }
        //<start id="qqp.query"/>
        List<SpanQuery> sql = new ArrayList<SpanQuery>();
        if (mt != null) {//<co id="qqp.handleAT"/>
            String[] parts = mt.split("\\|");
            if (parts.length == 1) {
                sql.add(new SpanTermQuery(new Term(field, mt.toLowerCase())));
            } else {
                for (int pi = 0; pi < parts.length; pi++) {
                    sql.add(new SpanTermQuery(new Term(field, parts[pi].toLowerCase())));
                }
            }
        }
        log.warn("answer type mt : {} {} ", mt, type);
        FocusNoun fn = new FocusNoun();
        String fnn[] = null;
        try {
            fnn = fn.getFocusNoun(qstr);
        } catch (IOException e1) {
            // TODO Auto-generated catch block
            e1.printStackTrace();
        }
        try {
            Analyzer analyzer = sp.getType().getQueryAnalyzer();
            TokenStream ts = analyzer.tokenStream(field, new StringReader(qstr));
            ts.reset();
            CharTermAttribute tok = null;

            while (ts.incrementToken()) {//<co id="qqp.addTerms"/>
                tok = ts.getAttribute(CharTermAttribute.class);
                String term = tok.toString();
                log.warn("terms boosted {} ", term);
                if (fnn != null)
                    if (term.equals(fnn[0]) || term.equals(fnn[1])) {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 100f));
                    } else {
                        SpanQuery sq = new SpanTermQuery(new Term(field, term));
                        sql.add(new SpanBoostQuery(sq, 5f));
                    }

                // sql.add(new SpanTermQuery(new Term(field, term)));
            }
            ts.close();
        } catch (IOException e) {
            throw new SyntaxError(e.getLocalizedMessage());
        }
        return new SpanOrQuery(sql.toArray(new SpanQuery[sql.size()]));
        // return new SpanNearQuery(sql.toArray(new SpanQuery[sql.size()]), params.getInt(OWLParams.SLOP, 10), true);//<co id="qqp.spanNear"/>
        /*
        <calloutlist>
            <callout arearefs="qqp.handleAT"><para>Add the AnswerType to the query</para></callout>
            <callout arearefs="qqp.addTerms"><para>Add the original query terms to the query</para></callout>
            <callout arearefs="qqp.spanNear"><para>Query the index looking for all of the parts near each other</para></callout>
        </calloutlist>
        */
        //<end id="qqp.query"/>

    }
}

From source file:com.wonders.xlab.healthcloud.IKAnalyzerDemo.java

License:Apache License

public static void main(String[] args) {
    // Build the IK analyzer; the boolean flag toggles smart segmentation (false = fine-grained mode)
    Analyzer analyzer = new IKAnalyzer(false);

    // Obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield",
                new StringReader("??????"));
        // Offset attribute for each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (this also resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            // Skip single-character tokens
            if (term.toString().length() == 1) {
                continue;
            }
            // Print the token's offsets, text, and type

            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // End the TokenStream (this also ends the StringReader)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the TokenStream to release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:com.xiaomi.linden.lucene.analyzer.CommonMMSeg4jSegmenter.java

License:Apache License

@Override
public List<Term> parse(String content) throws Exception {
    List<Term> words = new ArrayList<>();
    if (content == null || content.isEmpty()) {
        return words;
    }

    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream("", content);
        stream.reset();
        if (stopWords != null) {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(new StopFilter(stream, stopWords));
            } else {
                stream = new StopFilter(stream, stopWords);
            }
        } else {
            if (cutLetterDigit) {
                stream = new CutLetterDigitFilter(stream);
            }
        }
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) {
            words.add(
                    new Term(termAttr.toString(), offsetAttribute.startOffset(), offsetAttribute.endOffset()));
        }
    } catch (IOException e) {
        throw new Exception(content + " extract words from phrase failed!", e);
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return words;
}

From source file:com.xiaomi.linden.lucene.analyzer.TestLindenWordDelimiterAnalyzer.java

License:Apache License

@Test
public void testLindenWordDelimiterAnalyzer() throws Exception {
    LindenWordDelimiterAnalyzerFactory wordDelimiterAnalyzerFactory = new LindenWordDelimiterAnalyzerFactory();
    Map<String, String> args = new HashMap<>();
    Map<String, String> lastargs = new HashMap<>();
    args.put("luceneMatchVersion", "LUCENE_4_10_0");
    lastargs.putAll(args);
    Analyzer analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    TokenStream stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. "
            + "" + "created2018by sls sun-li-shun SunLiShun"));
    String expected = "[hello][test][case][][][][][][][][][][][][created][2018][sls][sun][li][shun][sun][li][shun]";
    String out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.put("lower.case", "false");
    args.putAll(lastargs);
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. "
            + "" + "created2018by sls on 20140707"));
    expected = "[Hello][test][case][][][][][][][][][][][][created][2018][sls][20140707]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.put("set.stopwords", "false");
    args.putAll(lastargs);
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. "
            + "" + "created2018by sls on 20140707"));
    expected = "[Hello][this][is][a][test][case][][][][][][][][][][][][created][2018][by][sls][on][20140707]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);

    args.putAll(lastargs);
    args.put("splitOnCaseChange", "0");
    args.put("set.stopwords", "false");
    args.put("lower.case", "true");
    lastargs.putAll(args);
    analyzer = wordDelimiterAnalyzerFactory.getInstance(args);
    stream = analyzer.tokenStream("", new StringReader("Hello, this is a test case. "
            + "" + "created2018by sls sun-li-shun SunLiShun"));
    expected = "[hello][this][is][a][test][case][][][][][][][][][][][][created][2018][by][sls][sun][li][shun][sunlishun]";
    out = "";
    stream.reset();
    while (stream.incrementToken()) {
        out += "[" + stream.getAttribute(CharTermAttribute.class).toString() + "]";
    }
    Assert.assertEquals(expected, out);
}

From source file:com.xiaomi.linden.lucene.query.flexiblequery.FlexibleQuery.java

License:Apache License

private List<SegToken> parseToTokens(String content, float boost) throws IOException {
    List<SegToken> tokens = new ArrayList<>();
    TokenStream stream = analyzer.tokenStream("", new StringReader(content));
    try {
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(new SegToken(term.toString(), boost));
        }
    } finally {
        if (stream != null)
            stream.close();
    }
    return tokens;
}