Example usage for org.apache.lucene.analysis TokenStream end

Introduction

On this page you can find example usage of org.apache.lucene.analysis.TokenStream#end().

Prototype

public void end() throws IOException 

Document

This method is called by the consumer after the last token has been consumed, i.e. after #incrementToken() has returned false (using the new TokenStream API).
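
The examples below all follow the same consume cycle: reset(), a loop of incrementToken(), end(), then close(). As a minimal, self-contained sketch (assuming Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor and the tokenStream(String, String) overload is available; the field name and sample text are placeholders):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamEndSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources calls close(), which releases the stream's resources
        try (TokenStream ts = analyzer.tokenStream("body", "some text to tokenize")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) { // returns false once the last token has been consumed
                System.out.println(term.toString());
            }
            ts.end();                     // perform end-of-stream operations, e.g. set the final offset
        }
        analyzer.close();
    }
}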

Usage

From source file:com.tuplejump.stargate.lucene.query.Condition.java

License:Apache License

protected String analyze(String field, String value, Analyzer analyzer) {
    TokenStream source = null;
    try {
        source = analyzer.tokenStream(field, value);
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();

        if (!source.incrementToken()) {
            return null;
        }
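        // fillBytesRef() copies the current term's bytes into the BytesRef obtained above (pre-5.x TermToBytesRefAttribute API)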
        termAtt.fillBytesRef();
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + value);
        }
        source.end();
        return BytesRef.deepCopyOf(bytes).utf8ToString();
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + value, e);
    } finally {
        IOUtils.closeWhileHandlingException(source);
    }
}

From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
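    // Mahout's document-frequency map stores the total number of documents under key -1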
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // Close the reader I/O
    reader.close();
    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = termAtt.toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // signal end-of-stream, then release the TokenStream's resources
    ts.end();
    ts.close();
    // create vector wordId => weight using tfidf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }
    // The classifier produces one score per label; the label with the
    // highest score is the most likely class for the document.

    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }

    }
    System.out.println(" Class Labe: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();

}

From source file:com.weclay.ksearch2.BasicKoreanAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {
    // text to tokenize
    //final String text = "  ? ?";
    //String text = " ,?, ?";
    String text = " ??.   . DB ?  ? ?? , ? ? , , ?,  ? ... ? ?  ? ? ?.";

    BasicKoreanAnalyzer analyzer = new BasicKoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the term and offset attributes from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);

    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt + ": " + termAtt.length() + " (" + offsetAtt.startOffset() + ":"
                + offsetAtt.endOffset() + ")");
    }

    stream.end();
    stream.close();
}

From source file:com.wonders.xlab.healthcloud.IKAnalyzerDemo.java

License:Apache License

public static void main(String[] args) {
    // Build the IK analyzer: true enables smart segmentation, false keeps fine-grained segmentation
    Analyzer analyzer = new IKAnalyzer(false);

    // Obtain Lucene's TokenStream object
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield",
                new StringReader("??????"));
        // Get the offset attribute
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Get the term text attribute
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Get the token type attribute
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // Reset the TokenStream (resets the underlying StringReader)
        ts.reset();
        // Iterate over the tokens
        while (ts.incrementToken()) {
            // Skip single-character tokens
            if (term.toString().length() == 1) {
                continue;
            }
            // Print each token's offsets, text, and type

            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // End the TokenStream (the StringReader is released by close() in the finally block)
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:com.zimbra.cs.index.query.ContactQuery.java

License:Open Source License

public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}

From source file:com.zimbra.cs.index.query.TextQuery.java

License:Open Source License

TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;

    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}

From source file:com.zimbra.cs.index.ZimbraAnalyzer.java

License:Open Source License

public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();

    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace(); // print rather than silently swallow the exception
    }

    return toReturn.toString();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License

/**
 * We intentionally disable the positionIncrement because we want phrases to match across removed stop words.
 *
 * @see PositionIncrementAttribute
 */
@Test
public void positionIncrement() throws Exception {
    TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
            new StringReader("It's a test."));
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
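    // NOTE: no reset() before iteration; pre-4.x TokenStreams tolerated this, but Lucene 4+ requires reset() before incrementToken()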
    while (stream.incrementToken()) {
        Assert.assertEquals(1, posIncrAtt.getPositionIncrement());
    }
    stream.end();
    stream.close();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License

public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> result = new ArrayList<String>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(termAttr.toString());
    }
    stream.end();
    return result;
}

From source file:de.blizzy.documentr.search.PageFinder.java

License:Open Source License

private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
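        // addAttribute() registers both attributes before reset(), so the getAttribute() calls inside the loop are guaranteed to succeed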
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}