lia.analysis.i18n.ChineseDemo.java Source code

Introduction

Here is the source code for lia.analysis.i18n.ChineseDemo.java
Source

package lia.analysis.i18n;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

import java.awt.Font;
import java.awt.FontMetrics;
import java.awt.Frame;
import java.awt.Label;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.IOException;
import java.io.StringReader;

import javax.swing.JLabel;

// From chapter 4
/**
 * Runs each sample string through several Lucene analyzers and shows the
 * tokens every analyzer produced, one window per (string, analyzer) pair.
 *
 * <p>Each token is rendered as {@code [term]} followed by a single space, so
 * the differences between the analyzers' segmentation are visible side by side.
 */
public class ChineseDemo {
    // Written with Unicode escapes so the sample survives any source-file
    // encoding. NOTE(review): this literal previously read "?", which looks
    // like the residue of a lossy charset conversion; the three CJK characters
    // restored here are believed to be the original sample -- confirm.
    private static final String[] strings = { "\u9053\u5fb7\u7d93" }; //A

    private static final Analyzer[] analyzers = { new SimpleAnalyzer(), new StandardAnalyzer(Version.LUCENE_30),
            new ChineseAnalyzer(), //B
            new CJKAnalyzer(Version.LUCENE_30), new SmartChineseAnalyzer(Version.LUCENE_30) };

    /**
     * Analyzes every sample string with every configured analyzer.
     *
     * @param args ignored
     * @throws Exception if reading tokens from an analyzer fails
     */
    public static void main(String[] args) throws Exception {
        for (String string : strings) {
            for (Analyzer analyzer : analyzers) {
                analyze(string, analyzer);
            }
        }
    }

    /**
     * Tokenizes {@code string} with {@code analyzer} and displays the result
     * in a new window titled with the analyzer's simple class name and the
     * input text.
     *
     * @param string   text to analyze
     * @param analyzer analyzer used to tokenize the text
     * @throws IOException if the token stream cannot be read
     */
    private static void analyze(String string, Analyzer analyzer) throws IOException {
        String output = tokenize(string, analyzer);
        display(analyzer.getClass().getSimpleName() + " : " + string, output);
    }

    /**
     * Collects the terms produced by {@code analyzer} for {@code string},
     * formatting each one as {@code [term]} followed by a space.
     *
     * @param string   text to analyze
     * @param analyzer analyzer used to tokenize the text
     * @return the bracketed terms concatenated in stream order; empty if the
     *         analyzer produced no tokens
     * @throws IOException if the token stream cannot be read
     */
    private static String tokenize(String string, Analyzer analyzer) throws IOException {
        StringBuilder buffer = new StringBuilder();

        TokenStream stream = analyzer.tokenStream("contents", new StringReader(string));
        try {
            TermAttribute term = stream.addAttribute(TermAttribute.class);

            while (stream.incrementToken()) { //C
                buffer.append('[').append(term.term()).append("] ");
            }
            stream.end();
        } finally {
            // Always release the stream, even if reading a token failed.
            stream.close();
        }

        return buffer.toString();
    }

    /**
     * Opens a resizable window showing {@code output} in a 36-point font.
     * The window is 75 pixels high and at least 250 pixels wide; wider text
     * gets its measured width plus 50 pixels of slack.
     *
     * @param title  window title
     * @param output text to show
     */
    private static void display(String title, String output) {
        final Frame f = new Frame();
        f.setTitle(title);
        f.setResizable(true);

        // A bare AWT Frame ignores its close button; dispose it explicitly so
        // each window can be dismissed.
        f.addWindowListener(new WindowAdapter() {
            @Override
            public void windowClosing(WindowEvent e) {
                f.dispose();
            }
        });

        Font font = new Font(null, Font.PLAIN, 36);
        int width = getWidth(f.getFontMetrics(font), output);

        f.setSize((width < 250) ? 250 : width + 50, 75);

        // NOTE: javax.swing.JLabel is used rather than java.awt.Label because
        // Label may not render the Chinese characters properly.
        JLabel label = new JLabel(output); //D
        label.setSize(width, 75);
        label.setFont(font);
        f.add(label);

        f.setVisible(true);
    }

    /**
     * Measures the total advance width of {@code s} in the given font metrics.
     *
     * <p>The whole string is measured in one call because the advance of a
     * string is not necessarily the sum of the advances of its individual
     * {@code char}s (for example, with surrogate pairs).
     *
     * @param metrics metrics of the font the text will be drawn in
     * @param s       text to measure
     * @return the width of {@code s} in pixels
     */
    private static int getWidth(FontMetrics metrics, String s) {
        return metrics.stringWidth(s);
    }
}

/*    
#A Analyze this text
#B Test these analyzers
#C Retrieve tokens
#D Display analysis
*/