cz.muni.fi.japanesejmdictsaxparser.saxholder.SaxKanjidic2Holder.java Source code

Java tutorial

Introduction

Here is the source code for cz.muni.fi.japanesejmdictsaxparser.saxholder.SaxKanjidic2Holder.java

Source

/**
 *     JapaneseDictionary - a JMDict browser for Android
 Copyright (C) 2013 Jaroslav Klech
     
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
     
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
    
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package cz.muni.fi.japanesejmdictsaxparser.saxholder;

import java.io.File;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that turns a KANJIDIC2 XML stream into Lucene documents.
 *
 * <p>One Lucene {@link Document} is built per {@code character} element and
 * added to the index when that element ends. Text content is buffered between
 * the start and the end of an element, because a SAX parser is allowed to
 * deliver the text of a single element in several {@code characters()} calls.
 *
 * <p>An instance is single-use: the index writer is closed at the end of the
 * document, or rolled back when parsing is cancelled.
 *
 * @author Jaroslav Klech
 *
 */
public class SaxKanjidic2Holder extends DefaultHandler {
    final static org.slf4j.Logger log = LoggerFactory.getLogger(SaxKanjidic2Holder.class);
    private static final String LOG_TAG = "SaxDataHolderKanjiDict";

    /** Upper estimate of the number of entries; used only to compute the progress percentage. */
    public static final int ENTRIES_COUNT = 13150; // currently 13108

    /** Identifies where the text of the element currently being parsed should be stored. */
    private enum TextTarget {
        NONE,
        LITERAL,
        RADICAL_CLASSIC,
        GRADE,
        STROKE_COUNT,
        DIC_REF,
        QUERY_CODE_SKIP,
        READING_JA_ON,
        READING_JA_KUN,
        MEANING_ENGLISH,
        MEANING_FRENCH,
        MEANING_DUTCH,
        MEANING_GERMAN,
        NANORI
    }

    // volatile: cancel() is presumably invoked from a thread other than the parsing one
    private volatile boolean mCanceled = false;
    private final IndexWriter mWriter;
    // guards against closing / rolling back the writer twice
    private boolean mWriterClosed = false;
    private Document mDoc;

    private int mCountDone = 0;
    private int mPerc = 0;
    private int mPercSave = 0;

    //parsing
    private TextTarget mTarget = TextTarget.NONE;
    private final StringBuilder mText = new StringBuilder();

    private JSONObject mValueDicRef;
    private String mDicRefKey;
    private JSONArray mValueRmGroupJaOn;
    private JSONArray mValueRmGroupJaKun;
    private JSONArray mValueMeaningEnglish;
    private JSONArray mValueMeaningFrench;
    /*
     *  dutch and german aren't in current kanjidict 2
     */
    private JSONArray mValueMeaningDutch;
    private JSONArray mValueMeaningGerman;

    private JSONArray mValueNanori;

    /**
     * If called with true SAXDataHolder will terminate
     * at the next element start.
     * 
     * @param cancel true if canceled
     */
    public void cancel(boolean cancel) {
        this.mCanceled = cancel;
    }

    /**
     * SaxDataHolderKanjiDict constructor. Opens a Lucene index writer on the
     * given folder.
     * 
     * @param androidOutputFolder folder of the lucene index the documents are saved to
     * @throws IOException if the index directory or the index writer cannot be opened
     * @throws IllegalArgumentException if {@code androidOutputFolder} is null
     */
    public SaxKanjidic2Holder(File androidOutputFolder) throws IOException, IllegalArgumentException {
        if (androidOutputFolder == null) {
            log.debug(LOG_TAG + "SaxDataHolderKanjiDict - dictionary directory is null");
            throw new IllegalArgumentException("SaxDataHolderKanjiDict: dictionary directory is null");
        }
        Directory dir = FSDirectory.open(androidOutputFolder);
        Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        mWriter = new IndexWriter(dir, config);
        // windows
        log.debug(LOG_TAG + "SaxDataHolderKanjiDict created");
    }

    @Override
    public void startDocument() throws SAXException {
        log.debug(LOG_TAG + "Start of document");
        super.startDocument();
    }

    /**
     * Starts a new document on {@code character} and, for the recognised child
     * elements, selects where the element text will be stored.
     *
     * @throws SAXException if parsing was cancelled via {@link #cancel(boolean)}
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if (mCanceled) {
            // The parser stops after this exception and endDocument() is never
            // called, so release the index write lock here.
            rollbackWriter();
            throw new SAXException("SAX terminated due to ParserService end.");
        }
        mTarget = TextTarget.NONE;
        mText.setLength(0);

        if ("character".equals(qName)) {
            mDoc = new Document();
            mValueDicRef = new JSONObject();
            mValueRmGroupJaOn = new JSONArray();
            mValueRmGroupJaKun = new JSONArray();
            mValueMeaningEnglish = new JSONArray();
            mValueMeaningFrench = new JSONArray();
            /*
             *  dutch and german aren't in current kanjidict 2
             */
            mValueMeaningDutch = new JSONArray();
            mValueMeaningGerman = new JSONArray();

            mValueNanori = new JSONArray();
        } else if (mDoc != null) {
            // only collect values while inside a <character> entry
            mTarget = targetFor(qName, attributes);
        }
        super.startElement(uri, localName, qName, attributes);
    }

    /**
     * Buffers the text of the current element; it is stored in
     * {@link #endElement(String, String, String)} once the text is complete.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (mTarget != TextTarget.NONE) {
            mText.append(ch, start, length);
        }
        super.characters(ch, start, length);
    }

    /**
     * Stores the buffered text of the element that just ended and, on the end
     * of {@code character}, writes the finished document to the index.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (mTarget != TextTarget.NONE) {
            storeText(mTarget, mText.toString());
            mTarget = TextTarget.NONE;
            mText.setLength(0);
        }
        if ("character".equals(qName) && mDoc != null) {
            finishCharacter();
        }

        super.endElement(uri, localName, qName);
    }

    @Override
    public void endDocument() {
        log.debug(LOG_TAG + "End of document");
        if (mWriterClosed) {
            return;
        }
        mWriterClosed = true;
        try {
            mWriter.close();
        } catch (IOException e) {
            log.error(LOG_TAG + "End of document - closinf lucene writer failed", e);
        }
    }

    /**
     * Maps an element (and its attributes) to the place where its text belongs.
     *
     * @param qName qualified element name
     * @param attributes attributes of the element
     * @return the target for the element text, or {@code NONE} if the element is not collected
     */
    private TextTarget targetFor(String qName, Attributes attributes) {
        switch (qName) {
        case "literal":
            return TextTarget.LITERAL;
        case "rad_value":
            return "classical".equals(attributes.getValue("rad_type"))
                    ? TextTarget.RADICAL_CLASSIC : TextTarget.NONE;
        case "grade":
            return TextTarget.GRADE;
        case "stroke_count":
            return TextTarget.STROKE_COUNT;
        case "dic_ref":
            mDicRefKey = attributes.getValue("dr_type");
            // without a key there is nothing the value could be stored under
            return mDicRefKey != null ? TextTarget.DIC_REF : TextTarget.NONE;
        case "q_code":
            return "skip".equals(attributes.getValue("qc_type"))
                    ? TextTarget.QUERY_CODE_SKIP : TextTarget.NONE;
        case "reading":
            return readingTarget(attributes.getValue("r_type"));
        case "meaning":
            return meaningTarget(attributes.getValue("m_lang"));
        case "nanori":
            return TextTarget.NANORI;
        default:
            return TextTarget.NONE;
        }
    }

    /**
     * Selects the target of a {@code reading} element; only the Japanese on
     * and kun readings are collected.
     *
     * @param type value of the {@code r_type} attribute, may be null
     */
    private TextTarget readingTarget(String type) {
        if ("ja_on".equals(type)) {
            return TextTarget.READING_JA_ON;
        }
        if ("ja_kun".equals(type)) {
            return TextTarget.READING_JA_KUN;
        }
        return TextTarget.NONE;
    }

    /**
     * Selects the target of a {@code meaning} element. A missing
     * {@code m_lang} means English. Meanings in languages that have no field
     * of their own are skipped instead of being mixed into the English list.
     *
     * @param lang value of the {@code m_lang} attribute, may be null
     */
    private TextTarget meaningTarget(String lang) {
        if (lang == null) {
            return TextTarget.MEANING_ENGLISH;
        }
        switch (lang) {
        case "en":
            return TextTarget.MEANING_ENGLISH;
        case "fr":
            return TextTarget.MEANING_FRENCH;
        case "du": // legacy code kept for compatibility
        case "nl":
            return TextTarget.MEANING_DUTCH;
        case "ge": // legacy code kept for compatibility
        case "de":
            return TextTarget.MEANING_GERMAN;
        default:
            return TextTarget.NONE;
        }
    }

    /**
     * Stores the complete text of one element into the current document or
     * into the JSON collection belonging to the target.
     *
     * @param target where the text belongs
     * @param text complete text content of the element
     */
    private void storeText(TextTarget target, String text) {
        if (text.length() == 0) {
            return;
        }
        switch (target) {
        case LITERAL:
            mDoc.add(new Field("literal", text, Field.Store.YES, Index.ANALYZED));
            break;
        case RADICAL_CLASSIC:
            addNumberField("radicalClassic", text);
            break;
        case GRADE:
            addNumberField("grade", text);
            break;
        case STROKE_COUNT:
            addNumberField("strokeCount", text);
            break;
        case DIC_REF:
            try {
                mValueDicRef.put(mDicRefKey, text);
            } catch (JSONException ex) {
                log.error(LOG_TAG + "Saving dictionary reference failed", ex);
            }
            break;
        case QUERY_CODE_SKIP:
            addStoredField("queryCodeSkip", text);
            break;
        case READING_JA_ON:
            mValueRmGroupJaOn.put(text);
            break;
        case READING_JA_KUN:
            mValueRmGroupJaKun.put(text);
            break;
        case NANORI:
            mValueNanori.put(text);
            break;
        case MEANING_ENGLISH:
            mValueMeaningEnglish.put(text);
            break;
        case MEANING_FRENCH:
            mValueMeaningFrench.put(text);
            break;
        case MEANING_DUTCH:
            mValueMeaningDutch.put(text);
            break;
        case MEANING_GERMAN:
            mValueMeaningGerman.put(text);
            break;
        default:
            break;
        }
    }

    /**
     * Adds the collected JSON values to the current document, writes the
     * document to the index and commits the index roughly every 5 % of
     * progress.
     */
    private void finishCharacter() {
        try {
            if (mValueDicRef.length() > 0) {
                addStoredField("dicRef", mValueDicRef.toString());
            }
            addJsonArrayField("rmGroupJaOn", mValueRmGroupJaOn);
            addJsonArrayField("rmGroupJaKun", mValueRmGroupJaKun);
            addJsonArrayField("meaningEnglish", mValueMeaningEnglish);
            addJsonArrayField("meaningFrench", mValueMeaningFrench);
            /*
             *  dutch and german aren't in current kanjidict 2
             */
            addJsonArrayField("meaningDutch", mValueMeaningDutch);
            addJsonArrayField("meaningGerman", mValueMeaningGerman);
            addJsonArrayField("nanori", mValueNanori);
        } catch (JSONException ex) {
            log.error("end tag entry JSON windows", ex);
        }
        try {
            mCountDone++;
            mWriter.addDocument(mDoc);
            int persPub = Math.round((((float) mCountDone / ENTRIES_COUNT) * 100));

            if (mPerc < persPub) {
                if (mPercSave + 4 < persPub) {
                    mWriter.commit();
                    log.debug(LOG_TAG + "SaxDataHolder progress saved - " + persPub + " %");
                    mPercSave = persPub;
                }
                mPerc = persPub;
            }
        } catch (CorruptIndexException e) {
            // best effort: a failed entry is reported, parsing continues
            log.error(LOG_TAG + "Saving doc - Adding document to lucene indexer failed: " + e.toString(), e);
        } catch (IOException e) {
            log.error(LOG_TAG + "Saving doc: Unknown exception: " + e.toString(), e);
        }
        mDoc = null;
    }

    /** Adds a stored, non-indexed field to the current document. */
    private void addStoredField(String name, String value) {
        mDoc.add(new Field(name, value, Field.Store.YES, Index.NO));
    }

    /** Adds a stored field only if the text is a non-zero integer. */
    private void addNumberField(String name, String text) {
        String value = tryParseNumber(text);
        if (value != null) {
            addStoredField(name, value);
        }
    }

    /** Adds the JSON array as a stored field unless the array is empty. */
    private void addJsonArrayField(String name, JSONArray values) {
        if (values.length() > 0) {
            addStoredField(name, values.toString());
        }
    }

    /**
     * Closes the index writer after a cancel, discarding changes made since
     * the last commit, so that the index write lock is released.
     */
    private void rollbackWriter() {
        if (mWriterClosed) {
            return;
        }
        mWriterClosed = true;
        try {
            mWriter.rollback();
        } catch (IOException e) {
            log.error(LOG_TAG + "Cancel - rolling back lucene writer failed", e);
        }
    }

    /**
     * Verifies whether the given string is a non-zero integer. Surrounding
     * whitespace is ignored.
     *
     * @param parse string to be parsed as number
     * @return canonical decimal form of the number, or null if the string is
     *         null, not an integer, or zero
     */
    private String tryParseNumber(String parse) {
        if (parse == null) {
            return null;
        }
        try {
            int number;
            number = Integer.parseInt(parse.trim());
            if (number != 0) {
                return String.valueOf(number);
            }
        } catch (NumberFormatException ex) {
            log.debug(LOG_TAG + "Parsing number - NumberFormatException: " + parse, ex);
        }
        return null;
    }
}