// RunNGram.java :  » Natural-Language-Processing » MorphAdorner » de » spieleck » app » cngram » Java Open Source
//
// Java Open Source » Natural Language Processing » MorphAdorner
// MorphAdorner » de » spieleck » app » cngram » RunNGram.java
package de.spieleck.app.cngram ;

/*  Please see the license information in the header below. */

/*
 NGramJ - n-gram based text classification
 Copyright (C) 2001- Frank S. Nestel (frank at spieleck.de)

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU Lesser General Public License as published
 by the Free Software Foundation; either version 2.1 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU Lesser General Public License
 along with this program (lesser.txt); if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

import java.io.File ;
import java.io.FileInputStream ;
import java.io.FileReader ;
import java.io.BufferedInputStream ;
import java.io.BufferedReader ;
import java.io.Reader ;
import java.io.FileOutputStream ;
import java.io.InputStreamReader ;
import java.io.IOException ;
import java.io.PrintStream ;
import java.text.DecimalFormat ;

import edu.northwestern.at.utils.UnicodeReader;

/**
 * Command-line interface that runs an n-gram analysis over submitted text;
 * the results can be used for automatic language identification.
 *
 * Several commands may be given on one command line. Each command is parsed
 * together with its arguments (and an optional trailing encoding name) and is
 * executed before the next command is read. When no encoding is given,
 * "iso-8859-1" is used.
 *
 * @author Frank S. Nestel
 * @author $Author: nestefan $
 * @version $Revision: 2 $ $Date: 2006-03-27 23:00:21 +0200 (Mo, 27 Mrz 2006) $ $Author: nestefan $
 */
public class RunNGram
{
    /** Command code: build a profile from a text file and save it. */
    public static final int CREATE = 1 ;
    /** Command code: compare the profiles of two text files. */
    public static final int SIMILARITY = 2 ;
    /** Command code: compare a text file against a stored profile. */
    public static final int SCORE = 3 ;
    /** Command code: rank a text file's profile with a given metric. */
    public static final int LANG = 4 ;
    /** Command code: print information about the available profiles. */
    public static final int TEST = 5 ;
    /** Command code: rank a text file using the ranker. */
    public static final int LANG2 = 6 ;
    /** Command code: like {@link #LANG2}, but runs the ranking twice. */
    public static final int LANG2B = 7 ;
    /** Command code: rank every file named in a list file. */
    public static final int CHECK = 8 ;
    /** Command code: compare two stored profile files. */
    public static final int PROFILES = 9 ;

    /** Format used for scores in rank result output. */
    public final static DecimalFormat DF = new DecimalFormat( "0.000" ) ;
    /** Format used for differences and score ratios. */
    public final static DecimalFormat DFE = new DecimalFormat( "0.0E0" ) ;

    /** Encoding used when none is given on the command line or in a list file. */
    private static final String DEFAULT_ENCODING = "iso-8859-1" ;

    /**
     * Prints the command-line synopsis and terminates the JVM with exit
     * status 42. This method never returns normally.
     *
     * @param out stream the synopsis is written to
     */
    private static void usage( PrintStream out )
    {
        out.println( "Usage: RunNGram commandset" ) ;
        out.println(
            "          [-create profilename(out) textfile [encoding]]" ) ;
        out.println(
            "   or     [-similarity metricName textfile1 textfile2 [encoding]]" ) ;
        out.println(
            "   or     [-score metricName profile-name textfile [encoding]]" ) ;
        out.println( "   or     [-lang metricName textfile [encoding]]" ) ;
        out.println( "   or     [-test ]" ) ;
        out.println( "   or     [-lang2 textfile [encoding]]" ) ;
        out.println( "   or     [-lang2b textfile [encoding]]" ) ;
        out.println( "   or     [-check textlistFile]" ) ;
        out.println( "   or     [-profiles metricName profile1 profile2]" ) ;
        System.exit( 42 ) ;
    }

    /**
     * Parses the command line and executes each command in turn.
     *
     * Prints the usage text and exits when no arguments are given, when an
     * unknown option is met, or when an option lacks required arguments.
     *
     * @param args command-line arguments, see the usage text
     * @throws Exception if a metric class cannot be loaded or any file
     *         operation or analysis step fails
     */
    public static void main( String args[] )
        throws Exception
    {
        if ( args.length == 0 )
        {
            usage( System.out ) ;
        }

        for ( int i = 0 ; i < args.length ; i++ )
        {
            int command = 0 ;
            String profilename = "" ;
            String profilename2 = "" ;
            String textfile = "" ;
            String filename2 = "" ;
            String metricName = null ;
            NGramMetric metric = null ;
            String option = args[ i ] ;

            if ( "-c".equals( option ) || "-create".equals( option ) )
            {
                requireArgs( args , i , 2 ) ;
                command = CREATE ;
                profilename = args[ ++i ] ;
                textfile = args[ ++i ] ;
            }
            else if ( "-i".equals( option ) || "-similarity".equals( option ) )
            {
                requireArgs( args , i , 3 ) ;
                command = SIMILARITY ;
                metricName = args[ ++i ] ;
                metric = loadMetric( metricName ) ;
                textfile = args[ ++i ] ;
                filename2 = args[ ++i ] ;
            }
            else if ( "-s".equals( option ) || "-score".equals( option ) )
            {
                requireArgs( args , i , 3 ) ;
                command = SCORE ;
                metricName = args[ ++i ] ;
                metric = loadMetric( metricName ) ;
                profilename = args[ ++i ] ;
                textfile = args[ ++i ] ;
            }
            else if ( "-p".equals( option ) || "-profiles".equals( option ) )
            {
                requireArgs( args , i , 3 ) ;
                command = PROFILES ;
                metricName = args[ ++i ] ;
                metric = loadMetric( metricName ) ;
                profilename = args[ ++i ] ;
                profilename2 = args[ ++i ] ;
            }
            else if ( "-l".equals( option ) || "-lang".equals( option ) )
            {
                requireArgs( args , i , 2 ) ;
                command = LANG ;
                metricName = args[ ++i ] ;
                metric = loadMetric( metricName ) ;
                textfile = args[ ++i ] ;
            }
            else if ( "-l2".equals( option ) || "-lang2".equals( option ) )
            {
                requireArgs( args , i , 1 ) ;
                command = LANG2 ;
                textfile = args[ ++i ] ;
            }
            else if ( "-l2b".equals( option ) || "-lang2b".equals( option ) )
            {
                requireArgs( args , i , 1 ) ;
                command = LANG2B ;
                textfile = args[ ++i ] ;
            }
            else if ( "-x".equals( option ) || "-check".equals( option ) )
            {
                requireArgs( args , i , 1 ) ;
                command = CHECK ;
                textfile = args[ ++i ] ;
            }
            else if ( "-t".equals( option ) || "-test".equals( option ) )
            {
                command = TEST ;
            }
            else
            {
                usage( System.err ) ;
            }

            // An optional encoding may follow the command's arguments. Look
            // at the NEXT argument (not the one just consumed) so that a
            // following option such as "-lang2" starts a new command instead
            // of being swallowed as an encoding name. "-test" takes no
            // arguments at all, so nothing is consumed after it.
            String encoding = DEFAULT_ENCODING ;

            if ( command != TEST && i + 1 < args.length &&
                args[ i + 1 ].length() > 0 &&
                args[ i + 1 ].charAt( 0 ) != '-' )
            {
                encoding = args[ ++i ] ;
            }

            switch ( command )
            {
                case TEST :
                    NGramProfiles npi = new NGramProfiles() ;

                    npi.info() ;
                    break ;

                case LANG2 :
                case LANG2B :
                    runLang2( textfile , encoding , command == LANG2B ) ;
                    break ;

                case CHECK :
                    runCheck( textfile ) ;
                    break ;

                case PROFILES :
                    runProfiles( metric , profilename , profilename2 ) ;
                    break ;

                case CREATE :
                    runCreate( profilename , textfile , encoding ) ;
                    break ;

                case SIMILARITY :
                    runSimilarity( metric , textfile , filename2 , encoding ) ;
                    break ;

                case SCORE :
                    runScore( metric , profilename , textfile , encoding ) ;
                    break ;

                case LANG :
                    runLang( metric , metricName , textfile , encoding ) ;
                    break ;

                default :
                    break ;
            }
        }
    }

    /**
     * Ensures that at least count arguments follow the option at index;
     * otherwise prints the usage text to standard error and exits.
     */
    private static void requireArgs( String[] args , int index , int count )
    {
        if ( index + count >= args.length )
        {
            usage( System.err ) ;
        }
    }

    /** Instantiates the metric class with the given fully qualified name. */
    private static NGramMetric loadMetric( String metricName )
        throws Exception
    {
        return (NGramMetric)Class.forName( metricName ).newInstance() ;
    }

    /**
     * Feeds the contents of a text file to the ranker and closes the reader
     * afterwards, even when accounting fails.
     */
    private static void accountFile( NGramProfiles.Ranker ranker ,
        String textfile , String encoding )
        throws Exception
    {
        Reader reader = createReader( textfile , encoding ) ;

        try
        {
            ranker.account( reader ) ;
        }
        finally
        {
            reader.close() ;
        }
    }

    /**
     * Loads a stored profile from a file, closing the file afterwards.
     *
     * @param filename path of the profile file to read
     * @param name name given to the created profile object
     */
    private static NGramProfileImpl loadProfile( String filename , String name )
        throws Exception
    {
        FileInputStream fis = new FileInputStream( new File( filename ) ) ;

        try
        {
            NGramProfileImpl profile = new NGramProfileImpl( name ) ;

            profile.load( fis ) ;
            return profile ;
        }
        finally
        {
            fis.close() ;
        }
    }

    /**
     * Executes -lang2 / -lang2b: ranks the text file and prints the result
     * with the elapsed time; when repeat is set, resets the ranker and does
     * the same a second time.
     */
    private static void runLang2( String textfile , String encoding ,
        boolean repeat )
        throws Exception
    {
        long t1 = System.currentTimeMillis() ;
        NGramProfiles nps = new NGramProfiles() ;
        NGramProfiles.Ranker ranker = nps.getRanker() ;

        accountFile( ranker , textfile , encoding ) ;
        NGramProfiles.RankResult res = ranker.getRankResult() ;
        long t2 = System.currentTimeMillis() ;

        printRankResult( "speed" , res , t2 - t1 ) ;
        if ( repeat )
        {
            t1 = t2 ;
            ranker.reset() ;
            accountFile( ranker , textfile , encoding ) ;
            res = ranker.getRankResult() ;
            t2 = System.currentTimeMillis() ;
            printRankResult( "speed" , res , t2 - t1 ) ;
        }
    }

    /**
     * Executes -check: reads a list file with one "textfile;encoding" entry
     * per line and prints a rank result for every entry. Blank lines and
     * lines starting with '#' are skipped; an entry without an encoding
     * field falls back to the default encoding.
     */
    private static void runCheck( String listFile )
        throws Exception
    {
        NGramProfiles npi = new NGramProfiles() ;
        NGramProfiles.Ranker ranker = npi.getRanker() ;
        BufferedReader br =
            new BufferedReader( new FileReader( new File( listFile ) ) ) ;

        try
        {
            String line ;

            while ( ( line = br.readLine() ) != null )
            {
                line = line.trim() ;
                if ( line.length() == 0 || line.charAt( 0 ) == '#' )
                {
                    continue ;
                }

                // split() drops trailing empty fields, so the array may hold
                // fewer than two entries (or none for a line of only ';').
                String[] fields = line.split( ";" ) ;
                String file = fields.length > 0 ? fields[ 0 ].trim() : "" ;

                if ( file.length() == 0 )
                {
                    continue ;
                }
                String enc = fields.length > 1 ? fields[ 1 ].trim() : "" ;

                if ( enc.length() == 0 )
                {
                    enc = DEFAULT_ENCODING ;
                }

                long t1 = System.currentTimeMillis() ;

                ranker.reset() ;
                accountFile( ranker , file , enc ) ;
                long t2 = System.currentTimeMillis() ;
                NGramProfiles.RankResult res = ranker.getRankResult() ;

                printRankResult( file , res , t2 - t1 ) ;
            }
        }
        finally
        {
            br.close() ;
        }
    }

    /**
     * Executes -profiles: loads two stored profile files and prints the
     * difference the metric reports for them.
     */
    private static void runProfiles( NGramMetric metric , String profilename ,
        String profilename2 )
        throws Exception
    {
        NGramProfileImpl comp1 = loadProfile( profilename , profilename ) ;
        NGramProfileImpl comp2 = loadProfile( profilename2 , profilename2 ) ;

        System.out.println(
            "diff(" + profilename + ":" + profilename2 + ")=" +
            DFE.format( metric.diff( comp1 , comp2 ) ) ) ;
    }

    /**
     * Executes -create: builds a profile from the text file and saves it as
     * profilename plus the standard profile extension.
     */
    private static void runCreate( String profilename , String textfile ,
        String encoding )
        throws Exception
    {
        NGramProfileImpl newProf = create( textfile , encoding ) ;
        String fname =
            profilename + "." + NGramProfile.NGRAM_PROFILE_EXTENSION ;
        FileOutputStream fos = new FileOutputStream( new File( fname ) ) ;

        try
        {
            newProf.save( fos ) ;
        }
        finally
        {
            fos.close() ;
        }
        System.out.println( "new profile '" + fname + "' was created." ) ;
    }

    /**
     * Executes -similarity: builds profiles for both text files and prints
     * the difference the metric reports for them.
     */
    private static void runSimilarity( NGramMetric metric , String textfile ,
        String filename2 , String encoding )
        throws Exception
    {
        NGramProfileImpl newProf = create( textfile , encoding ) ;
        NGramProfile newProf2 = create( filename2 , encoding ) ;

        System.out.println(
            "Difference is " +
                DFE.format( metric.diff( newProf , newProf2 ) ) ) ;
    }

    /**
     * Executes -score: builds a profile for the text file and prints its
     * difference to the stored profile named profilename (read from
     * profilename plus the standard profile extension).
     */
    private static void runScore( NGramMetric metric , String profilename ,
        String textfile , String encoding )
        throws Exception
    {
        NGramProfileImpl newProf = create( textfile , encoding ) ;
        NGramProfileImpl compare =
            loadProfile(
                profilename + "." + NGramProfile.NGRAM_PROFILE_EXTENSION ,
                profilename ) ;

        System.out.println(
            "Score (" + profilename + ") is " +
            DFE.format( metric.diff( compare , newProf ) ) ) ;
    }

    /**
     * Executes -lang: builds a profile for the text file, ranks it with the
     * metric and prints the result. The label shows the metric's simple
     * class name followed by the profile creation time in milliseconds.
     */
    private static void runLang( NGramMetric metric , String metricName ,
        String textfile , String encoding )
        throws Exception
    {
        long t1 = System.currentTimeMillis() ;
        NGramProfileImpl newProf = create( textfile , encoding ) ;
        long t2 = System.currentTimeMillis() ;
        long dt1 = t2 - t1 ;

        NGramProfiles nps = new NGramProfiles() ;

        t1 = System.currentTimeMillis() ;
        NGramProfiles.RankResult res = nps.rank( metric , newProf ) ;

        t2 = System.currentTimeMillis() ;
        int ppos = metricName.lastIndexOf( "." ) ;

        printRankResult(
            metricName.substring( ppos + 1 ) + "(" + dt1 + ")" ,
            res , t2 - t1 ) ;
    }

    /**
     * Opens a text file as a character stream.
     *
     * The caller is responsible for closing the returned reader.
     *
     * @param textfile path of the file to read
     * @param encoding encoding name handed to the {@code UnicodeReader}
     * @return a reader over the buffered file contents
     * @throws IOException if the file cannot be opened
     */
    public static Reader createReader( String textfile , String encoding )
        throws IOException
    {
        return new UnicodeReader(
            new BufferedInputStream( new FileInputStream( textfile ) ) ,
            encoding ) ;
    }

    /**
     * Prints one line summarising a rank result to standard output.
     *
     * The line holds the name and score of the entries at indices 0, 1, 2
     * and -1, the ratios score(1)/score(0) and score(-1)/score(0), and the
     * elapsed time. NOTE(review): index -1 presumably addresses the last
     * entry, and the result is assumed to hold at least three entries;
     * confirm against RankResult.
     *
     * @param msg label printed at the start of the line
     * @param res rank result to summarise
     * @param dt elapsed time printed as "dt="
     */
    public static void printRankResult( String msg , NGramProfiles.RankResult res ,
        long dt )
    {
        System.out.println(
            msg + ": " + res.getName( 0 ) + ":" +
            DF.format( res.getScore( 0 ) ) + " " + res.getName( 1 ) + ":" +
            DF.format( res.getScore( 1 ) ) + " " + res.getName( 2 ) + ":" +
            DF.format( res.getScore( 2 ) ) + " .. " + res.getName( -1 ) + ":" +
            DF.format( res.getScore( -1 ) ) + " |" +
            DFE.format( res.getScore( 1 ) / res.getScore( 0 ) ) + " |" +
            DFE.format( res.getScore( -1 ) / res.getScore( 0 ) ) + " dt=" + dt ) ;
    }

    /**
     * Builds an n-gram profile from a text file.
     *
     * The file is closed before returning, also when profile creation fails.
     *
     * @param textfile path of the file to analyse; also passed to
     *        {@code createProfile} as its first argument
     * @param encoding encoding name passed to {@code createProfile}
     * @return the profile created from the file contents
     * @throws IOException if the file cannot be opened or read
     */
    public static NGramProfileImpl create( String textfile , String encoding )
        throws IOException
    {
        FileInputStream fis = new FileInputStream( new File( textfile ) ) ;

        try
        {
            return NGramProfileImpl.createProfile( textfile , fis , encoding ) ;
        }
        finally
        {
            fis.close() ;
        }
    }
}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.