org.apache.mahout.classifier.BayesFileFormatter.java Source code

Introduction

Here is the source code for org.apache.mahout.classifier.BayesFileFormatter.java.
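
The class flattens plain-text documents into the input format expected by Mahout's Bayes M/R job: one document per line, made up of the label, a tab, and the space-separated terms produced by a Lucene Analyzer. An illustrative output line, with a made-up label and terms:

rec.motorcycles<TAB>engine oil ride helmet

(<TAB> stands for a literal tab character.)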

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.List;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Flattens a file into the format expected by the Bayes M/R job.
 * <p/>
 * One document per line: the first token is the label, followed by a tab; the rest of the line holds the terms.
 */
public final class BayesFileFormatter {

    private static final Logger log = LoggerFactory.getLogger(BayesFileFormatter.class);

    private BayesFileFormatter() {
    }

    /**
     * Collapse all the files in inputDir into a single file in the proper Bayes format, one document per line
     * 
     * @param label
     *          The label
     * @param analyzer
     *          The analyzer to use
     * @param inputDir
     *          The input Directory
     * @param charset
     *          The charset of the input files
     * @param outputFile
     *          The file to collapse to
     */
    public static void collapse(String label, Analyzer analyzer, File inputDir, Charset charset, File outputFile)
            throws IOException {
        Writer writer = Files.newWriter(outputFile, charset);
        try {
            inputDir.listFiles(new FileProcessor(label, analyzer, charset, writer));
            // listFiles() is used here as a visitor: FileProcessor.accept() writes
            // each file as it is examined, so the returned list is never needed
        } finally {
            Closeables.closeQuietly(writer);
        }
    }

    /**
     * Write the input files to the outdir, one output file per input file
     * 
     * @param label
     *          The label of the file
     * @param analyzer
     *          The analyzer to use
     * @param input
     *          The input file or directory. May not be null
     * @param charset
     *          The Character set of the input files
     * @param outDir
     *          The output directory. Files will be written there with the same name as the input file
     */
    public static void format(String label, Analyzer analyzer, File input, Charset charset, File outDir)
            throws IOException {
        if (input.isDirectory()) {
            input.listFiles(new FileProcessor(label, analyzer, charset, outDir));
        } else {
            Writer writer = Files.newWriter(new File(outDir, input.getName()), charset);
            try {
                writeFile(label, analyzer, input, charset, writer);
            } finally {
                Closeables.closeQuietly(writer);
            }
        }
    }

    /**
     * Uses the FileFilter callback as a visitor, so large directories are processed
     * file-by-file as they are listed instead of being collected and looped over twice.
     */
    private static final class FileProcessor implements FileFilter {
        private final String label;

        private final Analyzer analyzer;

        private File outputDir;

        private final Charset charset;

        private Writer writer;

        /**
         * Use this when you want to collapse all files to a single file
         * 
         * @param label
         *          The label
         * @param writer
         *          must not be null and will not be closed
         */
        private FileProcessor(String label, Analyzer analyzer, Charset charset, Writer writer) {
            this.label = label;
            this.analyzer = analyzer;
            this.charset = charset;
            this.writer = writer;
        }

        /**
         * Use this when you want a writer per file
         * 
         * @param outputDir
         *          must not be null.
         */
        private FileProcessor(String label, Analyzer analyzer, Charset charset, File outputDir) {
            this.label = label;
            this.analyzer = analyzer;
            this.charset = charset;
            this.outputDir = outputDir;
        }

        @Override
        public boolean accept(File file) {
            if (file.isFile()) {
                Writer theWriter = null;
                try {
                    if (writer == null) {
                        theWriter = Files.newWriter(new File(outputDir, file.getName()), charset);
                    } else {
                        theWriter = writer;
                    }
                    writeFile(label, analyzer, file, charset, theWriter);
                    if (writer != null) {
                        // just write a new line
                        theWriter.write('\n');
                    }
                } catch (IOException e) {
                    // TODO: report failed files instead of throwing exception
                    throw new IllegalStateException(e);
                } finally {
                    if (writer == null) {
                        Closeables.closeQuietly(theWriter);
                    }
                }
            } else {
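                // a directory: recurse by passing this same filter to listFiles();
                // the work happens inside accept(), so the returned list is ignored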
                file.listFiles(this);
            }
            return false;
        }
    }

    /**
     * Write the tokens and the label from the Reader to the writer
     * 
     * @param label
     *          The label
     * @param analyzer
     *          The analyzer to use
     * @param inFile
     *          the file to read and whose contents are passed to the analyzer
     * @param charset
     *          character encoding to assume when reading the input file
     * @param writer
     *          The Writer, is not closed by this method
     * @throws java.io.IOException
     *           if there was a problem with the reader
     */
    private static void writeFile(String label, Analyzer analyzer, File inFile, Charset charset, Writer writer)
            throws IOException {
        Reader reader = Files.newReader(inFile, charset);
        try {
            TokenStream ts = analyzer.reusableTokenStream(label, reader);
            writer.write(label);
            writer.write('\t'); // tab separator, to match Hadoop's standard TextInputFormat
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
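            // write each analyzed term to the output, separated by single spaces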
            while (ts.incrementToken()) {
                char[] termBuffer = termAtt.buffer();
                int termLen = termAtt.length();
                writer.write(termBuffer, 0, termLen);
                writer.write(' ');
            }
        } finally {
            Closeables.closeQuietly(reader);
        }
    }

    /**
     * Convert a Reader to an array of tokens
     * 
     * @param analyzer
     *          The Analyzer to use
     * @param reader
     *          The reader to feed to the Analyzer
     * @return An array of the tokens produced by the analyzer, in order; duplicates are not removed
     */
    public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
        TokenStream ts = analyzer.reusableTokenStream("", reader);

        List<String> coll = Lists.newArrayList();
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            char[] termBuffer = termAtt.buffer();
            int termLen = termAtt.length();
            String val = new String(termBuffer, 0, termLen);
            coll.add(val);
        }
        return coll.toArray(new String[coll.size()]);
    }

    /**
     * Run the FileFormatter
     * 
     * @param args
     *          The input args. Run with -h to see the help
     * @throws ClassNotFoundException
     *           if the Analyzer can't be found
     * @throws IllegalAccessException
     *           if the Analyzer can't be constructed
     * @throws InstantiationException
     *           if the Analyzer can't be constructed
     * @throws IOException
     *           if the files can't be dealt with properly
     */
    public static void main(String[] args) throws Exception {
        DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
        ArgumentBuilder abuilder = new ArgumentBuilder();
        GroupBuilder gbuilder = new GroupBuilder();

        Option inputOpt = DefaultOptionCreator.inputOption().create();

        Option outputOpt = DefaultOptionCreator.outputOption().create();

        Option labelOpt = obuilder.withLongName("label").withRequired(true)
                .withArgument(abuilder.withName("label").withMinimum(1).withMaximum(1).create())
                .withDescription("The label of the file").withShortName("l").create();

        Option analyzerOpt = obuilder.withLongName("analyzer")
                .withArgument(abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
                .withDescription("The fully qualified class name of the analyzer to use. "
                        + "Must have a no-arg constructor.  Default is the StandardAnalyzer")
                .withShortName("a").create();

        Option charsetOpt = obuilder.withLongName("charset")
                .withArgument(abuilder.withName("charset").withMinimum(1).withMaximum(1).create())
                .withDescription("The character encoding of the input file").withShortName("c").create();

        Option collapseOpt = obuilder.withLongName("collapse").withRequired(true)
                .withArgument(abuilder.withName("collapse").withMinimum(1).withMaximum(1).create())
                .withDescription("Collapse a whole directory to a single file, one doc per line").withShortName("p")
                .create();

        Option helpOpt = DefaultOptionCreator.helpOption();
        Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(labelOpt)
                .withOption(analyzerOpt).withOption(charsetOpt).withOption(collapseOpt).withOption(helpOpt)
                .create();
        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            CommandLine cmdLine = parser.parse(args);

            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return;
            }
            File input = new File((String) cmdLine.getValue(inputOpt));
            File output = new File((String) cmdLine.getValue(outputOpt));
            String label = (String) cmdLine.getValue(labelOpt);
            Analyzer analyzer;
            if (cmdLine.hasOption(analyzerOpt)) {
                analyzer = ClassUtils.instantiateAs((String) cmdLine.getValue(analyzerOpt), Analyzer.class);
            } else {
                analyzer = new StandardAnalyzer(Version.LUCENE_31);
            }
            Charset charset = Charsets.UTF_8;
            if (cmdLine.hasOption(charsetOpt)) {
                charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
            }
            // collapseOpt is required and takes a true/false argument, so test its value;
            // hasOption() would always be true for a required option
            boolean collapse = Boolean.parseBoolean((String) cmdLine.getValue(collapseOpt));

            if (collapse) {
                collapse(label, analyzer, input, charset, output);
            } else {
                format(label, analyzer, input, charset, output);
            }

        } catch (OptionException e) {
            log.error("Exception", e);
        }
    }
}
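
Usage

A minimal sketch of driving the formatter from Java, assuming Lucene 3.x and Guava on the classpath; the label and paths are made up for illustration:

import java.io.File;

import com.google.common.base.Charsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.mahout.classifier.BayesFileFormatter;

public class BayesFileFormatterExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
        // collapse every file under /tmp/docs/spam into a single file,
        // one "label<TAB>terms" document per line
        BayesFileFormatter.collapse("spam", analyzer,
                new File("/tmp/docs/spam"), Charsets.UTF_8,
                new File("/tmp/formatted/spam.txt"));
        // or: one formatted output file per input file
        BayesFileFormatter.format("spam", analyzer,
                new File("/tmp/docs/spam"), Charsets.UTF_8,
                new File("/tmp/formatted"));
    }
}

The same can be done from the command line via main() above. The option names --label, --analyzer, --charset, and --collapse come straight from the code; --input and --output are the Mahout-wide defaults produced by DefaultOptionCreator, so treat the exact flags as an assumption and run with -h to confirm:

java -cp <mahout-and-deps> org.apache.mahout.classifier.BayesFileFormatter \
    --input /tmp/docs/spam \
    --output /tmp/formatted/spam.txt \
    --label spam \
    --collapse true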