com.cloudera.knittingboar.conf.cmdline.DataConverterCmdLineDriver.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.knittingboar.conf.cmdline.DataConverterCmdLineDriver.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.knittingboar.conf.cmdline;

import java.io.IOException;
import java.io.PrintWriter;

import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.util.HelpFormatter;

import com.cloudera.knittingboar.utils.DatasetConverter;

/**
 * Command-line driver that converts a dataset into sharded output files by
 * delegating to {@link DatasetConverter#ConvertNewsgroupsFromSingleFiles}.
 *
 * <p>Recognised options:
 * <ul>
 *   <li>{@code --input} (required): where to read the input data from</li>
 *   <li>{@code --output} (required): where to write the converted data</li>
 *   <li>{@code --recordsPerBlock} (default {@code 20000}): records per output shard</li>
 *   <li>{@code --datasetType} (default {@code 20Newsgroups}): accepted on the
 *       command line but not consulted by this driver; the newsgroups
 *       converter is always the one invoked</li>
 *   <li>{@code --help}: print the option list</li>
 * </ul>
 *
 * <p>The parsed arguments are held in static fields, so the class is not safe
 * for concurrent invocations.
 */
public class DataConverterCmdLineDriver {

    private static String strInputFile;
    private static String strOutputFile;
    private static int recordsPerBlock;

    /**
     * Program entry point; runs the conversion and reports status on standard output.
     *
     * @param args raw command-line arguments
     * @throws Exception if argument handling or the conversion fails
     */
    public static void main(String[] args) throws Exception {
        mainToOutput(args, new PrintWriter(System.out, true));
    }

    /**
     * Parses {@code args}, runs the conversion and writes status messages to {@code output}.
     *
     * <p>Writes {@code Parse:Incorrect} and returns without converting when the
     * arguments cannot be used; otherwise writes {@code Parse:correct} followed,
     * after the conversion, by the total number of converted records.
     *
     * @param args   raw command-line arguments
     * @param output sink for the status messages; flushed after every message
     * @throws Exception if argument handling or the conversion fails
     */
    static void mainToOutput(String[] args, PrintWriter output) throws Exception {
        if (!parseArgs(args)) {
            emit(output, "Parse:Incorrect");
            return;
        }

        emit(output, "Parse:correct");

        System.out.println("Converting ");
        System.out.println("From: " + strInputFile);
        System.out.println("To: " + strOutputFile);
        System.out.println("File shard size (record count/file): " + recordsPerBlock);

        int count = DatasetConverter.ConvertNewsgroupsFromSingleFiles(strInputFile, strOutputFile, recordsPerBlock);

        emit(output, "Total Records Converted: " + count);
    }

    /**
     * Writes {@code message} and flushes immediately.
     *
     * <p>An auto-flushing {@link PrintWriter} only flushes on
     * {@code println}/{@code printf}/{@code format}, so a bare {@code write}
     * could otherwise stay buffered until after later console output, or be
     * lost entirely when the JVM exits.
     */
    private static void emit(PrintWriter output, String message) {
        output.write(message);
        output.flush();
    }

    /**
     * Parses the command line into the static argument fields.
     *
     * @param args raw command-line arguments
     * @return {@code true} when all arguments were parsed and validated;
     *         {@code false} when the parser produced no command line or
     *         {@code --recordsPerBlock} is not a positive integer
     * @throws IOException declared for the underlying parser/help output
     */
    private static boolean parseArgs(String[] args) throws IOException {
        DefaultOptionBuilder builder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();

        Option help = builder.withLongName("help").withDescription("print this list").create();

        Option inputFileOption = builder.withLongName("input").withRequired(true)
                .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
                .withDescription("where to get input data").create();

        Option outputFileOption = builder.withLongName("output").withRequired(true)
                .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
                .withDescription("where to write output data").create();

        Option recordsPerBlockOption = builder.withLongName("recordsPerBlock")
                .withArgument(
                        argumentBuilder.withName("recordsPerBlock").withDefault("20000").withMaximum(1).create())
                .withDescription("the number of records per output file shard to write").create();

        // optionally can be { 20Newsgroups, rcv1 }
        // NOTE: registered so the flag is accepted, but its value is never read below.
        Option datasetTypeOption = builder
                .withLongName("datasetType").withArgument(argumentBuilder.withName("recordFactoryType")
                        .withDefault("20Newsgroups").withMaximum(1).create())
                .withDescription("the type of dataset to convert").create();

        Group normalArgs = new GroupBuilder().withOption(help).withOption(inputFileOption)
                .withOption(outputFileOption).withOption(recordsPerBlockOption).withOption(datasetTypeOption)
                .create();

        Parser parser = new Parser();
        parser.setHelpOption(help);
        parser.setHelpTrigger("--help");
        parser.setGroup(normalArgs);
        parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 130));
        CommandLine cmdLine = parser.parseAndHelp(args);

        // parseAndHelp yields no command line when it printed help instead
        // (presumably for --help or for arguments it could not parse).
        if (cmdLine == null) {
            System.out.println("null!");
            return false;
        }

        // e.g. "/Users/jpatterson/Downloads/datasets/20news-bydate/20news-bydate-train/"
        strInputFile = getStringArgument(cmdLine, inputFileOption);

        // e.g. "/Users/jpatterson/Downloads/datasets/20news-kboar/train4/"
        strOutputFile = getStringArgument(cmdLine, outputFileOption);

        // Validate the shard size here so a bad value is reported as a parse
        // failure instead of surfacing later as an uncaught NumberFormatException.
        String rawRecordsPerBlock = getStringArgument(cmdLine, recordsPerBlockOption);
        try {
            // parseInt(null) also throws NumberFormatException, covering a missing value.
            recordsPerBlock = Integer.parseInt(rawRecordsPerBlock);
        } catch (NumberFormatException e) {
            System.err.println("Invalid --recordsPerBlock value: '" + rawRecordsPerBlock
                    + "' (expected a positive integer)");
            return false;
        }
        if (recordsPerBlock <= 0) {
            // A shard has to hold at least one record.
            System.err.println("Invalid --recordsPerBlock value: '" + rawRecordsPerBlock
                    + "' (expected a positive integer)");
            return false;
        }

        return true;
    }

    /**
     * Returns the value bound to {@code option} on the parsed command line, cast to a string.
     */
    private static String getStringArgument(CommandLine cmdLine, Option option) {
        return (String) cmdLine.getValue(option);
    }

}